From a1ce4fcd4b933fdbe1654397ebcc76425d251e49 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 26 Jul 2020 10:50:57 -0700 Subject: [PATCH 01/44] Make df[col] insert a new array, never overwrite --- pandas/_testing.py | 2 + pandas/core/internals/blocks.py | 26 ------------ pandas/core/internals/managers.py | 20 +++++---- pandas/tests/frame/methods/test_rename.py | 2 +- pandas/tests/frame/test_block_internals.py | 10 +++-- pandas/tests/groupby/test_apply.py | 21 ---------- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/internals/test_internals.py | 47 +++++++--------------- pandas/tests/reshape/merge/test_merge.py | 2 +- 9 files changed, 35 insertions(+), 97 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index fc6df7a95e348..d6a0143155803 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1089,6 +1089,8 @@ def _get_base(obj): raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") def _raise(left, right, err_msg): + __tracebackhide__ = True + if err_msg is None: if left.shape != right.shape: raise_assert_detail( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cc0f09ced7399..07cad041f7da9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -634,20 +634,6 @@ def _can_hold_element(self, element: Any) -> bool: return issubclass(tipo.type, dtype) return isinstance(element, dtype) - def should_store(self, value: ArrayLike) -> bool: - """ - Should we set self.values[indexer] = value inplace or do we need to cast? - - Parameters - ---------- - value : np.ndarray or ExtensionArray - - Returns - ------- - bool - """ - return is_dtype_equal(value.dtype, self.dtype) - def to_native_types(self, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format """ values = self.values @@ -1581,12 +1567,6 @@ def iget(self, col): raise IndexError(f"{self} only contains one item") return self.values - def should_store(self, value: ArrayLike) -> bool: - """ - Can we set the given array-like value inplace? - """ - return isinstance(value, self._holder) - def set(self, locs, values): assert locs.tolist() == [0] self.values[:] = values @@ -1976,9 +1956,6 @@ def _can_hold_element(self, element: Any) -> bool: element, (float, int, complex, np.float_, np.int_) ) and not isinstance(element, (bool, np.bool_)) - def should_store(self, value: ArrayLike) -> bool: - return issubclass(value.dtype.type, np.complexfloating) - class IntBlock(NumericBlock): __slots__ = () @@ -2142,7 +2119,6 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") - should_store = Block.should_store array_values = ExtensionBlock.array_values @property @@ -2616,8 +2592,6 @@ class CategoricalBlock(ExtensionBlock): _verify_integrity = True _can_hold_na = True - should_store = Block.should_store - def __init__(self, values, placement, ndim=None): # coerce to categorical if we can values = extract_array(values) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 895385b170c91..5915c90a27873 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1062,18 +1062,16 @@ def value_getitem(placement): for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] - if blk.should_store(value): - blk.set(blk_locs, value_getitem(val_locs)) - else: - unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) - unfit_val_locs.append(val_locs) - # If all block items are unfit, schedule the block for removal. - if len(val_locs) == len(blk.mgr_locs): - removed_blknos.append(blkno) - else: - blk.delete(blk_locs) - self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) + + # If all block items are unfit, schedule the block for removal. + if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno) + else: + blk.delete(blk_locs) + self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) if len(removed_blknos): # Remove blocks & update blknos accordingly diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index eb908e9472fe2..11e08ec162311 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -140,7 +140,7 @@ def test_rename_multiindex(self): def test_rename_nocopy(self, float_frame): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) - renamed["foo"] = 1.0 + renamed["foo"][:] = 1.0 assert (float_frame["C"] == 1.0).all() def test_rename_inplace(self, float_frame): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c9fec3215d57f..097bcf88d566c 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -322,7 +322,7 @@ def test_copy_blocks(self, float_frame): blocks = df._to_dict_of_blocks(copy=True) for dtype, _df in blocks.items(): if column in _df: - _df.loc[:, column] = _df[column] + 1 + _df.loc[:, column].values[:] = _df[column] + 1 # make sure we did not change the original DataFrame assert not _df[column].equals(df[column]) @@ -334,12 +334,14 @@ def test_no_copy_blocks(self, float_frame): # use the copy=False, change a column blocks = df._to_dict_of_blocks(copy=False) - for dtype, _df in blocks.items(): + for _, _df in blocks.items(): if column in _df: - _df.loc[:, column] = _df[column] + 1 + _df.loc[:, column].values[:] = _df[column] + 1 + # FIXME: I think the need for .values here means we are doing something wrong # make sure we did change the original DataFrame - assert _df[column].equals(df[column]) + tm.assert_series_equal(df[column], _df[column]) + #assert _df[column].equals(df[column]) def test_copy(self, float_frame, float_string_frame): cop = float_frame.copy() diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a1268bfb03db..1864483f856fe 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -966,27 +966,6 @@ def test_apply_function_index_return(function): tm.assert_series_equal(result, expected) -def test_apply_function_with_indexing(): - # GH: 33058 - df = pd.DataFrame( - {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} - ) - - def fn(x): - x.col2[x.index[-1]] = 0 - return x.col2 - - result = df.groupby(["col1"], as_index=False).apply(fn) - expected = pd.Series( - [1, 2, 0, 4, 5, 0], - index=pd.MultiIndex.from_tuples( - [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] - ), - name="col2", - ) - tm.assert_series_equal(result, expected) - - def test_apply_function_with_indexing_return_column(): # GH: 7002 df = DataFrame( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 30b13b6ea9fce..d815af3d2b8df 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -877,7 +877,7 @@ def test_identity_slice_returns_new_object(self): assert original_df[:] is not original_df # should be a shallow copy - original_df["a"] = [4, 4, 4] + original_df["a"][:] = [4, 4, 4] assert (sliced_df["a"] == 4).all() # These should not return copies diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 06ccdd2484a2a..5a7375346f596 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -642,33 +642,31 @@ def test_get_numeric_data(self): item_shape=(3,), ) mgr.iset(5, np.array([1, 2, 3], dtype=np.object_)) - numeric = mgr.get_numeric_data() tm.assert_index_equal( numeric.items, pd.Index(["int", "float", "complex", "bool"]) ) + mgr_idx = mgr.items.get_loc("float") + num_idx = numeric.items.get_loc("float") + tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - numeric.iget(numeric.items.get_loc("float")).internal_values(), + mgr.iget(mgr_idx).internal_values(), + numeric.iget(num_idx).internal_values(), ) # Check sharing - numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0])) + numeric.iget(num_idx).internal_values()[:] = [100.0, 200.0, 300.0] tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), + mgr.iget(mgr_idx).internal_values(), np.array([100.0, 200.0, 300.0]), ) numeric2 = mgr.get_numeric_data(copy=True) tm.assert_index_equal( numeric.items, pd.Index(["int", "float", "complex", "bool"]) ) - numeric2.iset( - numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0]) - ) + numeric2.iget(num_idx).internal_values()[:] = [1000.0, 2000.0, 3000.0] tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), + mgr.iget(mgr_idx).internal_values(), np.array([100.0, 200.0, 300.0]), ) def test_get_bool_data(self): @@ -686,18 +684,20 @@ def test_get_bool_data(self): bools.iget(bools.items.get_loc("bool")).internal_values(), ) + # GH#33457 setting a new array on bools does _not_ alter the original + # array in-place, so mgr is unchanged bools.iset(0, np.array([True, False, True])) tm.assert_numpy_array_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), + np.array([True, True, True]), ) - # Check sharing + # Check non-sharing bools2 = mgr.get_bool_data(copy=True) - bools2.iset(0, np.array([False, True, False])) + bools2.blocks[0].values[:] = [[False, True, False]] tm.assert_numpy_array_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), + np.array([True, True, True]), ) def test_unicode_repr_doesnt_raise(self): @@ -1181,23 +1181,6 @@ def test_binop_other(self, op, value, dtype): tm.assert_series_equal(result, expected) -class TestShouldStore: - def test_should_store_categorical(self): - cat = pd.Categorical(["A", "B", "C"]) - df = pd.DataFrame(cat) - blk = df._mgr.blocks[0] - - # matching dtype - assert blk.should_store(cat) - assert blk.should_store(cat[:-1]) - - # different dtype - assert not blk.should_store(cat.as_ordered()) - - # ndarray instead of Categorical - assert not blk.should_store(np.asarray(cat)) - - @pytest.mark.parametrize( "typestr, holder", [ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4fd3c688b8771..8a6285361c1f5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -284,7 +284,7 @@ def test_merge_nocopy(self): merged = merge(left, right, left_index=True, right_index=True, copy=False) - merged["a"] = 6 + merged.loc[:, "a"] = 6 assert (left["a"] == 6).all() merged["d"] = "peekaboo" From e60023725693586e8b9b47b9d8eaa14e09ea0ca0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 27 Jul 2020 08:01:47 -0500 Subject: [PATCH 02/44] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c2a4abbea107c..e4a518c93b421 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -351,6 +351,83 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. +Assigning with ``DataFrame.__setitem__`` consistently creates a new array +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +Assigning values with ``DataFrame.__setitem__`` now consistently assigns a new array, rather than mutating inplace (:issue:`33457`, :issue:`35271`, :issue:`35266`) + +Previously, ``DataFrame.__setitem__`` would sometimes operate inplace on the +underlying array, and sometimes assign a new array. Fixing this inconsistency +can have behavior-changing implications for workloads that relied on inplace +mutation. The two most common cases are creating a ``DataFrame`` from an array +and slicing a ``DataFrame``. + +*Previous Behavior* + +The array would be mutated inplace for some dtypes, like NumPy's ``int64`` dtype. + +.. code-block:: python + + >>> import pandas as pd + >>> import numpy as np + >>> a = np.array([1, 2, 3]) + >>> df = pd.DataFrame(a, columns=['a']) + >>> df['a'] = 0 + >>> a # mutated inplace + array([0, 0, 0]) + +But not others, like :class:`Int64Dtype`. + +.. code-block:: python + + >>> import pandas as pd + >>> import numpy as np + >>> a = pd.array([1, 2, 3], dtype="Int64") + >>> df = pd.DataFrame(a, columns=['a']) + >>> df['a'] = 0 + >>> a # not mutated + + [1, 2, 3] + Length: 3, dtype: Int64 + + +*New Behavior* + +In pandas 1.1.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than +mutating the existing array inplace. + +.. ipython:: python + +For NumPy's int64 dtype + + import pandas as pd + import numpy as np + a = np.array([1, 2, 3]) + df = pd.DataFrame(a, columns=['a']) + df['a'] = 0 + a # not mutated + +For :class:`Int64Dtype`. + + import pandas as pd + import numpy as np + a = pd.array([1, 2, 3], dtype="Int64") + df = pd.DataFrame(a, columns=['a']) + df['a'] = 0 + a # not mutated + +This also affects cases where a second ``Series`` or ``DataFrame`` is a view on a first ``DataFrame``. + +.. code-block:: python + + df = pd.DataFrame({"A": [1, 2, 3]}) + df2 = df[['A']] + df['A'] = np.array([0, 0, 0]) + +Previously, whether ``df2`` was mutated depending on the dtype of the array being assigned to. Now, a +new array is consistently assigned, so ``df2`` is not mutated. + ``MultiIndex.get_indexer`` interprets ``method`` argument correctly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 140f5f28999143f2302813a264a22234004ae917 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 28 Jul 2020 09:53:18 -0700 Subject: [PATCH 03/44] update tests --- pandas/core/frame.py | 6 ++--- pandas/core/generic.py | 6 ++--- pandas/core/indexing.py | 8 ++++-- pandas/core/internals/blocks.py | 26 ++++++++++++++++++++ pandas/core/internals/managers.py | 22 +++++++++-------- pandas/tests/frame/indexing/test_indexing.py | 6 ++--- pandas/tests/indexing/test_iloc.py | 2 +- 7 files changed, 54 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f52341ed782d8..13dbbc77eaae6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3085,18 +3085,18 @@ def _setitem_frame(self, key, value): self._check_setitem_copy() self._where(-key, value, inplace=True) - def _iset_item(self, loc: int, value): + def _iset_item(self, loc: int, value, inplace: bool = False): self._ensure_valid_index(value) # technically _sanitize_column expects a label, not a position, # but the behavior is the same as long as we pass broadcast=False value = self._sanitize_column(loc, value, broadcast=False) - NDFrame._iset_item(self, loc, value) + NDFrame._iset_item(self, loc, value, inplace=inplace) # check if we are modifying a copy # try to set first as we want an invalid # value exception to occur first - if len(self): + if len(self): # FIXME: this should depend on inplace, right? self._check_setitem_copy() def _set_item(self, key, value): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e46fde1f59f16..464f44be9f8a9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3193,7 +3193,7 @@ def _maybe_cache_changed(self, item, value) -> None: The object has called back to us saying maybe it has changed. """ loc = self._info_axis.get_loc(item) - self._mgr.iset(loc, value) + self._mgr.iset(loc, value, inplace=False) @property def _is_cached(self) -> bool_t: @@ -3556,8 +3556,8 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: result._set_is_copy(self, copy=is_copy) return result - def _iset_item(self, loc: int, value) -> None: - self._mgr.iset(loc, value) + def _iset_item(self, loc: int, value, inplace: bool = False) -> None: + self._mgr.iset(loc, value, inplace=inplace) self._clear_item_cache() def _set_item(self, key, value) -> None: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 04d1dbceb3342..3bbb4683c993d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1692,7 +1692,7 @@ def isetter(loc, v): ser._maybe_update_cacher(clear=True) # reset the sliced object if unique - self.obj._iset_item(loc, ser) + self.obj._iset_item(loc, ser, inplace=True) # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) @@ -1780,7 +1780,11 @@ def isetter(loc, v): ) and item_labels.is_unique ): - self.obj[item_labels[indexer[info_axis]]] = value + + col = item_labels[indexer[info_axis]] + loc = item_labels.get_loc(col) + self.obj._iset_item(loc, value, inplace=True) + #self.obj[loc] = value return indexer = maybe_convert_ix(*indexer) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 07cad041f7da9..cc0f09ced7399 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -634,6 +634,20 @@ def _can_hold_element(self, element: Any) -> bool: return issubclass(tipo.type, dtype) return isinstance(element, dtype) + def should_store(self, value: ArrayLike) -> bool: + """ + Should we set self.values[indexer] = value inplace or do we need to cast? + + Parameters + ---------- + value : np.ndarray or ExtensionArray + + Returns + ------- + bool + """ + return is_dtype_equal(value.dtype, self.dtype) + def to_native_types(self, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format """ values = self.values @@ -1567,6 +1581,12 @@ def iget(self, col): raise IndexError(f"{self} only contains one item") return self.values + def should_store(self, value: ArrayLike) -> bool: + """ + Can we set the given array-like value inplace? + """ + return isinstance(value, self._holder) + def set(self, locs, values): assert locs.tolist() == [0] self.values[:] = values @@ -1956,6 +1976,9 @@ def _can_hold_element(self, element: Any) -> bool: element, (float, int, complex, np.float_, np.int_) ) and not isinstance(element, (bool, np.bool_)) + def should_store(self, value: ArrayLike) -> bool: + return issubclass(value.dtype.type, np.complexfloating) + class IntBlock(NumericBlock): __slots__ = () @@ -2119,6 +2142,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") + should_store = Block.should_store array_values = ExtensionBlock.array_values @property @@ -2592,6 +2616,8 @@ class CategoricalBlock(ExtensionBlock): _verify_integrity = True _can_hold_na = True + should_store = Block.should_store + def __init__(self, values, placement, ndim=None): # coerce to categorical if we can values = extract_array(values) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5915c90a27873..6bb5b4422918c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1011,7 +1011,7 @@ def idelete(self, indexer): ) self._rebuild_blknos_and_blklocs() - def iset(self, loc: Union[int, slice, np.ndarray], value): + def iset(self, loc: Union[int, slice, np.ndarray], value, inplace: bool = False): """ Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items @@ -1062,16 +1062,18 @@ def value_getitem(placement): for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] - - unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) - unfit_val_locs.append(val_locs) - - # If all block items are unfit, schedule the block for removal. - if len(val_locs) == len(blk.mgr_locs): - removed_blknos.append(blkno) + if inplace and blk.should_store(value): + blk.set(blk_locs, value_getitem(val_locs)) else: - blk.delete(blk_locs) - self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) + + # If all block items are unfit, schedule the block for removal. + if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno) + else: + blk.delete(blk_locs) + self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) if len(removed_blknos): # Remove blocks & update blknos accordingly diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index d27487dfb8aaa..55212518f5284 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -881,7 +881,7 @@ def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - sliced["C"] = 4.0 + sliced.loc[:, "C"] = 4.0 assert (float_frame["C"] == 4).all() @@ -1581,7 +1581,7 @@ def test_iloc_row(self): # setting it makes it raise/warn msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - result[2] = 0.0 + result.loc[:, 2] = 0.0 exp_col = df[2].copy() exp_col[4:8] = 0.0 @@ -1613,7 +1613,7 @@ def test_iloc_col(self): # and that we are setting a copy msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - result[8] = 0.0 + result.loc[:, 8] = 0.0 assert (df[8] == 0).all() diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index c5f40102874dd..d7e08c606667b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -670,7 +670,7 @@ def test_identity_slice_returns_new_object(self): assert sliced_df is not original_df # should be a shallow copy - original_df["a"] = [4, 4, 4] + original_df.loc[:, "a"] = [4, 4, 4] assert (sliced_df["a"] == 4).all() original_series = Series([1, 2, 3, 4, 5, 6]) From 989ba977800b7de2348baa2277689ff5a65b9454 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Jul 2020 20:33:15 -0700 Subject: [PATCH 04/44] update whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/tests/indexing/test_iloc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2066858e5de86..9c05df6744cd6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -101,7 +101,7 @@ Interval Indexing ^^^^^^^^ - +- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`35417`) - - diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index c2811f7b6e0d7..e6515cc1f5755 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -694,8 +694,8 @@ def test_series_indexing_zerodim_np_array(self): result = s.iloc[np.array(0)] assert result == 1 - @pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/33457") def test_iloc_setitem_categorical_updates_inplace(self): + # GH#35417 # Mixed dtype ensures we go through take_split_path in setitem_with_indexer cat = pd.Categorical(["A", "B", "C"]) df = pd.DataFrame({1: cat, 2: [1, 2, 3]}) From 59e447a825a6e52d439ae1e27b98cda10cc548aa Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 15 Aug 2020 15:42:56 -0700 Subject: [PATCH 05/44] Update merge test --- pandas/core/internals/blocks.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f3286b3c20965..de9ac9c8e498f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1587,7 +1587,7 @@ def should_store(self, value: ArrayLike) -> bool: def set(self, locs, values): assert locs.tolist() == [0] - self.values = values + self.values[:] = values def putmask( self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8a6285361c1f5..c079c4f77f24e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -287,7 +287,7 @@ def test_merge_nocopy(self): merged.loc[:, "a"] = 6 assert (left["a"] == 6).all() - merged["d"] = "peekaboo" + merged.loc[:, "d"] = "peekaboo" assert (right["d"] == "peekaboo").all() def test_intelligently_handle_join_key(self): From 11b8093e5d691a89beaee1969cfed83508c3fcd7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 18 Aug 2020 12:00:39 -0700 Subject: [PATCH 06/44] Fix cython groubpy.apply func for funcs that mutate inplace --- pandas/_libs/reduction.pyx | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 7b36bc8baf891..2ecdf47261f57 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -404,7 +404,8 @@ cdef class BlockSlider: object frame, dummy, index int nblocks Slider idx_slider - list blocks + list blocks, blk_values + ndarray orig_blklocs, orig_blknos cdef: char **base_ptrs @@ -418,20 +419,27 @@ cdef class BlockSlider: self.dummy = frame[:0] self.index = self.dummy.index - self.blocks = [b.values for b in self.dummy._mgr.blocks] + # GH#35417 attributes we need to restore at each step in case + # the function modified them. + mgr = self.dummy._mgr + self.orig_blklocs = mgr.blklocs + self.orig_blknos = mgr.blknos + self.blocks = [x for x in self.dummy._mgr.blocks] - for x in self.blocks: + self.blk_values = [b.values for b in self.dummy._mgr.blocks] + + for x in self.blk_values: util.set_array_not_contiguous(x) - self.nblocks = len(self.blocks) + self.nblocks = len(self.blk_values) # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference to a 1-d # ndarray like datetime / timedelta / period. self.idx_slider = Slider( self.frame.index._index_data, self.dummy.index._index_data) - self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) - for i, block in enumerate(self.blocks): + self.base_ptrs = malloc(sizeof(char*) * len(self.blk_values)) + for i, block in enumerate(self.blk_values): self.base_ptrs[i] = (block).data def __dealloc__(self): @@ -442,9 +450,11 @@ cdef class BlockSlider: ndarray arr Py_ssize_t i + self._restore_blocks() + # move blocks for i in range(self.nblocks): - arr = self.blocks[i] + arr = self.blk_values[i] # axis=1 is the frame's axis=0 arr.data = self.base_ptrs[i] + arr.strides[1] * start @@ -456,14 +466,25 @@ cdef class BlockSlider: object.__setattr__(self.index, '_index_data', self.idx_slider.buf) self.index._engine.clear_mapping() + cdef _restore_blocks(self): + """ + Ensure that we have the original blocks, blknos, and blklocs. + """ + mgr = self.dummy._mgr + mgr.blocks = self.blocks + mgr._blklocs = self.orig_blklocs + mgr._blknos = self.orig_blknos + cdef reset(self): cdef: ndarray arr Py_ssize_t i + self._restore_blocks() + # reset blocks for i in range(self.nblocks): - arr = self.blocks[i] + arr = self.blk_values[i] # axis=1 is the frame's axis=0 arr.data = self.base_ptrs[i] From 42713039f6d109e6ff9870f77c91c5fc0a34d147 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 18 Aug 2020 12:02:30 -0700 Subject: [PATCH 07/44] cleanup commented-out --- pandas/core/indexing.py | 1 - pandas/tests/frame/test_block_internals.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d93c33c6ae754..afccf61a7256f 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1795,7 +1795,6 @@ def isetter(loc, v): col = item_labels[indexer[info_axis]] loc = item_labels.get_loc(col) self.obj._iset_item(loc, value, inplace=True) - #self.obj[loc] = value return indexer = maybe_convert_ix(*indexer) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 097bcf88d566c..971f07a8e5b6d 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -337,11 +337,11 @@ def test_no_copy_blocks(self, float_frame): for _, _df in blocks.items(): if column in _df: _df.loc[:, column].values[:] = _df[column] + 1 - # FIXME: I think the need for .values here means we are doing something wrong + # FIXME: I think the need for .values here means we are + # doing something wrong # make sure we did change the original DataFrame tm.assert_series_equal(df[column], _df[column]) - #assert _df[column].equals(df[column]) def test_copy(self, float_frame, float_string_frame): cop = float_frame.copy() From ed0af51dd63e43bcdbf0336323e60dc34a66ba47 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 18 Aug 2020 13:16:07 -0700 Subject: [PATCH 08/44] mypy fixup --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 39e7b34e6c13c..fa6791d371d1e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3641,7 +3641,7 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: result._set_is_copy(self, copy=is_copy) return result - def _iset_item(self, loc: int, value, inplace: bool = False) -> None: + def _iset_item(self, loc: int, value, inplace: bool_t = False) -> None: self._mgr.iset(loc, value, inplace=inplace) self._clear_item_cache() From 53992df8592867101429575797a207d3edad8961 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Aug 2020 08:50:33 -0700 Subject: [PATCH 09/44] Test for #35731 --- pandas/tests/frame/test_block_internals.py | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index aa84cc0429f69..cef4f1b03f92e 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -646,3 +646,27 @@ def test_to_dict_of_blocks_item_cache(): assert df.loc[0, "b"] == "foo" assert df["b"] is ser + + +def test_fillna_sets_valid_block_values(): + # GH#35731 fillna call was setting Block.values to a Series + df = pd.DataFrame( + { + "a": pd.Series([np.nan, 2.0, 3.0, 1.0]).astype("category"), + "b": pd.Series(["A", "A", "B", "C"]).astype("category"), + "c": pd.Series(["D", "E", "E", np.nan]).astype("category"), + } + ) + cats = df["a"].cat.categories.tolist() + # append new category + cats.append(0.0) + df["a"] = df["a"].astype(pd.CategoricalDtype(categories=cats, ordered=False)) + + # fillna with that new category + df["a"].fillna(0, inplace=True) + + # check we havent put a Series into any block.values + assert all(isinstance(blk.values, pd.Categorical) for blk in df._mgr.blocks) + + # smoketest for OP bug from GH#35731 + df.isnull().sum() From f9498a3a93f064a53caf82317521a26f54707b55 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 11 Oct 2020 12:50:27 -0700 Subject: [PATCH 10/44] revert accidental --- pandas/_libs/reduction.pyx | 75 ++++++++++---------------------------- 1 file changed, 20 insertions(+), 55 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index e938499ee3515..9459cd297c758 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,7 +1,5 @@ from copy import copy -from cython import Py_ssize_t - from libc.stdlib cimport free, malloc import numpy as np @@ -11,14 +9,14 @@ from numpy cimport int64_t, ndarray cnp.import_array() -from pandas._libs cimport util +from pandas._libs.util cimport is_array, set_array_not_contiguous from pandas._libs.lib import is_scalar, maybe_convert_objects cpdef check_result_array(object obj, Py_ssize_t cnt): - if (util.is_array(obj) or + if (is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == (cnt,)): raise ValueError('Must produce aggregated value') @@ -33,7 +31,7 @@ cdef class _BaseGrouper: if (dummy.dtype != self.arr.dtype and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') - if util.is_array(values) and not values.flags.contiguous: + if is_array(values) and not values.flags.contiguous: # e.g. Categorical has no `flags` attribute values = values.copy() index = dummy.index.values @@ -106,7 +104,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.f = f values = series.values - if util.is_array(values) and not values.flags.c_contiguous: + if is_array(values) and not values.flags.c_contiguous: # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values @@ -204,7 +202,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.f = f values = series.values - if util.is_array(values) and not values.flags.c_contiguous: + if is_array(values) and not values.flags.c_contiguous: # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values @@ -288,9 +286,9 @@ cpdef inline extract_result(object res, bint squeeze=True): res = res._values if squeeze and res.ndim == 1 and len(res) == 1: res = res[0] - if hasattr(res, 'values') and util.is_array(res.values): + if hasattr(res, 'values') and is_array(res.values): res = res.values - if util.is_array(res): + if is_array(res): if res.ndim == 0: res = res.item() elif squeeze and res.ndim == 1 and len(res) == 1: @@ -304,7 +302,7 @@ cdef class Slider: """ cdef: ndarray values, buf - Py_ssize_t stride, orig_len, orig_stride + Py_ssize_t stride char *orig_data def __init__(self, ndarray values, ndarray buf): @@ -316,11 +314,9 @@ cdef class Slider: self.values = values self.buf = buf - self.stride = values.strides[0] + self.stride = values.strides[0] self.orig_data = self.buf.data - self.orig_len = self.buf.shape[0] - self.orig_stride = self.buf.strides[0] self.buf.data = self.values.data self.buf.strides[0] = self.stride @@ -333,10 +329,8 @@ cdef class Slider: self.buf.shape[0] = end - start cdef reset(self): - - self.buf.shape[0] = self.orig_len self.buf.data = self.orig_data - self.buf.strides[0] = self.orig_stride + self.buf.shape[0] = 0 class InvalidApply(Exception): @@ -408,37 +402,24 @@ cdef class BlockSlider: """ Only capable of sliding on axis=0 """ - - cdef public: - object frame, dummy, index - int nblocks - Slider idx_slider - list blocks, blk_values - ndarray orig_blklocs, orig_blknos - cdef: + object frame, dummy, index, block + list blk_values + ndarray values + Slider idx_slider char **base_ptrs + int nblocks + Py_ssize_t i def __init__(self, object frame): - cdef: - Py_ssize_t i - object b - self.frame = frame self.dummy = frame[:0] self.index = self.dummy.index - # GH#35417 attributes we need to restore at each step in case - # the function modified them. - mgr = self.dummy._mgr - self.orig_blklocs = mgr.blklocs - self.orig_blknos = mgr.blknos - self.blocks = [x for x in self.dummy._mgr.blocks] + self.blk_values = [block.values for block in self.dummy._mgr.blocks] - self.blk_values = [b.values for b in self.dummy._mgr.blocks] - - for x in self.blk_values: - util.set_array_not_contiguous(x) + for values in self.blk_values: + set_array_not_contiguous(values) self.nblocks = len(self.blk_values) # See the comment in indexes/base.py about _index_data. @@ -447,7 +428,7 @@ cdef class BlockSlider: self.idx_slider = Slider( self.frame.index._index_data, self.dummy.index._index_data) - self.base_ptrs = malloc(sizeof(char*) * len(self.blk_values)) + self.base_ptrs = malloc(sizeof(char*) * self.nblocks) for i, block in enumerate(self.blk_values): self.base_ptrs[i] = (block).data @@ -458,9 +439,6 @@ cdef class BlockSlider: cdef: ndarray arr Py_ssize_t i - - self._restore_blocks() - # move blocks for i in range(self.nblocks): arr = self.blk_values[i] @@ -476,23 +454,10 @@ cdef class BlockSlider: self.index._engine.clear_mapping() self.index._cache.clear() # e.g. inferred_freq must go - cdef _restore_blocks(self): - """ - Ensure that we have the original blocks, blknos, and blklocs. - """ - mgr = self.dummy._mgr - mgr.blocks = self.blocks - mgr._blklocs = self.orig_blklocs - mgr._blknos = self.orig_blknos - cdef reset(self): cdef: ndarray arr Py_ssize_t i - - self._restore_blocks() - - # reset blocks for i in range(self.nblocks): arr = self.blk_values[i] From ea49ae2e8387057044b432c547cdc4fff80b2632 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 12 Oct 2020 08:13:49 -0700 Subject: [PATCH 11/44] move whatsnew note --- doc/source/whatsnew/v1.1.0.rst | 77 ---------------------- doc/source/whatsnew/v1.2.0.rst | 84 ++++++++++++++++++++++++ pandas/tests/internals/test_internals.py | 6 +- 3 files changed, 88 insertions(+), 79 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8114d19702e55..e054ac830ce41 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -351,83 +351,6 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. -Assigning with ``DataFrame.__setitem__`` consistently creates a new array -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - -Assigning values with ``DataFrame.__setitem__`` now consistently assigns a new array, rather than mutating inplace (:issue:`33457`, :issue:`35271`, :issue:`35266`) - -Previously, ``DataFrame.__setitem__`` would sometimes operate inplace on the -underlying array, and sometimes assign a new array. Fixing this inconsistency -can have behavior-changing implications for workloads that relied on inplace -mutation. The two most common cases are creating a ``DataFrame`` from an array -and slicing a ``DataFrame``. - -*Previous Behavior* - -The array would be mutated inplace for some dtypes, like NumPy's ``int64`` dtype. - -.. code-block:: python - - >>> import pandas as pd - >>> import numpy as np - >>> a = np.array([1, 2, 3]) - >>> df = pd.DataFrame(a, columns=['a']) - >>> df['a'] = 0 - >>> a # mutated inplace - array([0, 0, 0]) - -But not others, like :class:`Int64Dtype`. - -.. code-block:: python - - >>> import pandas as pd - >>> import numpy as np - >>> a = pd.array([1, 2, 3], dtype="Int64") - >>> df = pd.DataFrame(a, columns=['a']) - >>> df['a'] = 0 - >>> a # not mutated - - [1, 2, 3] - Length: 3, dtype: Int64 - - -*New Behavior* - -In pandas 1.1.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than -mutating the existing array inplace. - -.. ipython:: python - -For NumPy's int64 dtype - - import pandas as pd - import numpy as np - a = np.array([1, 2, 3]) - df = pd.DataFrame(a, columns=['a']) - df['a'] = 0 - a # not mutated - -For :class:`Int64Dtype`. - - import pandas as pd - import numpy as np - a = pd.array([1, 2, 3], dtype="Int64") - df = pd.DataFrame(a, columns=['a']) - df['a'] = 0 - a # not mutated - -This also affects cases where a second ``Series`` or ``DataFrame`` is a view on a first ``DataFrame``. - -.. code-block:: python - - df = pd.DataFrame({"A": [1, 2, 3]}) - df2 = df[['A']] - df['A'] = np.array([0, 0, 0]) - -Previously, whether ``df2`` was mutated depending on the dtype of the array being assigned to. Now, a -new array is consistently assigned, so ``df2`` is not mutated. - ``MultiIndex.get_indexer`` interprets ``method`` argument correctly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e5567224fff07..15bc6b2fa0f30 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -192,6 +192,90 @@ Other enhancements - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) +.. --------------------------------------------------------------------------- + +.. whatsnew_120.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +Assigning with ``DataFrame.__setitem__`` consistently creates a new array +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Assigning values with ``DataFrame.__setitem__`` now consistently assigns a new array, rather than mutating inplace (:issue:`33457`, :issue:`35271`, :issue:`35266`) + +Previously, ``DataFrame.__setitem__`` would sometimes operate inplace on the +underlying array, and sometimes assign a new array. Fixing this inconsistency +can have behavior-changing implications for workloads that relied on inplace +mutation. The two most common cases are creating a ``DataFrame`` from an array +and slicing a ``DataFrame``. + +*Previous Behavior* + +The array would be mutated inplace for some dtypes, like NumPy's ``int64`` dtype. + +.. code-block:: python + + >>> import pandas as pd + >>> import numpy as np + >>> a = np.array([1, 2, 3]) + >>> df = pd.DataFrame(a, columns=['a']) + >>> df['a'] = 0 + >>> a # mutated inplace + array([0, 0, 0]) + +But not others, like :class:`Int64Dtype`. + +.. code-block:: python + + >>> import pandas as pd + >>> import numpy as np + >>> a = pd.array([1, 2, 3], dtype="Int64") + >>> df = pd.DataFrame(a, columns=['a']) + >>> df['a'] = 0 + >>> a # not mutated + + [1, 2, 3] + Length: 3, dtype: Int64 + + +*New Behavior* + +In pandas 1.1.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than +mutating the existing array inplace. + +.. ipython:: python + +For NumPy's int64 dtype + + import pandas as pd + import numpy as np + a = np.array([1, 2, 3]) + df = pd.DataFrame(a, columns=['a']) + df['a'] = 0 + a # not mutated + +For :class:`Int64Dtype`. + + import pandas as pd + import numpy as np + a = pd.array([1, 2, 3], dtype="Int64") + df = pd.DataFrame(a, columns=['a']) + df['a'] = 0 + a # not mutated + +This also affects cases where a second ``Series`` or ``DataFrame`` is a view on a first ``DataFrame``. + +.. code-block:: python + + df = pd.DataFrame({"A": [1, 2, 3]}) + df2 = df[['A']] + df['A'] = np.array([0, 0, 0]) + +Previously, whether ``df2`` was mutated depending on the dtype of the array being assigned to. Now, a +new array is consistently assigned, so ``df2`` is not mutated. .. _whatsnew_120.api_breaking.python: Increased minimum version for Python diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c6b9868f5d30d..00ed0c998d51a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -649,7 +649,8 @@ def test_get_numeric_data(self): # Check sharing numeric.iget(num_idx).internal_values()[:] = [100.0, 200.0, 300.0] tm.assert_almost_equal( - mgr.iget(mgr_idx).internal_values(), np.array([100.0, 200.0, 300.0]), + mgr.iget(mgr_idx).internal_values(), + np.array([100.0, 200.0, 300.0]), ) numeric2 = mgr.get_numeric_data(copy=True) @@ -658,7 +659,8 @@ def test_get_numeric_data(self): ) numeric2.iget(num_idx).internal_values()[:] = [1000.0, 2000.0, 3000.0] tm.assert_almost_equal( - mgr.iget(mgr_idx).internal_values(), np.array([100.0, 200.0, 300.0]), + mgr.iget(mgr_idx).internal_values(), + np.array([100.0, 200.0, 300.0]), ) def test_get_bool_data(self): From fed47829b7779fdffdfa438b3cb15361ec1806e7 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 12 Oct 2020 10:04:22 -0700 Subject: [PATCH 12/44] update version ref --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 24cfba1dece61..a66461029fa58 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -244,7 +244,7 @@ But not others, like :class:`Int64Dtype`. *New Behavior* -In pandas 1.1.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than +In pandas 1.2.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than mutating the existing array inplace. .. ipython:: python From a00702a6347d86cb05a30703309a0bf50b3775cb Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 12 Oct 2020 20:33:22 -0700 Subject: [PATCH 13/44] restore --- pandas/_libs/reduction.pyx | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 9459cd297c758..1978e2988f85d 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -404,7 +404,8 @@ cdef class BlockSlider: """ cdef: object frame, dummy, index, block - list blk_values + list blocks, blk_values + ndarray orig_blklocs, orig_blknos ndarray values Slider idx_slider char **base_ptrs @@ -416,6 +417,13 @@ cdef class BlockSlider: self.dummy = frame[:0] self.index = self.dummy.index + # GH#35417 attributes we need to restore at each step in case + # the function modified them. + mgr = self.dummy._mgr + self.orig_blklocs = mgr.blklocs + self.orig_blknos = mgr.blknos + self.blocks = [x for x in self.dummy._mgr.blocks] + self.blk_values = [block.values for block in self.dummy._mgr.blocks] for values in self.blk_values: @@ -439,6 +447,9 @@ cdef class BlockSlider: cdef: ndarray arr Py_ssize_t i + + self._restore_blocks() + # move blocks for i in range(self.nblocks): arr = self.blk_values[i] @@ -458,9 +469,21 @@ cdef class BlockSlider: cdef: ndarray arr Py_ssize_t i + + self._restore_blocks() + for i in range(self.nblocks): arr = self.blk_values[i] # axis=1 is the frame's axis=0 arr.data = self.base_ptrs[i] arr.shape[1] = 0 + + cdef _restore_blocks(self): + """ + Ensure that we have the original blocks, blknos, and blklocs. + """ + mgr = self.dummy._mgr + mgr.blocks = self.blocks + mgr._blklocs = self.orig_blklocs + mgr._blknos = self.orig_blknos From 1dd58ab944f5d0e290fb19d561937f3ecd0d6caa Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 14 Oct 2020 09:50:42 -0700 Subject: [PATCH 14/44] split_path compat --- pandas/core/frame.py | 2 +- pandas/core/indexing.py | 27 ++++++++++++++++++++++----- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 002a76740935a..66602c45b0b76 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3090,7 +3090,7 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: - self.loc._ensure_listlike_indexer(key, axis=1) + self.loc._ensure_listlike_indexer(key, axis=1, value=value) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 93cbb473ef7fa..93fd7c913df21 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -633,7 +633,7 @@ def _get_setitem_indexer(self, key): raise raise IndexingError(key) from e - def _ensure_listlike_indexer(self, key, axis=None): + def _ensure_listlike_indexer(self, key, axis=None, value=None): """ Ensure that a list-like of column labels are all present by adding them if they do not already exist. @@ -663,9 +663,15 @@ def _ensure_listlike_indexer(self, key, axis=None): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - for k in key: + for i, k in enumerate(key): if k not in self.obj: - self.obj[k] = np.nan + if value is None: + self.obj[k] = np.nan + elif is_list_like(value): + # TODO: be more careful about this, maybe check before calling? + self.obj[k] = value[i] + else: + self.obj[k] = value def __setitem__(self, key, value): if isinstance(key, tuple): @@ -1571,11 +1577,11 @@ def _setitem_with_indexer(self, indexer, value): # essentially this separates out the block that is needed # to possibly be modified if self.ndim > 1 and i == info_axis: - # add the new item, and set the value # must have all defined axes if we have a scalar # or a list-like on the non-info axes if we have a # list-like + len_non_info_axes = ( len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i ) @@ -1589,7 +1595,11 @@ def _setitem_with_indexer(self, indexer, value): return # add a new item with the dtype setup - self.obj[key] = infer_fill_value(value) + if com.is_null_slice(indexer[0]): + # We are setting the entire column + self.obj[key] = value + else: + self.obj[key] = infer_fill_value(value) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes @@ -1781,6 +1791,13 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) for idx in pi ): + blk = ser._mgr.blocks[0] + if blk._can_hold_element(value) and is_scalar(value): + # FIXME: ExtensionBlock._can_hold_element + # We can do an inplace-setting, do it directly on _values + # to get our underlying + ser._values[plane_indexer] = value + return ser = value else: # set the item, possibly having a dtype change From 7e9ea2d76de1bbb6a8bbedf8dabba07a52a786d7 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 23 Oct 2020 16:42:20 -0700 Subject: [PATCH 15/44] lint fixup --- pandas/tests/internals/test_internals.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index bf5e180ff8f00..a4e3c51428988 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -635,9 +635,7 @@ def test_get_numeric_data(self): ) mgr.iset(5, np.array([1, 2, 3], dtype=np.object_)) numeric = mgr.get_numeric_data() - tm.assert_index_equal( - numeric.items, pd.Index(["int", "float", "complex", "bool"]) - ) + tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) mgr_idx = mgr.items.get_loc("float") num_idx = numeric.items.get_loc("float") @@ -654,9 +652,7 @@ def test_get_numeric_data(self): ) numeric2 = mgr.get_numeric_data(copy=True) - tm.assert_index_equal( - numeric.items, pd.Index(["int", "float", "complex", "bool"]) - ) + tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) numeric2.iget(num_idx).internal_values()[:] = [1000.0, 2000.0, 3000.0] tm.assert_almost_equal( mgr.iget(mgr_idx).internal_values(), From 04af8fadb8569553ce5f79ba962f109f85316bf3 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 23 Nov 2020 07:53:43 -0800 Subject: [PATCH 16/44] doc fixup, split tests --- doc/source/whatsnew/v1.2.0.rst | 6 ++++-- pandas/tests/internals/test_internals.py | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 196e4a79ea26a..740f6f3bbeacf 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -413,10 +413,10 @@ But not others, like :class:`Int64Dtype`. In pandas 1.2.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than mutating the existing array inplace. -.. ipython:: python - For NumPy's int64 dtype +.. ipython:: python + import pandas as pd import numpy as np a = np.array([1, 2, 3]) @@ -426,6 +426,8 @@ For NumPy's int64 dtype For :class:`Int64Dtype`. +.. ipython:: python + import pandas as pd import numpy as np a = pd.array([1, 2, 3], dtype="Int64") diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3042a284ce264..1eb6850b50a36 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -669,6 +669,17 @@ def test_get_numeric_data(self): np.array([100.0, 200.0, 300.0]), ) + def test_get_numeric_data_copy(self): + mgr = create_mgr( + "int: int; float: float; complex: complex;" + "str: object; bool: bool; obj: object; dt: datetime", + item_shape=(3,), + ) + mgr.iset(5, np.array([1, 2, 3], dtype=np.object_)) + numeric = mgr.get_numeric_data() + mgr_idx = mgr.items.get_loc("float") + num_idx = numeric.items.get_loc("float") + numeric2 = mgr.get_numeric_data(copy=True) tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) numeric2.iget(num_idx).internal_values()[:] = [1000.0, 2000.0, 3000.0] @@ -700,7 +711,15 @@ def test_get_bool_data(self): np.array([True, True, True]), ) - # Check non-sharing + def test_get_bool_data_copy(self): + # GH#35417 + mgr = create_mgr( + "int: int; float: float; complex: complex;" + "str: object; bool: bool; obj: object; dt: datetime", + item_shape=(3,), + ) + mgr.iset(6, np.array([True, False, True], dtype=np.object_)) + bools2 = mgr.get_bool_data(copy=True) bools2.blocks[0].values[:] = [[False, True, False]] tm.assert_numpy_array_equal( From 1d2b6d7107bcb4201b726282034f1aee171fd74e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 23 Nov 2020 09:38:06 -0800 Subject: [PATCH 17/44] update test expected --- pandas/tests/internals/test_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 1eb6850b50a36..06a75d2cc2a13 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -685,7 +685,7 @@ def test_get_numeric_data_copy(self): numeric2.iget(num_idx).internal_values()[:] = [1000.0, 2000.0, 3000.0] tm.assert_almost_equal( mgr.iget(mgr_idx).internal_values(), - np.array([100.0, 200.0, 300.0]), + np.array([1.0, 1.0, 1.0]), ) def test_get_bool_data(self): From 6c8b15fcb4629a7c137f3df56264dd1f20625bb1 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Dec 2020 17:03:40 -0800 Subject: [PATCH 18/44] fix broken test --- pandas/core/indexing.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 579e5ad9840b5..d7f25f80a2b4f 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,6 +11,7 @@ from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import doc +from pandas.core.dtypes.cast import maybe_infer_dtype_type from pandas.core.dtypes.common import ( is_array_like, is_hashable, @@ -675,8 +676,22 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): # GH#38148 keys = self.obj.columns.union(key, sort=False) + # Try to get the right dtype when we do this reindex. + fv = None + if is_scalar(value): + fv = value + else: + dtype = maybe_infer_dtype_type(value) + if dtype is not None: + fv = dtype.type(0) # TODO: or try to just do value[0]? + self.obj._mgr = self.obj._mgr.reindex_axis( - keys, axis=0, copy=False, consolidate=False, only_slice=True + keys, + axis=0, + copy=False, + consolidate=False, + only_slice=True, + fill_value=fv, ) def __setitem__(self, key, value): From 687d262186dfb507744ea1503fd6902b23286bae Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Dec 2020 09:40:02 -0800 Subject: [PATCH 19/44] troubleshoot 32bit builds --- pandas/tests/frame/indexing/test_indexing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index c1367af4bfa4b..03d01f64b1eb4 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -168,6 +168,7 @@ def test_setitem_list_of_tuples(self, float_frame): DataFrame( [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], columns=["A", "B", "C", "D"], + dtype=np.intp, ), ), ( From 02585c528556d5d0616139acb3b1e617a31c1be7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 8 Dec 2020 08:43:54 -0800 Subject: [PATCH 20/44] move whatsnew --- doc/source/whatsnew/v1.2.0.rst | 87 ---------------------------------- doc/source/whatsnew/v1.3.0.rst | 77 +++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 88 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f6021d026b733..96d76ba5877f0 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -413,93 +413,6 @@ instead of casting to a NumPy array which may have different semantics (:issue:` df.any() - -.. --------------------------------------------------------------------------- - -.. whatsnew_120.notable_bug_fixes: - -Notable bug fixes -~~~~~~~~~~~~~~~~~ - -These are bug fixes that might have notable behavior changes. - -Assigning with ``DataFrame.__setitem__`` consistently creates a new array -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Assigning values with ``DataFrame.__setitem__`` now consistently assigns a new array, rather than mutating inplace (:issue:`33457`, :issue:`35271`, :issue:`35266`) - -Previously, ``DataFrame.__setitem__`` would sometimes operate inplace on the -underlying array, and sometimes assign a new array. Fixing this inconsistency -can have behavior-changing implications for workloads that relied on inplace -mutation. The two most common cases are creating a ``DataFrame`` from an array -and slicing a ``DataFrame``. - -*Previous Behavior* - -The array would be mutated inplace for some dtypes, like NumPy's ``int64`` dtype. - -.. code-block:: python - - >>> import pandas as pd - >>> import numpy as np - >>> a = np.array([1, 2, 3]) - >>> df = pd.DataFrame(a, columns=['a']) - >>> df['a'] = 0 - >>> a # mutated inplace - array([0, 0, 0]) - -But not others, like :class:`Int64Dtype`. - -.. code-block:: python - - >>> import pandas as pd - >>> import numpy as np - >>> a = pd.array([1, 2, 3], dtype="Int64") - >>> df = pd.DataFrame(a, columns=['a']) - >>> df['a'] = 0 - >>> a # not mutated - - [1, 2, 3] - Length: 3, dtype: Int64 - - -*New Behavior* - -In pandas 1.2.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than -mutating the existing array inplace. - -For NumPy's int64 dtype - -.. ipython:: python - - import pandas as pd - import numpy as np - a = np.array([1, 2, 3]) - df = pd.DataFrame(a, columns=['a']) - df['a'] = 0 - a # not mutated - -For :class:`Int64Dtype`. - -.. ipython:: python - - import pandas as pd - import numpy as np - a = pd.array([1, 2, 3], dtype="Int64") - df = pd.DataFrame(a, columns=['a']) - df['a'] = 0 - a # not mutated - -This also affects cases where a second ``Series`` or ``DataFrame`` is a view on a first ``DataFrame``. - -.. code-block:: python - - df = pd.DataFrame({"A": [1, 2, 3]}) - df2 = df[['A']] - df['A'] = np.array([0, 0, 0]) - -Previously, whether ``df2`` was mutated depending on the dtype of the array being assigned to. Now, a -new array is consistently assigned, so ``df2`` is not mutated. .. _whatsnew_120.api_breaking.python: Increased minimum version for Python diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b40f012f034b6..d66949414eff5 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -31,7 +31,82 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. +Assigning with ``DataFrame.__setitem__`` consistently creates a new array +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Assigning values with ``DataFrame.__setitem__`` now consistently assigns a new array, rather than mutating inplace (:issue:`33457`, :issue:`35271`, :issue:`35266`) + +Previously, ``DataFrame.__setitem__`` would sometimes operate inplace on the +underlying array, and sometimes assign a new array. Fixing this inconsistency +can have behavior-changing implications for workloads that relied on inplace +mutation. The two most common cases are creating a ``DataFrame`` from an array +and slicing a ``DataFrame``. + +*Previous Behavior* + +The array would be mutated inplace for some dtypes, like NumPy's ``int64`` dtype. + +.. code-block:: python + + >>> import pandas as pd + >>> import numpy as np + >>> a = np.array([1, 2, 3]) + >>> df = pd.DataFrame(a, columns=['a']) + >>> df['a'] = 0 + >>> a # mutated inplace + array([0, 0, 0]) + +But not others, like :class:`Int64Dtype`. + +.. code-block:: python + + >>> import pandas as pd + >>> import numpy as np + >>> a = pd.array([1, 2, 3], dtype="Int64") + >>> df = pd.DataFrame(a, columns=['a']) + >>> df['a'] = 0 + >>> a # not mutated + + [1, 2, 3] + Length: 3, dtype: Int64 + + +*New Behavior* + +In pandas 1.3.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than +mutating the existing array inplace. + +For NumPy's int64 dtype + +.. ipython:: python + + import pandas as pd + import numpy as np + a = np.array([1, 2, 3]) + df = pd.DataFrame(a, columns=['a']) + df['a'] = 0 + a # not mutated + +For :class:`Int64Dtype`. + +.. ipython:: python + + import pandas as pd + import numpy as np + a = pd.array([1, 2, 3], dtype="Int64") + df = pd.DataFrame(a, columns=['a']) + df['a'] = 0 + a # not mutated + +This also affects cases where a second ``Series`` or ``DataFrame`` is a view on a first ``DataFrame``. + +.. code-block:: python + + df = pd.DataFrame({"A": [1, 2, 3]}) + df2 = df[['A']] + df['A'] = np.array([0, 0, 0]) + +Previously, whether ``df2`` was mutated depending on the dtype of the array being assigned to. Now, a new array is consistently assigned, so ``df2`` is not mutated. .. _whatsnew_130.api_breaking.deps: @@ -127,7 +202,7 @@ Interval Indexing ^^^^^^^^ - +- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`35417`) - - From 3731fc8260e263d24d37d5ac6c14fe2509a03864 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 8 Dec 2020 08:44:20 -0800 Subject: [PATCH 21/44] restore 1.2.0 whatnsew --- doc/source/whatsnew/v1.2.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 96d76ba5877f0..4294871b56bcb 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -413,6 +413,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:` df.any() + .. _whatsnew_120.api_breaking.python: Increased minimum version for Python @@ -664,8 +665,7 @@ Interval Indexing ^^^^^^^^ -- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) -- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`35417`) +- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp``. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`) From 8503671d5a31781ac1220bec051eb38bb6434eb0 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 15:08:12 -0800 Subject: [PATCH 22/44] troubleshoot --- pandas/tests/frame/indexing/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 03d01f64b1eb4..eca7fb9b71993 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -168,7 +168,7 @@ def test_setitem_list_of_tuples(self, float_frame): DataFrame( [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], columns=["A", "B", "C", "D"], - dtype=np.intp, + dtype=np.int64, ), ), ( From fe5844137b8c36bb7af533126c23f5c8b65332b0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 11 Dec 2020 13:45:52 -0800 Subject: [PATCH 23/44] troubleshoot --- pandas/core/indexing.py | 6 ++++-- pandas/tests/frame/indexing/test_indexing.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d7f25f80a2b4f..75c951690bd38 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -678,12 +678,14 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): # Try to get the right dtype when we do this reindex. fv = None - if is_scalar(value): + if not is_list_like(value): fv = value + elif len(value) and not is_list_like(value[0]): + fv = value[0] else: dtype = maybe_infer_dtype_type(value) if dtype is not None: - fv = dtype.type(0) # TODO: or try to just do value[0]? + fv = dtype.type(0) self.obj._mgr = self.obj._mgr.reindex_axis( keys, diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index eca7fb9b71993..c1367af4bfa4b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -168,7 +168,6 @@ def test_setitem_list_of_tuples(self, float_frame): DataFrame( [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], columns=["A", "B", "C", "D"], - dtype=np.int64, ), ), ( From 1e605376f59e8206f44bf247f7227f942eca98f3 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 30 Dec 2020 14:14:22 -0800 Subject: [PATCH 24/44] flesh out tests --- pandas/tests/frame/indexing/test_indexing.py | 9 +++++++++ pandas/tests/indexing/test_iloc.py | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index c1367af4bfa4b..afe502a63f3b9 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -807,12 +807,21 @@ def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): # setting it triggers setting with copy sliced = float_frame.iloc[:, -3:] + # check that the setitem below is not a no-op + assert not (float_frame["C"] == 4).all() + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): sliced.loc[:, "C"] = 4.0 assert (float_frame["C"] == 4).all() + # GH#35417 setting with setitem creates a new array, so we get the warning + # but do not modify the original + with pytest.raises(com.SettingWithCopyError, match=msg): + sliced["C"] = 5.0 + assert (float_frame["C"] == 4).all() + def test_getitem_setitem_non_ix_labels(self): df = tm.makeTimeDataFrame() diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 75251c979e0c8..7da6fafc95db6 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -707,6 +707,11 @@ def test_identity_slice_returns_new_object(self): original_df.loc[:, "a"] = [4, 4, 4] assert (sliced_df["a"] == 4).all() + # GH#35417 but setting with setitem creates a new array, so + # sliced_df is not changed + original_df["a"] = [5, 5, 5] + assert (sliced_df["a"] == 4).all() + original_series = Series([1, 2, 3, 4, 5, 6]) sliced_series = original_series.iloc[:] assert sliced_series is not original_series From 9783bce07b27e93234ab33acb7891d179e06fa2e Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 16 Jan 2021 13:31:20 -0800 Subject: [PATCH 25/44] arraymanager mypy fixup --- pandas/core/internals/array_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 99edec3c606d4..7b9c91554ae49 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -658,7 +658,7 @@ def idelete(self, indexer): self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] self._axes = [self._axes[0], self._axes[1][to_keep]] - def iset(self, loc: Union[int, slice, np.ndarray], value): + def iset(self, loc: Union[int, slice, np.ndarray], value, inplace: bool = False): """ Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items From 51d102bd0a10baf0fe5ad16bebda170033024cf4 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Mar 2021 09:47:07 -0700 Subject: [PATCH 26/44] un-xfail --- pandas/tests/frame/indexing/test_setitem.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index e8cdcfcaafa86..2e0053d03370c 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -927,14 +927,8 @@ def test_setitem_duplicate_columns_not_inplace(self): tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("value", [1, np.array([[1], [1]]), [[1], [1]]]) - def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request): + def test_setitem_same_dtype_not_inplace(self, value): # GH#39510 - if not using_array_manager: - mark = pytest.mark.xfail( - reason="Setitem with same dtype still changing inplace" - ) - request.node.add_marker(mark) - cols = ["A", "B"] df = DataFrame(0, index=[0, 1], columns=cols) df_copy = df.copy() From 62f24378cfdf4c83c499d50d06f4fc1e7065160f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Mar 2021 07:39:08 -0700 Subject: [PATCH 27/44] 32bit compat --- pandas/tests/frame/indexing/test_setitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 2e0053d03370c..b66f28834519d 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -935,7 +935,7 @@ def test_setitem_same_dtype_not_inplace(self, value): df_view = df[:] df[["B"]] = value - expected = DataFrame([[0, 1], [0, 1]], columns=cols) + expected = DataFrame([[0, 1], [0, 1]], columns=cols, dtype=np.intp) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From 36b3302718d5aa52ccd28c7fadc66af57434ae92 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Mar 2021 13:42:03 -0700 Subject: [PATCH 28/44] troublehsoot --- pandas/tests/frame/indexing/test_setitem.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index b66f28834519d..730592578b79a 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -936,6 +936,7 @@ def test_setitem_same_dtype_not_inplace(self, value): df[["B"]] = value expected = DataFrame([[0, 1], [0, 1]], columns=cols, dtype=np.intp) + assert (expected.dtypes == np.intp).all() # troubleshoot 32bit builds tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From 37c9d228060e543152fe7243e3b81fa6d5ce5c32 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Mar 2021 16:26:28 -0700 Subject: [PATCH 29/44] troubleshoot 32bit builds --- pandas/tests/frame/indexing/test_setitem.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 730592578b79a..c72deeb5622a2 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -935,8 +935,8 @@ def test_setitem_same_dtype_not_inplace(self, value): df_view = df[:] df[["B"]] = value - expected = DataFrame([[0, 1], [0, 1]], columns=cols, dtype=np.intp) - assert (expected.dtypes == np.intp).all() # troubleshoot 32bit builds + expected = DataFrame([[0, 1], [0, 1]], columns=cols, dtype=int) + assert (expected.dtypes == int).all() # troubleshoot 32bit builds tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From 38d8106489f197cc57c12dd9aa73044564693c6b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Mar 2021 10:11:18 -0700 Subject: [PATCH 30/44] troubleshoot 32 bit --- pandas/tests/frame/indexing/test_setitem.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index c72deeb5622a2..6ee4f1555a9f2 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -935,8 +935,7 @@ def test_setitem_same_dtype_not_inplace(self, value): df_view = df[:] df[["B"]] = value - expected = DataFrame([[0, 1], [0, 1]], columns=cols, dtype=int) - assert (expected.dtypes == int).all() # troubleshoot 32bit builds + expected = DataFrame([[0, 1], [0, 1]], columns=cols, dtype=np.int64) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From 647a3939c89f236033c06afe120c8e879bf1f853 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Mar 2021 10:26:16 -0700 Subject: [PATCH 31/44] DOC: suppress warnings from CategoricalBlock deprecation --- doc/source/user_guide/io.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cf153ddd2cbbd..3b7a6037a9715 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5240,6 +5240,7 @@ Write to a feather file. Read from a feather file. .. ipython:: python + :okwarning: result = pd.read_feather("example.feather") result @@ -5323,6 +5324,7 @@ Write to a parquet file. Read from a parquet file. .. ipython:: python + :okwarning: result = pd.read_parquet("example_fp.parquet", engine="fastparquet") result = pd.read_parquet("example_pa.parquet", engine="pyarrow") From 8285ecef8f645a56f5b7054844c6af3a19212068 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Mar 2021 09:43:01 -0700 Subject: [PATCH 32/44] troubleshoot 32 bit builds --- pandas/tests/frame/indexing/test_setitem.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 6ee4f1555a9f2..47335dac803ff 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -935,6 +935,7 @@ def test_setitem_same_dtype_not_inplace(self, value): df_view = df[:] df[["B"]] = value + assert (df.dtypes == np.int64).all() # troubleshooting 32 bit builds expected = DataFrame([[0, 1], [0, 1]], columns=cols, dtype=np.int64) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From 270be1e107dbed54dd2eccac69d54b073900d688 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Mar 2021 09:47:09 -0700 Subject: [PATCH 33/44] troubleshoot 32bit builds --- pandas/tests/frame/indexing/test_setitem.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 47335dac803ff..805e1aa217ff7 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -935,8 +935,10 @@ def test_setitem_same_dtype_not_inplace(self, value): df_view = df[:] df[["B"]] = value - assert (df.dtypes == np.int64).all() # troubleshooting 32 bit builds - expected = DataFrame([[0, 1], [0, 1]], columns=cols, dtype=np.int64) + # 32bit builds end up with heterogeneous column dtypes + expected = DataFrame( + {"A": np.array([0, 1], dtype=np.int64), "B": np.array([0, 1], dtype=int)} + ) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From 9cf69c99ba84c5eb738782c0fd25be07b88b8897 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Mar 2021 10:36:53 -0700 Subject: [PATCH 34/44] typo fixup --- pandas/tests/frame/indexing/test_setitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 805e1aa217ff7..18ebf9f92afb5 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -937,7 +937,7 @@ def test_setitem_same_dtype_not_inplace(self, value): # 32bit builds end up with heterogeneous column dtypes expected = DataFrame( - {"A": np.array([0, 1], dtype=np.int64), "B": np.array([0, 1], dtype=int)} + {"A": np.array([0, 0], dtype=np.int64), "B": np.array([1, 1], dtype=int)} ) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From ca260c523cdb1fc9755327e1679084e021e2a73f Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Mar 2021 14:42:22 -0700 Subject: [PATCH 35/44] 32bit troubleshoot --- pandas/tests/frame/indexing/test_setitem.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 4237a39723616..f6b29abd58853 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -935,10 +935,17 @@ def test_setitem_same_dtype_not_inplace(self, value): df_view = df[:] df[["B"]] = value - # 32bit builds end up with heterogeneous column dtypes - expected = DataFrame( - {"A": np.array([0, 0], dtype=np.int64), "B": np.array([1, 1], dtype=int)} - ) + # 32bit builds have unfortunately clumsy behavior + if isinstance(value, np.ndarray): + expected = DataFrame( + { + "A": np.array([0, 0], dtype=np.int64), + "B": np.array([1, 1], dtype=int), + } + ) + else: + expected = DataFrame([[0, 1], [0, 1]], columns=cols) + tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From a1c5732f5ed86bc371dd92c7a9bb58dd02c937c4 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Mar 2021 12:22:20 -0700 Subject: [PATCH 36/44] revert troubleshoot --- pandas/tests/frame/indexing/test_setitem.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index a655a0fe7be8a..aa7a05d7519ca 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -937,16 +937,7 @@ def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, reques df_view = df[:] df[["B"]] = value - # 32bit builds have unfortunately clumsy behavior - if isinstance(value, np.ndarray): - expected = DataFrame( - { - "A": np.array([0, 0], dtype=np.int64), - "B": np.array([1, 1], dtype=int), - } - ) - else: - expected = DataFrame([[0, 1], [0, 1]], columns=cols) + expected = DataFrame([[0, 1], [0, 1]], columns=cols) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From 0821a60e5626ef03cdc5a29c79f5e16e1d02935f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 31 Mar 2021 18:05:19 -0700 Subject: [PATCH 37/44] update test --- pandas/core/internals/blocks.py | 2 -- pandas/tests/indexing/test_iloc.py | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9d86662b4d112..8a0c706ff9ce6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1483,8 +1483,6 @@ def iget(self, col): return self.values def set_inplace(self, locs, values): - # NB: This is a misnomer, is supposed to be inplace but is not, - # see GH#33457 assert locs.tolist() == [0] self.values[:] = values diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index c356e97224957..705ba40097b83 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -877,7 +877,9 @@ def test_iloc_setitem_categorical_updates_inplace( request.node.add_marker(mark) cat = Categorical(["A", "B", "C"]) - df = DataFrame({1: cat, 2: [1, 2, 3]}) + df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False) + blk = df._mgr.blocks[1] + assert blk.values is cat # This should modify our original values in-place df.iloc[:, 0] = cat[::-1] From a66afa058874a33b4c59fd32196c6922a9104afa Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 11 Apr 2021 14:53:14 -0700 Subject: [PATCH 38/44] revert duplicated whatsnew notes --- doc/source/whatsnew/v1.3.0.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index bcf14d4e6369e..24d8b110178e2 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -754,7 +754,6 @@ Indexing - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) - Bug in :meth:`DataFrame.loc`, :meth:`Series.loc`, :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` returning incorrect elements for non-monotonic :class:`DatetimeIndex` for string slices (:issue:`33146`) -- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`35417`) - Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` with timezone aware indexes raising ``TypeError`` for ``method="ffill"`` and ``method="bfill"`` and specified ``tolerance`` (:issue:`38566`) - Bug in :meth:`DataFrame.reindex` with ``datetime64[ns]`` or ``timedelta64[ns]`` incorrectly casting to integers when the ``fill_value`` requires casting to object dtype (:issue:`39755`) - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` with empty :class:`DataFrame` and specified columns for string indexer and non empty :class:`DataFrame` to set (:issue:`38831`) @@ -763,7 +762,6 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` not raising ``ValueError`` when right hand side is a :class:`DataFrame` with wrong number of columns (:issue:`38604`) - Bug in :meth:`Series.__setitem__` raising ``ValueError`` when setting a :class:`Series` with a scalar indexer (:issue:`38303`) - Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`) -- Bug in incorrectly raising in :meth:`Index.insert`, when setting a new column that cannot be held in the existing ``frame.columns``, or in :meth:`Series.reset_index` or :meth:`DataFrame.reset_index` instead of casting to a compatible dtype (:issue:`39068`) - Bug in :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` always raising ``KeyError`` when slicing with existing strings an :class:`Index` with milliseconds (:issue:`33589`) - Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, issue:`39619`) - Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`) From fb95732228637a5a2f44db8d1d940a511c394539 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 11 Apr 2021 16:55:43 -0700 Subject: [PATCH 39/44] revert unrelated test fix --- pandas/tests/indexing/test_iloc.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 705ba40097b83..3ab10ba7f94c8 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -865,21 +865,11 @@ def test_series_indexing_zerodim_np_array(self): result = s.iloc[np.array(0)] assert result == 1 - def test_iloc_setitem_categorical_updates_inplace( - self, using_array_manager, request - ): - # GH#35417 + @pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/33457") + def test_iloc_setitem_categorical_updates_inplace(self): # Mixed dtype ensures we go through take_split_path in setitem_with_indexer - if using_array_manager: - mark = pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33457" - ) - request.node.add_marker(mark) - cat = Categorical(["A", "B", "C"]) - df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False) - blk = df._mgr.blocks[1] - assert blk.values is cat + df = DataFrame({1: cat, 2: [1, 2, 3]}) # This should modify our original values in-place df.iloc[:, 0] = cat[::-1] From 4342f5d4adefc9af07dfee4f9f01d11a815c5314 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Nov 2021 10:17:33 -0800 Subject: [PATCH 40/44] ArrayManager fixup --- pandas/tests/indexing/test_loc.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 5ac7136eea4f9..7247ce9f852ed 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1013,13 +1013,8 @@ def test_loc_empty_list_indexer_is_ok(self): df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_array_manager, request): + def test_identity_slice_returns_new_object(self, request): # GH13873 - if using_array_manager: - mark = pytest.mark.xfail( - reason="setting with .loc[:, 'a'] does not alter inplace" - ) - request.node.add_marker(mark) original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.loc[:] From bebb12fe81b5688a081eaab2c23f6036edd65d61 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Nov 2021 09:28:43 -0800 Subject: [PATCH 41/44] avoid warning --- pandas/tests/io/formats/test_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index d9bd8f6809c73..3217a968c1f17 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1307,7 +1307,7 @@ def test_index_with_nan(self): # all-nan in mi df2 = df.copy() - df2.loc[:, "id2"] = np.nan + df2["id2"] = np.nan y = df2.set_index("id2") result = y.to_string() expected = ( From fe9fe6611734ea0bd09d9e19715fe26d8a560afc Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Nov 2021 14:59:26 -0800 Subject: [PATCH 42/44] fixed on AM --- pandas/tests/frame/methods/test_rename.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index 691f3945eb1c2..b55abc3f0f07c 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -4,8 +4,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Index, @@ -170,7 +168,6 @@ def test_rename_multiindex(self): renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view def test_rename_nocopy(self, float_frame): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) renamed["foo"][:] = 1.0 From 65243311106c374baedbf53feded645d89014216 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Dec 2021 12:37:32 -0800 Subject: [PATCH 43/44] revert whitespace change --- pandas/tests/frame/indexing/test_setitem.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 84d23d28d9466..cd0a0a0467742 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1112,7 +1112,6 @@ def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, reques df[["B"]] = value expected = DataFrame([[0, 1], [0, 1]], columns=cols) - tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df_view, df_copy) From 123568dafe8f7e651de8c252b185a44e89ca0e9e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Dec 2021 14:43:05 -0800 Subject: [PATCH 44/44] revert no-longer-necessary --- pandas/core/generic.py | 4 ---- pandas/core/indexing.py | 27 ++------------------------- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 774ccce3df35e..0034d0511c15e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -322,10 +322,6 @@ def _as_manager(self: NDFrameT, typ: str, copy: bool_t = True) -> NDFrameT: # fastpath of passing a manager doesn't check the option/manager class return self._constructor(new_mgr).__finalize__(self) - @property - def _uses_array_manager(self) -> bool_t: - return isinstance(self._mgr, ArrayManager) - # ---------------------------------------------------------------------- # attrs and flags diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 085c9ff86ebaa..71ebb45cf23b4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -18,7 +18,6 @@ from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.cast import maybe_infer_dtype_type from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, @@ -694,23 +693,8 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): # GH#38148 keys = self.obj.columns.union(key, sort=False) - # Try to get the right dtype when we do this reindex. - fv = None - if not is_list_like(value): - fv = value - elif len(value) and not is_list_like(value[0]): - fv = value[0] - else: - dtype = maybe_infer_dtype_type(value) - if dtype is not None: - fv = dtype.type(0) - self.obj._mgr = self.obj._mgr.reindex_axis( - keys, - axis=0, - consolidate=False, - only_slice=True, - fill_value=fv, + keys, axis=0, consolidate=False, only_slice=True ) def __setitem__(self, key, value): @@ -1604,6 +1588,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # essentially this separates out the block that is needed # to possibly be modified if self.ndim > 1 and i == info_axis: + # add the new item, and set the value # must have all defined axes if we have a scalar # or a list-like on the non-info axes if we have a @@ -1863,14 +1848,6 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # multi-dim object # GH#6149 (null slice), GH#10408 (full bounds) if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): - if not self.obj._uses_array_manager: - blk = ser._mgr.blocks[0] - if blk._can_hold_element(value) and is_scalar(value): - # FIXME: ExtensionBlock._can_hold_element - # We can do an inplace-setting, do it directly on _values - # to get our underlying - ser._values[plane_indexer] = value - return ser = value elif ( is_array_like(value)