From a4ab763e67d4ed7c97dfc559705ba0e6b4aff7b0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Aug 2022 13:42:59 -0700 Subject: [PATCH 1/3] ENH: set_index copy kwd --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/frame.py | 15 ++++++++++++++- pandas/tests/frame/methods/test_set_index.py | 18 ++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6bf0f288645b7..ff768c4e91599 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -293,6 +293,7 @@ Other enhancements - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) - :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the result's names of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support a ``copy`` argument. If ``False``, the underlying data is not copied in the returned object (:issue:`47934`) +- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`??`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0a7a6494d04eb..c75ced620242a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5824,6 +5824,7 @@ def set_index( append: bool = ..., inplace: Literal[False] = ..., verify_integrity: bool = ..., + copy: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -5836,6 +5837,7 @@ def set_index( append: bool = ..., inplace: Literal[True], verify_integrity: bool = ..., + copy: bool | lib.NoDefault = ..., ) -> None: ... @@ -5847,6 +5849,7 @@ def set_index( append: bool = False, inplace: bool = False, verify_integrity: bool = False, + copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | None: """ Set the DataFrame index using existing columns. @@ -5873,6 +5876,11 @@ def set_index( Check the new index for duplicates. Otherwise defer the check until necessary. Setting to False will improve the performance of this method. + copy : bool, default True + Whether to make a copy of the underlying data when returning a new + DataFrame. + + .. versionadded:: 1.5.0 Returns ------- @@ -5938,6 +5946,11 @@ def set_index( 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if copy is not lib.no_default: + raise ValueError("Cannot specify copy when inplace=True") + copy = False + self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] @@ -5973,7 +5986,7 @@ def set_index( if inplace: frame = self else: - frame = self.copy() + frame = self.copy(deep=copy) arrays = [] names: list[Hashable] = [] diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 4c39cf99f18ff..c70b14c3995a3 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -25,6 +25,24 @@ class TestSetIndex: + def test_set_index_copy(self): + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + expected = DataFrame({"B": [3, 4], "C": [5, 6]}, index=Index([1, 2], name="A")) + + res = df.set_index("A", copy=True) + tm.assert_frame_equal(res, expected) + assert not any(tm.shares_memory(df[col], res[col]) for col in res.columns) + + res = df.set_index("A", copy=False) + tm.assert_frame_equal(res, expected) + assert all(tm.shares_memory(df[col], res[col]) for col in res.columns) + + msg = "Cannot specify copy when inplace=True" + with pytest.raises(ValueError, match=msg): + df.set_index("A", inplace=True, copy=True) + with pytest.raises(ValueError, match=msg): + df.set_index("A", inplace=True, copy=False) + def test_set_index_multiindex(self): # segfault in GH#3308 d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} From 0e077d87ecbab1298bf2dac55b061f174b02c81c Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Aug 2022 13:44:27 -0700 Subject: [PATCH 2/3] GH ref --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/tests/frame/methods/test_set_index.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index ff768c4e91599..599ef6150168a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -293,7 +293,7 @@ Other enhancements - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) - :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the result's names of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support a ``copy`` argument. If ``False``, the underlying data is not copied in the returned object (:issue:`47934`) -- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`??`) +- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index c70b14c3995a3..9392d3c146942 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -26,6 +26,7 @@ class TestSetIndex: def test_set_index_copy(self): + # GH#48043 df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) expected = DataFrame({"B": [3, 4], "C": [5, 6]}, index=Index([1, 2], name="A")) From ea15dd0a61cd7544ab9592081522856129ad1bb5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Aug 2022 07:39:24 -0700 Subject: [PATCH 3/3] mypy fixup --- pandas/core/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c75ced620242a..8c4924a2483be 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5950,6 +5950,8 @@ def set_index( if copy is not lib.no_default: raise ValueError("Cannot specify copy when inplace=True") copy = False + elif copy is lib.no_default: + copy = True self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list):