Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Manual Backport PR #48417 on branch 1.5.x (Revert set_index inplace and copy keyword changes) #48552

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions doc/source/user_guide/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1723,12 +1723,13 @@ the given columns to a MultiIndex:
frame

Other options in ``set_index`` allow you to not drop the index columns or to add
the index without creating a copy of the underlying data:
the index in-place (without creating a new object):

.. ipython:: python

data.set_index('c', drop=False)
data.set_index(['a', 'b'], copy=False)
data.set_index(['a', 'b'], inplace=True)
data

Reset the index
~~~~~~~~~~~~~~~
Expand Down
2 changes: 0 additions & 2 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,6 @@ Other enhancements
- :meth:`DataFrame.quantile` gained a ``method`` argument that can accept ``table`` to evaluate multi-column quantiles (:issue:`43881`)
- :class:`Interval` now supports checking whether one interval is contained by another interval (:issue:`46613`)
- Added ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow users to set the axis on a new object without necessarily copying the underlying data (:issue:`47932`)
- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`)
- The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`)
- The ``Dockerfile`` now installs a dedicated ``pandas-dev`` virtual environment for pandas development instead of using the ``base`` environment (:issue:`48427`)

Expand Down Expand Up @@ -934,7 +933,6 @@ Other Deprecations
- Deprecated the ``inplace`` keyword in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis`, use ``obj = obj.set_axis(..., copy=False)`` instead (:issue:`48130`)
- Deprecated producing a single element when iterating over a :class:`DataFrameGroupBy` or a :class:`SeriesGroupBy` that has been grouped by a list of length 1; a tuple of length one will be returned instead (:issue:`42795`)
- Fixed up warning message of deprecation of :meth:`MultiIndex.lexsort_depth` as public method, as the message previously referred to :meth:`MultiIndex.is_lexsorted` instead (:issue:`38701`)
- Deprecated the ``inplace`` keyword in :meth:`DataFrame.set_index`, use ``df = df.set_index(..., copy=False)`` instead (:issue:`48115`)
- Deprecated the ``sort_columns`` argument in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`).
- Deprecated positional arguments for all but the first argument of :meth:`DataFrame.to_stata` and :func:`read_stata`, use keyword arguments instead (:issue:`48128`).
- Deprecated the ``mangle_dupe_cols`` argument in :func:`read_csv`, :func:`read_fwf`, :func:`read_table` and :func:`read_excel`. The argument was never implemented, and a new argument where the renaming pattern can be specified will be added instead (:issue:`47718`)
Expand Down
37 changes: 4 additions & 33 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5855,9 +5855,8 @@ def set_index(
*,
drop: bool = ...,
append: bool = ...,
inplace: Literal[False] | lib.NoDefault = ...,
inplace: Literal[False] = ...,
verify_integrity: bool = ...,
copy: bool | lib.NoDefault = ...,
) -> DataFrame:
...

Expand All @@ -5870,7 +5869,6 @@ def set_index(
append: bool = ...,
inplace: Literal[True],
verify_integrity: bool = ...,
copy: bool | lib.NoDefault = ...,
) -> None:
...

Expand All @@ -5880,9 +5878,8 @@ def set_index(
keys,
drop: bool = True,
append: bool = False,
inplace: bool | lib.NoDefault = lib.no_default,
inplace: bool = False,
verify_integrity: bool = False,
copy: bool | lib.NoDefault = lib.no_default,
) -> DataFrame | None:
"""
Set the DataFrame index using existing columns.
Expand All @@ -5905,18 +5902,10 @@ def set_index(
Whether to append columns to existing index.
inplace : bool, default False
Whether to modify the DataFrame rather than creating a new one.

.. deprecated:: 1.5.0

verify_integrity : bool, default False
Check the new index for duplicates. Otherwise defer the check until
necessary. Setting to False will improve the performance of this
method.
copy : bool, default True
Whether to make a copy of the underlying data when returning a new
DataFrame.

.. versionadded:: 1.5.0

Returns
-------
Expand Down Expand Up @@ -5981,25 +5970,7 @@ def set_index(
3 9 7 2013 84
4 16 10 2014 31
"""
if inplace is not lib.no_default:
inplace = validate_bool_kwarg(inplace, "inplace")
warnings.warn(
"The 'inplace' keyword in DataFrame.set_index is deprecated "
"and will be removed in a future version. Use "
"`df = df.set_index(..., copy=False)` instead.",
FutureWarning,
stacklevel=find_stack_level(inspect.currentframe()),
)
else:
inplace = False

if inplace:
if copy is not lib.no_default:
raise ValueError("Cannot specify copy when inplace=True")
copy = False
elif copy is lib.no_default:
copy = True

inplace = validate_bool_kwarg(inplace, "inplace")
self._check_inplace_and_allows_duplicate_labels(inplace)
if not isinstance(keys, list):
keys = [keys]
Expand Down Expand Up @@ -6035,7 +6006,7 @@ def set_index(
if inplace:
frame = self
else:
frame = self.copy(deep=copy)
frame = self.copy()

arrays = []
names: list[Hashable] = []
Expand Down
19 changes: 8 additions & 11 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,9 +783,9 @@ def get_result(self, copy: bool = True) -> DataFrame:
if self.indicator:
result = self._indicator_post_merge(result)

result = self._maybe_add_join_keys(result, left_indexer, right_indexer)
self._maybe_add_join_keys(result, left_indexer, right_indexer)

result = self._maybe_restore_index_levels(result)
self._maybe_restore_index_levels(result)

self._maybe_drop_cross_column(result, self._cross)

Expand Down Expand Up @@ -852,7 +852,7 @@ def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
return result

def _maybe_restore_index_levels(self, result: DataFrame) -> DataFrame:
def _maybe_restore_index_levels(self, result: DataFrame) -> None:
"""
Restore index levels specified as `on` parameters

Expand All @@ -870,7 +870,7 @@ def _maybe_restore_index_levels(self, result: DataFrame) -> DataFrame:

Returns
-------
DataFrame
None
"""
names_to_restore = []
for name, left_key, right_key in zip(
Expand All @@ -894,15 +894,14 @@ def _maybe_restore_index_levels(self, result: DataFrame) -> DataFrame:
names_to_restore.append(name)

if names_to_restore:
result = result.set_index(names_to_restore, copy=False)
return result
result.set_index(names_to_restore, inplace=True)

def _maybe_add_join_keys(
self,
result: DataFrame,
left_indexer: np.ndarray | None,
right_indexer: np.ndarray | None,
) -> DataFrame:
) -> None:

left_has_missing = None
right_has_missing = None
Expand Down Expand Up @@ -993,12 +992,11 @@ def _maybe_add_join_keys(
for level_name in result.index.names
]

result = result.set_index(idx_list, copy=False)
result.set_index(idx_list, inplace=True)
else:
result.index = Index(key_col, name=name)
else:
result.insert(i, name or f"key_{i}", key_col)
return result

def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
"""return the join indexers"""
Expand Down Expand Up @@ -1768,8 +1766,7 @@ def get_result(self, copy: bool = True) -> DataFrame:
result = self._reindex_and_concat(
join_index, left_join_indexer, right_join_indexer, copy=copy
)

result = self._maybe_add_join_keys(result, left_indexer, right_indexer)
self._maybe_add_join_keys(result, left_indexer, right_indexer)

return result

Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
# String case
if item not in frame.columns:
raise ValueError(f"Index {item} invalid")
frame = frame.set_index(self.index_col, drop=True, copy=False)
frame.set_index(self.index_col, drop=True, inplace=True)
# Clear names if headerless and no name given
if self.header is None and not multi_index_named:
frame.index.names = [None] * len(frame.index.names)
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4667,7 +4667,7 @@ def read(
columns.insert(0, n)
s = super().read(where=where, columns=columns, start=start, stop=stop)
if is_multi_index:
s = s.set_index(self.levels, copy=False)
s.set_index(self.levels, inplace=True)

s = s.iloc[:, 0]

Expand Down
6 changes: 3 additions & 3 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def _wrap_result(
frame = _parse_date_columns(frame, parse_dates)

if index_col is not None:
frame = frame.set_index(index_col, copy=False)
frame.set_index(index_col, inplace=True)

return frame

Expand Down Expand Up @@ -980,7 +980,7 @@ def _query_iterator(
self._harmonize_columns(parse_dates=parse_dates)

if self.index is not None:
self.frame = self.frame.set_index(self.index, copy=False)
self.frame.set_index(self.index, inplace=True)

yield self.frame

Expand Down Expand Up @@ -1021,7 +1021,7 @@ def read(
self._harmonize_columns(parse_dates=parse_dates)

if self.index is not None:
self.frame = self.frame.set_index(self.index, copy=False)
self.frame.set_index(self.index, inplace=True)

return self.frame

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,8 +387,8 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
)
df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype)
df = df.set_index(["a", "b"], copy=False)
df2 = df2.set_index(["a", "b"], copy=False)
df.set_index(["a", "b"], inplace=True)
df2.set_index(["a", "b"], inplace=True)
result = df.combine_first(df2)
expected = DataFrame(
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
Expand Down
26 changes: 1 addition & 25 deletions pandas/tests/frame/methods/test_set_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,6 @@


class TestSetIndex:
def test_set_index_copy(self):
# GH#48043
df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
expected = DataFrame({"B": [3, 4], "C": [5, 6]}, index=Index([1, 2], name="A"))

res = df.set_index("A", copy=True)
tm.assert_frame_equal(res, expected)
assert not any(tm.shares_memory(df[col], res[col]) for col in res.columns)

res = df.set_index("A", copy=False)
tm.assert_frame_equal(res, expected)
assert all(tm.shares_memory(df[col], res[col]) for col in res.columns)

msg = "Cannot specify copy when inplace=True"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match="The 'inplace'"):
df.set_index("A", inplace=True, copy=True)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match="The 'inplace'"):
df.set_index("A", inplace=True, copy=False)

def test_set_index_multiindex(self):
# segfault in GH#3308
d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}
Expand Down Expand Up @@ -199,10 +178,7 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys):

if inplace:
result = df.copy()
with tm.assert_produces_warning(
FutureWarning, match="The 'inplace' keyword"
):
return_value = result.set_index(keys, drop=drop, inplace=True)
return_value = result.set_index(keys, drop=drop, inplace=True)
assert return_value is None
else:
result = df.set_index(keys, drop=drop)
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/frame/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,7 @@ def _check_f(base, f):

# set_index
f = lambda x: x.set_index("a", inplace=True)
with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"):
_check_f(data.copy(), f)
_check_f(data.copy(), f)

# reset_index
f = lambda x: x.reset_index(inplace=True)
Expand Down
18 changes: 6 additions & 12 deletions pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,8 +436,7 @@ def test_date_index_query(self):
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"):
return_value = df.set_index("dates1", inplace=True, drop=True)
return_value = df.set_index("dates1", inplace=True, drop=True)
assert return_value is None
res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
Expand All @@ -450,8 +449,7 @@ def test_date_index_query_with_NaT(self):
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.iloc[0, 0] = pd.NaT
with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"):
return_value = df.set_index("dates1", inplace=True, drop=True)
return_value = df.set_index("dates1", inplace=True, drop=True)
assert return_value is None
res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
Expand All @@ -465,8 +463,7 @@ def test_date_index_query_with_NaT_duplicates(self):
d["dates3"] = date_range("1/1/2014", periods=n)
df = DataFrame(d)
df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"):
return_value = df.set_index("dates1", inplace=True, drop=True)
return_value = df.set_index("dates1", inplace=True, drop=True)
assert return_value is None
res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)]
Expand Down Expand Up @@ -797,8 +794,7 @@ def test_date_index_query(self):
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"):
return_value = df.set_index("dates1", inplace=True, drop=True)
return_value = df.set_index("dates1", inplace=True, drop=True)
assert return_value is None
res = df.query(
"(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
Expand All @@ -813,8 +809,7 @@ def test_date_index_query_with_NaT(self):
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.iloc[0, 0] = pd.NaT
with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"):
return_value = df.set_index("dates1", inplace=True, drop=True)
return_value = df.set_index("dates1", inplace=True, drop=True)
assert return_value is None
res = df.query(
"(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
Expand All @@ -829,8 +824,7 @@ def test_date_index_query_with_NaT_duplicates(self):
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
with tm.assert_produces_warning(FutureWarning, match="The 'inplace' keyword"):
return_value = df.set_index("dates1", inplace=True, drop=True)
return_value = df.set_index("dates1", inplace=True, drop=True)
assert return_value is None
msg = r"'BoolOp' nodes are not implemented"
with pytest.raises(NotImplementedError, match=msg):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ def test_apply_groupby_datetimeindex():
result = df.groupby("Name").sum()

expected = DataFrame({"Name": ["A", "B", "C"], "Value": [10, 50, 90]})
expected = expected.set_index("Name", copy=False)
expected.set_index("Name", inplace=True)

tm.assert_frame_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_builtins_apply(keys, f):

if f != sum:
expected = gb.agg(fname).reset_index()
expected = expected.set_index(keys, copy=False, drop=False)
expected.set_index(keys, inplace=True, drop=False)
tm.assert_frame_equal(result, expected, check_dtype=False)

tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())
Expand Down Expand Up @@ -454,7 +454,7 @@ def test_groupby_non_arithmetic_agg_types(dtype, method, data):
df_out = DataFrame(exp)

df_out["b"] = df_out.b.astype(out_type)
df_out = df_out.set_index("a", copy=False)
df_out.set_index("a", inplace=True)

grpd = df.groupby("a")
t = getattr(grpd, method)(*data["args"])
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/indexes/multi/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_insert(idx):
idx.insert(0, ("foo2",))

left = pd.DataFrame([["a", "b", 0], ["b", "d", 1]], columns=["1st", "2nd", "3rd"])
left = left.set_index(["1st", "2nd"], copy=False)
left.set_index(["1st", "2nd"], inplace=True)
ts = left["3rd"].copy(deep=True)

left.loc[("b", "x"), "3rd"] = 2
Expand Down Expand Up @@ -65,7 +65,7 @@ def test_insert(idx):
],
columns=["1st", "2nd", "3rd"],
)
right = right.set_index(["1st", "2nd"], copy=False)
right.set_index(["1st", "2nd"], inplace=True)
# FIXME data types changes to float because
# of intermediate nan insertion;
tm.assert_frame_equal(left, right, check_dtype=False)
Expand Down
Loading