diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c26a8a40d97b8..3268575c7064d 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1147,8 +1147,8 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types, - has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) +- :meth:`DataFrame.set_index` now gives a better (and less frequent) ``KeyError``, raises a ``ValueError`` for incorrect types, + and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`). diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 309fb3b841461..b4f79bda25517 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4042,12 +4042,16 @@ def set_index(self, keys, drop=True, append=False, inplace=False, Set the DataFrame index using existing columns. Set the DataFrame index (row labels) using one or more existing - columns. The index can replace the existing index or expand on it. 
+ columns or arrays (of the correct length). The index can replace the + existing index or expand on it. Parameters ---------- - keys : label or list of label - Name or names of the columns that will be used as the index. + keys : label or array-like or list of labels/arrays + This parameter can be either a single column key, a single array of + the same length as the calling DataFrame, or a list containing an + arbitrary combination of column keys and arrays. Here, "array" + encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. drop : bool, default True Delete columns to be used as the new index. append : bool, default False @@ -4092,7 +4096,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 7 2013 84 10 2014 31 - Create a multi-index using columns 'year' and 'month': + Create a MultiIndex using columns 'year' and 'month': >>> df.set_index(['year', 'month']) sale @@ -4102,35 +4106,51 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 2013 7 84 2014 10 31 - Create a multi-index using a set of values and a column: + Create a MultiIndex using an Index and a column: - >>> df.set_index([[1, 2, 3, 4], 'year']) + >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) month sale year 1 2012 1 55 2 2014 4 40 3 2013 7 84 4 2014 10 31 + + Create a MultiIndex using two Series: + + >>> s = pd.Series([1, 2, 3, 4]) + >>> df.set_index([s, s**2]) + month year sale + 1 1 1 2012 55 + 2 4 4 2014 40 + 3 9 7 2013 84 + 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, 'inplace') - if not isinstance(keys, list): + + err_msg = ('The parameter "keys" may be a column key, one-dimensional ' + 'array, or a list containing only valid column keys and ' + 'one-dimensional arrays.') + + if (is_scalar(keys) or isinstance(keys, tuple) + or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))): + # make sure we have a container of keys/arrays we can iterate over + # tuples can appear as valid column keys! 
keys = [keys] + elif not isinstance(keys, list): + raise ValueError(err_msg) missing = [] for col in keys: - if (is_scalar(col) or isinstance(col, tuple)) and col in self: - # tuples can be both column keys or list-likes - # if they are valid column keys, everything is fine - continue - elif is_scalar(col) and col not in self: - # tuples that are not column keys are considered list-like, - # not considered missing - missing.append(col) - elif (not is_list_like(col, allow_sets=False) + if (is_scalar(col) or isinstance(col, tuple)): + # if col is a valid column key, everything is fine + # tuples are always considered keys, never as list-likes + if col not in self: + missing.append(col) + elif (not isinstance(col, (ABCIndexClass, ABCSeries, + np.ndarray, list)) or getattr(col, 'ndim', 1) > 1): - raise TypeError('The parameter "keys" may only contain a ' - 'combination of valid column keys and ' - 'one-dimensional list-likes') + raise ValueError(err_msg) if missing: raise KeyError('{}'.format(missing)) @@ -4163,12 +4183,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False, elif isinstance(col, (list, np.ndarray)): arrays.append(col) names.append(None) - elif (is_list_like(col) - and not (isinstance(col, tuple) and col in self)): - # all other list-likes (but avoid valid column keys) - col = list(col) # ensure iterator do not get read twice etc. 
- arrays.append(col) - names.append(None) # from here, col can only be a column label else: arrays.append(frame[col]._values) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index b63151dfb459e..c2355742199dc 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -118,7 +118,7 @@ def test_set_index_after_mutation(self): # Add list-of-list constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) @pytest.mark.parametrize('box', [Series, Index, np.array, - list, tuple, iter, lambda x: [list(x)], + list, lambda x: [list(x)], lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'B'), (True, 'test'), (False, None)]) @@ -135,7 +135,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, with pytest.raises(KeyError, match=msg): df.set_index(key, drop=drop, append=append) else: - # np.array/tuple/iter/list-of-list "forget" the name of B + # np.array/list-of-list "forget" the name of B name_mi = getattr(key, 'names', None) name = [getattr(key, 'name', None)] if name_mi is None else name_mi @@ -150,8 +150,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) - @pytest.mark.parametrize('box', [Series, Index, np.array, - list, tuple, iter, + @pytest.mark.parametrize('box', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'A'), (True, 'B'), @@ -163,7 +162,7 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, df.index.name = index_name keys = ['A', box(df['B'])] - # np.array/list/tuple/iter "forget" the name of B + # np.array/list "forget" the name of B names = ['A', None if box in [np.array, list, tuple, iter] else 
'B'] result = df.set_index(keys, drop=drop, append=append) @@ -179,12 +178,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, - list, tuple, iter, + @pytest.mark.parametrize('box2', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, - list, tuple, iter, + @pytest.mark.parametrize('box1', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('append, index_name', [(True, None), @@ -198,9 +195,6 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, keys = [box1(df['A']), box2(df['A'])] result = df.set_index(keys, drop=drop, append=append) - # if either box was iter, the content has been consumed; re-read it - keys = [box1(df['A']), box2(df['A'])] - # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; # use "is" because == would give ambiguous Boolean error for containers @@ -208,7 +202,7 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise - # box = list would be illegal + # box = list would be interpreted as keys expected = df.set_index([keys[0]], drop=first_drop, append=append) expected = expected.set_index([keys[1]], drop=drop, append=True) tm.assert_frame_equal(result, expected) @@ -238,7 +232,7 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_raise(self, frame_of_index_cols, drop, 
append): + def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): df = frame_of_index_cols with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"): @@ -249,14 +243,31 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append): with pytest.raises(KeyError, match='X'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) - msg = 'The parameter "keys" may only contain a combination of.*' - # forbidden type, e.g. set - with pytest.raises(TypeError, match=msg): - df.set_index(set(df['A']), drop=drop, append=append) + msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]" + # tuples always raise KeyError + with pytest.raises(KeyError, match=msg): + df.set_index(tuple(df['A']), drop=drop, append=append) + + # also within a list + with pytest.raises(KeyError, match=msg): + df.set_index(['A', df['A'], tuple(df['A'])], + drop=drop, append=append) + + @pytest.mark.parametrize('append', [True, False]) + @pytest.mark.parametrize('drop', [True, False]) + @pytest.mark.parametrize('box', [set, iter]) + def test_set_index_raise_on_type(self, frame_of_index_cols, box, + drop, append): + df = frame_of_index_cols + + msg = 'The parameter "keys" may be a column key, .*' + # forbidden type, e.g. set/iter + with pytest.raises(ValueError, match=msg): + df.set_index(box(df['A']), drop=drop, append=append) - # forbidden type in list, e.g. set - with pytest.raises(TypeError, match=msg): - df.set_index(['A', df['A'], set(df['A'])], + # forbidden type in list, e.g. set/iter + with pytest.raises(ValueError, match=msg): + df.set_index(['A', df['A'], box(df['A'])], drop=drop, append=append) def test_construction_with_categorical_index(self):