From 97e299658a97138fa84a56a0d2349e110a674a51 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 24 Apr 2018 19:11:02 +0200 Subject: [PATCH 01/10] API: str.cat will align on index (collected) --- doc/source/text.rst | 77 +++++++- doc/source/whatsnew/v0.23.0.txt | 33 ++++ pandas/core/strings.py | 318 +++++++++++++++++++++++++++----- pandas/tests/series/test_api.py | 4 +- pandas/tests/test_strings.py | 300 ++++++++++++++++++++++++++---- 5 files changed, 645 insertions(+), 87 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index bcbf7f0ef78d7..728c41643577b 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -247,27 +247,84 @@ Missing values on either side will result in missing values in the result as wel s.str.cat(t) s.str.cat(t, na_rep='-') -Series are *not* aligned on their index before concatenation: +Concatenating a Series and something array-like into a Series +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. versionadded:: 0.23.0 + +The parameter ``others`` can also be two-dimensional. In this case, the number or rows must match the lengths of the calling ``Series`` (or ``Index``). .. ipython:: python - u = pd.Series(['b', 'd', 'e', 'c'], index=[1, 3, 4, 2]) - # without alignment + d = pd.concat([t, s], axis=1) + d + s.str.cat(d, na_rep='-') + +Concatenating a Series and an indexed object into a Series, with alignment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. versionadded:: 0.23.0 + +For concatenation with a ``Series`` or ``DataFrame``, it is possible to align the respective indexes before concatenation by setting +the ``join``-keyword, which controls the manner of alignment. + +.. ipython:: python + + u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2]) s.str.cat(u) - # with separate alignment - v, w = s.align(u) - v.str.cat(w, na_rep='-') + s.str.cat(u, join='left') + +.. 
warning:: + + If the ``join`` keyword is not passed, the method :meth:`~Series.str.cat` will currently fall back to the behavior before version 0.23.0 (i.e. no alignment), + but a ``FutureWarning`` will be raised, since this default will change to ``join='left'`` in a future version. + +The usual options are available for ``join`` (one of ``'left', 'outer', 'inner', 'right'``). +In particular, alignment also means that the different lengths do not need to coincide anymore. + +.. ipython:: python + + v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4]) + s.str.cat(v, join='left', na_rep='-') + s.str.cat(v, join='outer', na_rep='-') + +The same alignment can be used when ``others`` is a ``DataFrame``: + +.. ipython:: python + + f = d.loc[[3, 2, 1, 0], :] + f + s.str.cat(f, join='left', na_rep='-') Concatenating a Series and many objects into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -List-likes (excluding iterators, ``dict``-views, etc.) can be arbitrarily combined in a list. -All elements of the list must match in length to the calling ``Series`` (resp. ``Index``): +All list-likes (as well as ``DataFrame`` and two-dimensional ``ndarray``) can be arbitrarily combined in a list-like container: + +.. ipython:: python + + s.str.cat([u, t.values, ['A', 'B', 'C', 'D'], d.values, f], na_rep='-') + +All elements must match in length to the calling ``Series``, except those having an index if ``join`` is not None: .. ipython:: python - x = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D']) - s.str.cat([['A', 'B', 'C', 'D'], s, s.values, x.index]) + s.str.cat([u, v, ['A', 'B', 'C', 'D'], d.values, f.loc[[1]]], + join='outer', na_rep='-') + +If using ``join='right'`` on a list of ``others`` that contains different indexes, +the union of these indexes will be used as the basis for the final concatenation: + +..
ipython:: python + + s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join='right', na_rep='-') + +Finally, the surrounding container can also be an :obj:`Iterable` other than a ``list`` (e.g. an iterator, or a ``dict``-view, etc.): + +.. ipython:: python + + from collections import OrderedDict + s.str.cat(d.to_dict('series', into=OrderedDict).values(), na_rep='-') Indexing with ``.str`` ---------------------- diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c194d98a89789..f023cd54cf9a1 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -308,6 +308,39 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python df.assign(A=df.A+1, C= lambda df: df.A* -1) +.. _whatsnew_0230.enhancements.str_cat_align: + +``Series.str.cat`` has gained the ``join`` kwarg +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, :meth:`Series.str.cat` did not -- in contrast to most of ``pandas`` -- align :class:`Series` on their index before concatenation (see :issue:`18657`). +The method has now gained a keyword ``join`` to control the manner of alignment. In v.0.23 it will default to None (meaning no alignment), but this default will change +to ``'left'`` in a future version of pandas. + +.. ipython:: python + + s = pd.Series(['a', 'b', 'c', 'd']) + t = pd.Series(['b', 'd', 'e', 'c'], index=[1, 3, 4, 2]) + s.str.cat(t) + s.str.cat(t, join='left', na_rep='-') + +In particular, ``others`` does not need to be of the same length as the calling ``Series`` (if both have an index and ``join is not None``). +For more examples, see :ref:`here `. + +Additionally, ``str.cat`` now allows ``others`` to be a ``DataFrame`` or two-dimensional ``np.ndarray``. + +.. 
ipython:: python + + u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2]) + d = pd.concat([s, u], axis=1) + t.str.cat(d.values) + s.str.cat(d, join='left', na_rep='-') + +Furthermore, any combination of "concatenateable" arguments can be passed in a list-like container (e.g. an iterator). + +For categorical data, it is now possible to call :meth:`Series.str.cat` for ``CategoricalIndex`` as well (previously raised a ``ValueError``). +Finally, if ``others is not None``, the resulting ``Series``/``Index`` will now remain categorical if the calling +``Series``/``Index`` is categorical. .. _whatsnew_0230.enhancements.astype_category: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index c6d45ce5413ac..0b575f0cb0838 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -65,7 +65,7 @@ def _get_array_list(arr, others): def str_cat(arr, others=None, sep=None, na_rep=None): """ - Concatenate strings in the Series/Index with given separator. + Auxiliary function for :meth:`str.cat` If `others` is specified, this function concatenates the Series/Index and elements of `others` element-wise. @@ -84,42 +84,9 @@ def str_cat(arr, others=None, sep=None, na_rep=None): Returns ------- - concat : Series/Index of objects or str - - See Also - -------- - split : Split each string in the Series/Index - - Examples - -------- - When not passing `other`, all values are concatenated into a single - string: - - >>> s = pd.Series(['a', 'b', np.nan, 'c']) - >>> s.str.cat(sep=' ') - 'a b c' - - By default, NA values in the Series are ignored. Using `na_rep`, they - can be given a representation: - - >>> pd.Series(['a', 'b', np.nan, 'c']).str.cat(sep=' ', na_rep='?') - 'a b ? c' - - If `others` is specified, corresponding values are - concatenated with the separator. Result will be a Series of strings. - - >>> pd.Series(['a', 'b', 'c']).str.cat(['A', 'B', 'C'], sep=',') - 0 a,A - 1 b,B - 2 c,C - dtype: object - - Also, you can pass a list of list-likes. 
- - >>> pd.Series(['a', 'b']).str.cat([['x', 'y'], ['1', '2']], sep=',') - 0 a,x,1 - 1 b,y,2 - dtype: object + concat + ndarray containing concatenated results (if `others is not None`) + or str (if `others is None`) """ if sep is None: sep = '' @@ -1833,7 +1800,8 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): self._validate(data) self._is_categorical = is_categorical_dtype(data) - self._data = data.cat.categories if self._is_categorical else data + # .values.categories works for both Series/Index + self._data = data.values.categories if self._is_categorical else data # save orig to blow up categoricals to the right type self._orig = data self._freeze() @@ -1859,7 +1827,11 @@ def _validate(data): # see src/inference.pyx which can contain string values allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') - if data.inferred_type not in allowed_types: + if is_categorical_dtype(data.dtype): + inf_type = data.categories.inferred_type + else: + inf_type = data.inferred_type + if inf_type not in allowed_types: message = ("Can only use .str accessor with string values " "(i.e. inferred_type is 'string', 'unicode' or " "'mixed')") @@ -1962,11 +1934,269 @@ def cons_row(x): cons = self._orig._constructor return cons(result, name=name, index=index) - @copy(str_cat) - def cat(self, others=None, sep=None, na_rep=None): - data = self._orig if self._is_categorical else self._data - result = str_cat(data, others=others, sep=sep, na_rep=na_rep) - return self._wrap_result(result, use_codes=(not self._is_categorical)) + def _str_cat_los(self, input, ignore_index=False): + """ + Auxiliary function for :meth:`str.cat`. Turn potentially mixed input + into list of Series (elements without an index must match the length of + the calling Series/Index). 
+ + Parameters + ---------- + input : Series, DataFrame, np.ndarrary, list-like or list-like of those + ignore_index : Boolean + Determines whether to forcefully align with index of the caller + + Returns + ------- + tuple : first element: input transformed into list of Series + second element: Boolean whether FutureWarning should be raised + """ + + # once str.cat defaults to alignment, this function can be simplified; + # will not need `ignore_index` and the second boolean output anymore + + from pandas.core.index import Index + from pandas.core.series import Series + from pandas.core.frame import DataFrame + + # self._orig is either Series or Index + idx = self._orig if isinstance(self._orig, Index) else self._orig.index + + if isinstance(input, Series): + los = [Series(input.values, index=idx) if ignore_index else input] + return (los, True) + elif isinstance(input, Index): + los = [Series(input.values, + index=(idx if ignore_index else input))] + return (los, True) + elif isinstance(input, DataFrame): + if ignore_index: + # without copy, this could change (the corresponding list + # element of) "others" that was passed to str.cat + input = input.copy() + input.index = idx + return ([input[x] for x in input], True) + elif isinstance(input, np.ndarray) and input.ndim == 2: + input = DataFrame(input, index=idx) + return ([input[x] for x in input], False) + elif is_list_like(input): + input = list(input) # ensure iterators do not get read twice, etc. + if all(is_list_like(x) for x in input): + los = [] + fuwa = False + while input: + tmp = self._str_cat_los(input.pop(0), + ignore_index=ignore_index) + los = los + tmp[0] + fuwa = fuwa or tmp[1] + return (los, fuwa) + else: + return ([Series(input, index=idx)], False) + else: + raise ValueError('input must be Series, Index, DataFrame, ' + 'np.ndarrary or list-like') + + def cat(self, others=None, sep=None, na_rep=None, join=None): + """ + Concatenate strings in the Series/Index with given separator. 
+ + If `others` is specified, this function concatenates the Series/Index + and elements of `others` element-wise. + If `others` is not passed, then all values in the Series/Index are + concatenated into a single string with a given `sep`. + + Parameters + ---------- + others : Series, Index, DataFrame, np.ndarray or list-like + Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and + other list-likes of strings must have the same length as the + calling Series/Index, with the exception of indexed objects (i.e. + Series/Index/DataFrame) if `join` is not None. + + If others is a list-like that contains an arbitrary combination of + the above, then all elements will be unpacked and must satisfy the + above criteria individually. + + If others is None, the method returns the concatenation of all + strings in the calling Series/Index. + sep : string or None, default None + If None, concatenates without any separator. + na_rep : string or None, default None + Representation that is inserted for all missing values: + + - If `na_rep` is None, and `others` is None, missing values in the + Series/Index are omitted from the result. + - If `na_rep` is None, and `others` is not None, a row containing a + missing value in any of the columns (before concatenation) will + have a missing value in the result. + join : {'left', 'right', 'outer', 'inner'}, default None + Determines the join-style between the calling Series/Index and any + Series/Index/DataFrame in `others` (objects without an index need + to match the length of the calling Series/Index). If None, + alignment is disabled, but this option will be removed in a future + version of pandas and replaced with a default of `'left'`. To + disable alignment, use `.values` on any Series/Index/DataFrame in + `others`. + + .. versionadded:: 0.23.0 + + Returns + ------- + concat : str if `others is None`, Series/Index of objects if `others is + not None`.
In the latter case, the result will remain categorical + if the calling Series/Index is categorical. + + See Also + -------- + split : Split each string in the Series/Index + + Examples + -------- + When not passing `others`, all values are concatenated into a single + string: + + >>> s = pd.Series(['a', 'b', np.nan, 'd']) + >>> s.str.cat(sep=' ') + 'a b d' + + By default, NA values in the Series are ignored. Using `na_rep`, they + can be given a representation: + + >>> s.str.cat(sep=' ', na_rep='?') + 'a b ? d' + + If `others` is specified, corresponding values are concatenated with + the separator. Result will be a Series of strings. + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') + 0 a,A + 1 b,B + 2 NaN + 3 d,D + dtype: object + + Missing values will remain missing in the result, but can again be + represented using `na_rep` + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') + 0 a,A + 1 b,B + 2 -,C + 3 d,D + dtype: object + + If `sep` is not specified, the values are concatenated without + separation. + + >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') + 0 aA + 1 bB + 2 -C + 3 dD + dtype: object + + Series with different indexes can be aligned before concatenation. The + `join`-keyword works as in other methods. + + >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) + >>> s.str.cat(t, join=None, na_rep='-') + 0 ad + 1 ba + 2 -e + 3 dc + dtype: object + >>> + >>> s.str.cat(t, join='left', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='outer', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + 4 -e + dtype: object + >>> + >>> s.str.cat(t, join='inner', na_rep='-') + 0 aa + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='right', na_rep='-') + 3 dd + 0 aa + 4 -e + 2 -c + dtype: object + + For more examples, see :ref:`here `. 
+ """ + from pandas.core.index import Index + from pandas.core.series import Series + from pandas.core.reshape.concat import concat + + if isinstance(others, str): + raise ValueError("Did you mean to supply a `sep` keyword?") + + if isinstance(self._orig, Index): + data = Series(self._orig, index=self._orig) + else: # Series + data = self._orig + + # concatenate Series into itself if no "others" + if others is None: + result = str_cat(data, others=others, sep=sep, na_rep=na_rep) + return self._wrap_result(result, + use_codes=(not self._is_categorical)) + + try: + # turn anything in "others" into lists of Series + others, fuwa = self._str_cat_los(others, + ignore_index=(join is None)) + except ValueError: + if join is None: + # legacy warning + raise ValueError('All arrays must be same length') + else: + raise ValueError('If `others` contains arrays or lists (or ' + 'other list-likes without an index), these ' + 'must all be of the same length as the ' + 'calling Series/Index.') + + if join is None and fuwa: + warnings.warn("A future version of pandas will perform index " + "alignment when `others` is a Series/Index/" + "DataFrame (or a list-like containing one). To " + "disable alignment (the behavior before v.0.23) and " + "silence this warning, use `.values` on any Series/" + "Index/DataFrame in `others`. To enable alignment " + "and silence this warning, pass `join='left'|" + "'outer'|'inner'|'right'`. 
The future default will " + "be `join='left'`.", FutureWarning, stacklevel=2) + + # align if required + if join is not None: + # Need to add keys for uniqueness in case of duplicate columns + others = concat(others, axis=1, + join=(join if join == 'inner' else 'outer'), + keys=range(len(others))) + data, others = data.align(others, join=join) + others = [others[x] for x in others] # again list of Series + + # str_cat discards index + res = str_cat(data, others=others, sep=sep, na_rep=na_rep) + + dtype = 'category' if self._is_categorical else None + if isinstance(self._orig, Index): + res = Index(res, dtype=dtype) + else: # Series + res = Series(res, index=data.index, dtype=dtype) + return res @copy(str_split) def split(self, pat=None, n=-1, expand=False): diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index f7f1ea019a3f0..65b33f0682e6f 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -608,7 +608,6 @@ def test_str_accessor_api_for_categorical(self): # str functions, which need special arguments special_func_defs = [ - ('cat', (list("zyxw"),), {"sep": ","}), ('center', (10,), {}), ('contains', ("a",), {}), ('count', ("a",), {}), @@ -644,11 +643,12 @@ def test_str_accessor_api_for_categorical(self): ] _special_func_names = [f[0] for f in special_func_defs] + # * cat tested extensively with categorical data in test_strings.py # * get, join: they need a individual elements of type lists, but # we can't make a categorical with lists as individual categories. # -> `s.str.split(" ").astype("category")` will error! # * `translate` has different interfaces for py2 vs. 
py3 - _ignore_names = ["get", "join", "translate"] + _ignore_names = ["cat", "get", "join", "translate"] str_func_names = [f for f in dir(s.str) if not ( f.startswith("_") or diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ac8d269c75f52..4f46ee01564da 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -11,14 +11,21 @@ from pandas.compat import range, u import pandas.compat as compat -from pandas import Index, Series, DataFrame, isna, MultiIndex, notna +from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat -from pandas.util.testing import assert_series_equal +from pandas.util.testing import assert_series_equal, assert_index_equal import pandas.util.testing as tm import pandas.core.strings as strings +def assert_series_or_index_equal(left, right): + if isinstance(left, Series): + assert_series_equal(left, right) + else: + assert_index_equal(left, right) + + class TestStringMethods(object): def test_api(self): @@ -125,6 +132,263 @@ def test_cat(self): exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) tm.assert_almost_equal(result, exp) + @pytest.mark.parametrize('ser_or_ind', ['series', 'index']) + def test_str_cat(self, ser_or_ind): + # test_cat above tests "str_cat" from ndarray to ndarray; + # here testing "str.cat" from Series/Index to Series/Index/ndarray/list + s = Index(['a', 'a', 'b', 'b', 'c', np.nan]) + if ser_or_ind == 'series': + s = Series(s) + t = Index(['a', np.nan, 'b', 'd', 'foo', np.nan]) + + # single array + result = s.str.cat() + exp = 'aabbc' + assert result == exp + + result = s.str.cat(na_rep='-') + exp = 'aabbc-' + assert result == exp + + result = s.str.cat(sep='_', na_rep='NA') + exp = 'a_a_b_b_c_NA' + assert result == exp + + # Series/Index with Index + exp = Index(['aa', 'a-', 'bb', 'bd', 'cfoo', '--']) + if ser_or_ind == 'series': + exp = Series(exp) + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to 
switch to alignment by default + assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) + + # Series/Index with Series + t = Series(t) + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) + + # Series/Index with array (no warning necessary) + assert_series_or_index_equal(s.str.cat(t.values, na_rep='-'), exp) + + # Series/Index with list (no warning necessary) + assert_series_or_index_equal(s.str.cat(list(t), na_rep='-'), exp) + + # errors for incorrect lengths + rgx = 'All arrays must be same length' + z = Series(['1', '2', '3']) + + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat(z) + + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat(z.values) + + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat(list(z)) + + @pytest.mark.parametrize('ser_or_ind', ['series', 'index']) + def test_str_cat_raises_intuitive_error(self, ser_or_ind): + # https://github.com/pandas-dev/pandas/issues/11334 + s = Index(['a', 'b', 'c', 'd']) + if ser_or_ind == 'series': + s = Series(s) + message = "Did you mean to supply a `sep` keyword?" 
+ with tm.assert_raises_regex(ValueError, message): + s.str.cat('|') + with tm.assert_raises_regex(ValueError, message): + s.str.cat(' ') + + @pytest.mark.parametrize('ser_or_ind, dtype_caller, dtype_target', [ + ('series', 'object', 'object'), + ('series', 'object', 'category'), + ('series', 'category', 'object'), + ('series', 'category', 'category'), + ('index', 'object', 'object'), + ('index', 'object', 'category'), + ('index', 'category', 'object'), + ('index', 'category', 'category') + ]) + def test_str_cat_categorical(self, ser_or_ind, dtype_caller, dtype_target): + s = Index(['a', 'a', 'b', 'a'], dtype=dtype_caller) + if ser_or_ind == 'series': + s = Series(s) + t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target) + + exp = Index(['ab', 'aa', 'bb', 'ac'], dtype=dtype_caller) + if ser_or_ind == 'series': + exp = Series(exp) + # Series/Index with Index + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(t), exp) + + # Series/Index with Series + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(Series(t)), exp) + + @pytest.mark.parametrize('ser_or_ind', ['series', 'index']) + def test_str_cat_mixed_inputs(self, ser_or_ind): + s = Index(['a', 'b', 'c', 'd']) + if ser_or_ind == 'series': + s = Series(s) + t = Series(['A', 'B', 'C', 'D']) + d = concat([t, Series(s)], axis=1) + + exp = Index(['aAa', 'bBb', 'cCc', 'dDd']) + if ser_or_ind == 'series': + exp = Series(exp) + # Series/Index with DataFrame + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(d), exp) + + # Series/Index with two-dimensional ndarray (no warning necessary) + assert_series_or_index_equal(s.str.cat(d.values), exp) + + # Series/Index with list of Series + with 
tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat([t, s]), exp) + + # Series/Index with list of list-likes (no warning necessary) + assert_series_or_index_equal(s.str.cat([t.values, list(s)]), exp) + + # Series/Index with mixed list of Series/list-like + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat([t.values, s]), exp) + + # Series/Index with iterator of list-likes (no warning necessary) + assert_series_or_index_equal(s.str.cat(iter([t.values, list(s)])), exp) + + # errors for incorrect lengths + rgx = 'All arrays must be same length' + z = Series(['1', '2', '3']) + e = concat([z, z], axis=1) + + # DataFrame + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat(e) + + # two-dimensional ndarray + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat(e.values) + + # list of Series + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat([z, s]) + + # list of list-likes + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat([z.values, list(s)]) + + # mixed list of Series/list-like + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat([z, list(s)]) + + @pytest.mark.parametrize('ser_or_ind, join', [ + ('series', 'left'), ('series', 'outer'), + ('series', 'inner'), ('series', 'right'), + ('index', 'left'), ('index', 'outer'), + ('index', 'inner'), ('index', 'right') + ]) + def test_str_cat_align_indexed(self, ser_or_ind, join): + # https://github.com/pandas-dev/pandas/issues/18657 + s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd']) + t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b']) + sa, ta = s.align(t, join=join) + if ser_or_ind == 'index': + s = Index(s) + sa = Index(sa) + + with tm.assert_produces_warning(expected_warning=FutureWarning): + # result of mamnual alignmnent of inputs + exp = sa.str.cat(ta, 
na_rep='-') + + assert_series_or_index_equal(s.str.cat(t, join=join, na_rep='-'), exp) + + @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right']) + def test_str_cat_align_mixed_inputs(self, join): + s = Series(['a', 'b', 'c', 'd']) + t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) + d = concat([t, t], axis=1) + + exp_outer = Series(['aaa', 'bbb', 'c--', 'ddd', '-ee']) + sa, ta = s.align(t, join=join) + exp = exp_outer.loc[ta.index] + + # list of Series + tm.assert_series_equal(s.str.cat([t, t], join=join, na_rep='-'), exp) + + # DataFrame + tm.assert_series_equal(s.str.cat(d, join=join, na_rep='-'), exp) + + # mixed list of indexed/unindexed + u = ['A', 'B', 'C', 'D'] + exp_outer = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-']) + e = concat([t, s], axis=1, join=(join if join == 'inner' else 'outer')) + sa, ea = s.align(e, join=join) + exp = exp_outer.loc[ea.index] + tm.assert_series_equal(s.str.cat([t, u], join=join, na_rep='-'), exp) + + # errors for incorrect lengths + rgx = 'If `others` contains arrays or lists.*' + z = ['1', '2', '3'] + + # unindexed object of wrong length + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat(z, join=join) + + # unindexed object of wrong length in list + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat([t, z], join=join) + + def test_str_cat_special_cases(self): + s = Series(['a', 'b', 'c', 'd']) + t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) + d = concat([t, t], axis=1) + + # lists of elements with different types - unaligned + mix = [t, t.values, ['A', 'B', 'C', 'D'], d, d.values] + exp = Series(['addAdddd', 'baaBaaaa', 'ceeCeeee', 'dbbDbbbb']) + with tm.assert_produces_warning(expected_warning=FutureWarning): + tm.assert_series_equal(s.str.cat(mix, join=None), exp) + + # lists of elements with different types - aligned with na_rep + exp = Series(['aadAaadd', 'bbaBbbaa', 'c-eC--ee', 'ddbDddbb']) + tm.assert_series_equal(s.str.cat(mix, join='left', na_rep='-'), exp) + + # iterator of 
elements with different types + exp = Series(['aadAaadd', 'bbaBbbaa', 'c-eC--ee', + 'ddbDddbb', '-e--ee--']) + tm.assert_series_equal(s.str.cat(iter(mix), join='outer', na_rep='-'), + exp) + + # right-align with different indexes in other + exp = Series(['aa--', 'd-dd'], index=[0, 3]) + tm.assert_series_equal(s.str.cat([t.loc[[0]], d.loc[[3]]], + join='right', na_rep='-'), exp) + + def test_cat_on_filtered_index(self): + df = DataFrame(index=MultiIndex.from_product( + [[2011, 2012], [1, 2, 3]], names=['year', 'month'])) + + df = df.reset_index() + df = df[df.month > 1] + + str_year = df.year.astype('str') + str_month = df.month.astype('str') + str_both = str_year.str.cat(str_month, sep=' ', join='left') + + assert str_both.loc[1] == '2011 2' + + str_multiple = str_year.str.cat([str_month, str_month], + sep=' ', join='left') + + assert str_multiple.loc[1] == '2011 2 2' + def test_count(self): values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'], dtype=np.object_) @@ -1263,7 +1527,7 @@ def test_empty_str_methods(self): # GH7241 # (extract) on empty series - tm.assert_series_equal(empty_str, empty.str.cat(empty)) + tm.assert_series_equal(empty_str, empty.str.cat(empty, join='left')) assert '' == empty.str.cat() tm.assert_series_equal(empty_str, empty.str.title()) tm.assert_series_equal(empty_int, empty.str.count('a')) @@ -2772,32 +3036,6 @@ def test_normalize(self): result = s.str.normalize('NFKC') tm.assert_index_equal(result, expected) - def test_cat_on_filtered_index(self): - df = DataFrame(index=MultiIndex.from_product( - [[2011, 2012], [1, 2, 3]], names=['year', 'month'])) - - df = df.reset_index() - df = df[df.month > 1] - - str_year = df.year.astype('str') - str_month = df.month.astype('str') - str_both = str_year.str.cat(str_month, sep=' ') - - assert str_both.loc[1] == '2011 2' - - str_multiple = str_year.str.cat([str_month, str_month], sep=' ') - - assert str_multiple.loc[1] == '2011 2 2' - - def test_str_cat_raises_intuitive_error(self): - # 
https://github.com/pandas-dev/pandas/issues/11334 - s = Series(['a', 'b', 'c', 'd']) - message = "Did you mean to supply a `sep` keyword?" - with tm.assert_raises_regex(ValueError, message): - s.str.cat('|') - with tm.assert_raises_regex(ValueError, message): - s.str.cat(' ') - def test_index_str_accessor_visibility(self): from pandas.core.strings import StringMethods @@ -2857,9 +3095,9 @@ def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) if compat.PY3: - pytest.raises(TypeError, lhs.str.cat, rhs) + pytest.raises(TypeError, lhs.str.cat, rhs, join='left') else: - result = lhs.str.cat(rhs) + result = lhs.str.cat(rhs, join='left') expected = Series(np.array( ['ad', 'be', 'cf'], 'S2').astype(object)) tm.assert_series_equal(result, expected) From 84cde8b2107b8e2d1dc9f036b22952d585814e76 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 25 Apr 2018 00:00:52 +0200 Subject: [PATCH 02/10] Incorporate review feedback --- doc/source/text.rst | 17 +++++---- doc/source/whatsnew/v0.23.0.txt | 11 ------ pandas/core/strings.py | 65 ++++++++++++++++----------------- pandas/tests/test_strings.py | 37 ++++++++++--------- 4 files changed, 59 insertions(+), 71 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 728c41643577b..a23de69fe2d99 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -257,6 +257,7 @@ The parameter ``others`` can also be two-dimensional. In this case, the number o .. ipython:: python d = pd.concat([t, s], axis=1) + s d s.str.cat(d, na_rep='-') @@ -271,6 +272,8 @@ the ``join``-keyword, which controls the manner of alignment. .. ipython:: python u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2]) + s + u s.str.cat(u) s.str.cat(u, join='left') @@ -285,6 +288,8 @@ In particular, alignment also means that the different lengths do not need to co .. 
ipython:: python v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4]) + s + v s.str.cat(v, join='left', na_rep='-') s.str.cat(v, join='outer', na_rep='-') @@ -293,13 +298,14 @@ The same alignment can be used when ``others`` is a ``DataFrame``: .. ipython:: python f = d.loc[[3, 2, 1, 0], :] + s f s.str.cat(f, join='left', na_rep='-') Concatenating a Series and many objects into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -All list-likes (as well as ``DataFrame`` and two-dimensional ``ndarray``) can be arbitrarily combined in a list-like container: +All list-likes (including iterators, ``dict``-views, etc.) can be arbitrarily combined in a list-like container: .. ipython:: python @@ -317,15 +323,10 @@ the union of these indexes will be used as the basis for the final concatenation .. ipython:: python + u.loc[[3]] + v.loc[[-1, 0]] s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join='right', na_rep='-') -Finally, the surrounding container can also be an :obj:`Iterable` other than a ``list`` (e.g. an iterator, or a ``dict``-view, etc.): - -.. ipython:: python - - from collections import OrderedDict - s.str.cat(d.to_dict('series', into=OrderedDict).values(), na_rep='-') - Indexing with ``.str`` ---------------------- diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f023cd54cf9a1..29085a56dbf69 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -326,17 +326,6 @@ to ``'left'`` in a future version of pandas. In particular, ``others`` does not need to be of the same length as the calling ``Series`` (if both have an index and ``join is not None``). For more examples, see :ref:`here `. - -Additionally, ``str.cat`` now allows ``others`` to be a ``DataFrame`` or two-dimensional ``np.ndarray``. - -.. 
ipython:: python - - u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2]) - d = pd.concat([s, u], axis=1) - t.str.cat(d.values) - s.str.cat(d, join='left', na_rep='-') - -Furthermore, any combination of "concatenateable" arguments can be passed in a list-like container (e.g. an iterator). For categorical data, it is now possible to call :meth:`Series.str.cat` for ``CategoricalIndex`` as well (previously raised a ``ValueError``). Finally, if ``others is not None``, the resulting ``Series``/``Index`` will now remain categorical if the calling diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 0b575f0cb0838..2b4bb4cb60656 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1800,6 +1800,7 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): self._validate(data) self._is_categorical = is_categorical_dtype(data) + # .values.categories works for both Series/Index self._data = data.values.categories if self._is_categorical else data # save orig to blow up categoricals to the right type @@ -1934,7 +1935,7 @@ def cons_row(x): cons = self._orig._constructor return cons(result, name=name, index=index) - def _str_cat_los(self, input, ignore_index=False): + def _get_series_list(self, others, ignore_index=False): """ Auxiliary function for :meth:`str.cat`. 
Turn potentially mixed input into list of Series (elements without an index must match the length of @@ -1943,7 +1944,7 @@ def _str_cat_los(self, input, ignore_index=False): Parameters ---------- input : Series, DataFrame, np.ndarrary, list-like or list-like of those - ignore_index : Boolean + ignore_index : boolean, default False Determines whether to forcefully align with index of the caller Returns @@ -1955,46 +1956,44 @@ def _str_cat_los(self, input, ignore_index=False): # once str.cat defaults to alignment, this function can be simplified; # will not need `ignore_index` and the second boolean output anymore - from pandas.core.index import Index - from pandas.core.series import Series - from pandas.core.frame import DataFrame + from pandas import Index, Series, DataFrame # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, Index) else self._orig.index - if isinstance(input, Series): - los = [Series(input.values, index=idx) if ignore_index else input] + if isinstance(others, Series): + los = [Series(others.values, index=idx) + if ignore_index else others] return (los, True) - elif isinstance(input, Index): - los = [Series(input.values, - index=(idx if ignore_index else input))] + elif isinstance(others, Index): + los = [Series(others.values, + index=(idx if ignore_index else others))] return (los, True) - elif isinstance(input, DataFrame): + elif isinstance(others, DataFrame): if ignore_index: # without copy, this could change (the corresponding list # element of) "others" that was passed to str.cat - input = input.copy() - input.index = idx - return ([input[x] for x in input], True) - elif isinstance(input, np.ndarray) and input.ndim == 2: - input = DataFrame(input, index=idx) - return ([input[x] for x in input], False) - elif is_list_like(input): - input = list(input) # ensure iterators do not get read twice, etc. 
- if all(is_list_like(x) for x in input): + others = others.copy() + others.index = idx + return ([others[x] for x in others], True) + elif isinstance(others, np.ndarray) and others.ndim == 2: + others = DataFrame(others, index=idx) + return ([others[x] for x in others], False) + elif is_list_like(others): + others = list(others) # ensure iterators do not get read twice etc + if all(is_list_like(x) for x in others): los = [] fuwa = False - while input: - tmp = self._str_cat_los(input.pop(0), - ignore_index=ignore_index) + while others: + tmp = self._get_series_list(others.pop(0), + ignore_index=ignore_index) los = los + tmp[0] fuwa = fuwa or tmp[1] return (los, fuwa) else: - return ([Series(input, index=idx)], False) - else: - raise ValueError('input must be Series, Index, DataFrame, ' - 'np.ndarrary or list-like') + return ([Series(others, index=idx)], False) + raise ValueError('others must be Series, Index, DataFrame, ' + 'np.ndarrary or list-like') def cat(self, others=None, sep=None, na_rep=None, join=None): """ @@ -2136,11 +2135,9 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): For more examples, see :ref:`here `. 
""" - from pandas.core.index import Index - from pandas.core.series import Series - from pandas.core.reshape.concat import concat + from pandas import Index, Series, concat - if isinstance(others, str): + if isinstance(others, compat.string_types): raise ValueError("Did you mean to supply a `sep` keyword?") if isinstance(self._orig, Index): @@ -2156,8 +2153,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): try: # turn anything in "others" into lists of Series - others, fuwa = self._str_cat_los(others, - ignore_index=(join is None)) + tmp = self._get_series_list(others, ignore_index=(join is None)) + others, fut_warn = tmp except ValueError: if join is None: # legacy warning @@ -2168,7 +2165,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 'must all be of the same length as the ' 'calling Series/Index.') - if join is None and fuwa: + if join is None and fut_warn: warnings.warn("A future version of pandas will perform index " "alignment when `others` is a Series/Index/" "DataFrame (or a list-like containing one). 
To " diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 4f46ee01564da..e8fae1a47bafd 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -132,12 +132,12 @@ def test_cat(self): exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) tm.assert_almost_equal(result, exp) - @pytest.mark.parametrize('ser_or_ind', ['series', 'index']) - def test_str_cat(self, ser_or_ind): + @pytest.mark.parametrize('series_or_index', ['series', 'index']) + def test_str_cat(self, series_or_index): # test_cat above tests "str_cat" from ndarray to ndarray; # here testing "str.cat" from Series/Index to Series/Index/ndarray/list s = Index(['a', 'a', 'b', 'b', 'c', np.nan]) - if ser_or_ind == 'series': + if series_or_index == 'series': s = Series(s) t = Index(['a', np.nan, 'b', 'd', 'foo', np.nan]) @@ -156,7 +156,7 @@ def test_str_cat(self, ser_or_ind): # Series/Index with Index exp = Index(['aa', 'a-', 'bb', 'bd', 'cfoo', '--']) - if ser_or_ind == 'series': + if series_or_index == 'series': exp = Series(exp) with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default @@ -187,11 +187,11 @@ def test_str_cat(self, ser_or_ind): with tm.assert_raises_regex(ValueError, rgx): s.str.cat(list(z)) - @pytest.mark.parametrize('ser_or_ind', ['series', 'index']) - def test_str_cat_raises_intuitive_error(self, ser_or_ind): + @pytest.mark.parametrize('series_or_index', ['series', 'index']) + def test_str_cat_raises_intuitive_error(self, series_or_index): # https://github.com/pandas-dev/pandas/issues/11334 s = Index(['a', 'b', 'c', 'd']) - if ser_or_ind == 'series': + if series_or_index == 'series': s = Series(s) message = "Did you mean to supply a `sep` keyword?" 
with tm.assert_raises_regex(ValueError, message): @@ -199,7 +199,7 @@ def test_str_cat_raises_intuitive_error(self, ser_or_ind): with tm.assert_raises_regex(ValueError, message): s.str.cat(' ') - @pytest.mark.parametrize('ser_or_ind, dtype_caller, dtype_target', [ + @pytest.mark.parametrize('series_or_index, dtype_caller, dtype_target', [ ('series', 'object', 'object'), ('series', 'object', 'category'), ('series', 'category', 'object'), @@ -209,14 +209,15 @@ def test_str_cat_raises_intuitive_error(self, ser_or_ind): ('index', 'category', 'object'), ('index', 'category', 'category') ]) - def test_str_cat_categorical(self, ser_or_ind, dtype_caller, dtype_target): + def test_str_cat_categorical(self, series_or_index, + dtype_caller, dtype_target): s = Index(['a', 'a', 'b', 'a'], dtype=dtype_caller) - if ser_or_ind == 'series': + if series_or_index == 'series': s = Series(s) t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target) exp = Index(['ab', 'aa', 'bb', 'ac'], dtype=dtype_caller) - if ser_or_ind == 'series': + if series_or_index == 'series': exp = Series(exp) # Series/Index with Index with tm.assert_produces_warning(expected_warning=FutureWarning): @@ -228,16 +229,16 @@ def test_str_cat_categorical(self, ser_or_ind, dtype_caller, dtype_target): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(Series(t)), exp) - @pytest.mark.parametrize('ser_or_ind', ['series', 'index']) - def test_str_cat_mixed_inputs(self, ser_or_ind): + @pytest.mark.parametrize('series_or_index', ['series', 'index']) + def test_str_cat_mixed_inputs(self, series_or_index): s = Index(['a', 'b', 'c', 'd']) - if ser_or_ind == 'series': + if series_or_index == 'series': s = Series(s) t = Series(['A', 'B', 'C', 'D']) d = concat([t, Series(s)], axis=1) exp = Index(['aAa', 'bBb', 'cCc', 'dDd']) - if ser_or_ind == 'series': + if series_or_index == 'series': exp = Series(exp) # Series/Index with DataFrame with 
tm.assert_produces_warning(expected_warning=FutureWarning): @@ -288,18 +289,18 @@ def test_str_cat_mixed_inputs(self, ser_or_ind): with tm.assert_raises_regex(ValueError, rgx): s.str.cat([z, list(s)]) - @pytest.mark.parametrize('ser_or_ind, join', [ + @pytest.mark.parametrize('series_or_index, join', [ ('series', 'left'), ('series', 'outer'), ('series', 'inner'), ('series', 'right'), ('index', 'left'), ('index', 'outer'), ('index', 'inner'), ('index', 'right') ]) - def test_str_cat_align_indexed(self, ser_or_ind, join): + def test_str_cat_align_indexed(self, series_or_index, join): # https://github.com/pandas-dev/pandas/issues/18657 s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd']) t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b']) sa, ta = s.align(t, join=join) - if ser_or_ind == 'index': + if series_or_index == 'index': s = Index(s) sa = Index(sa) From d7587ba3bdf9271dc268e8ca86b03d1ec0103dab Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 26 Apr 2018 23:59:21 +0200 Subject: [PATCH 03/10] Emit FutureWarning only for different indexes --- doc/source/text.rst | 4 +- pandas/core/strings.py | 21 ++++---- pandas/tests/test_strings.py | 92 ++++++++++++++++++++++++++++++------ 3 files changed, 91 insertions(+), 26 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index a23de69fe2d99..7c2c733e7667c 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -280,7 +280,7 @@ the ``join``-keyword, which controls the manner of alignment. .. warning:: If the ``join`` keyword is not passed, the method :meth:`~Series.str.cat` will currently fall back to the behavior before version 0.23.0 (i.e. no alignment), - but a ``FutureWarning`` will be raised, since this default will change to ``join='left'`` in a future version. + but a ``FutureWarning`` will be raised if any of the involved indexes differ, since this default will change to ``join='left'`` in a future version. 
To usual options are available for ``join`` (one of ``'left', 'outer', 'inner', 'right'``). In particular, alignment also means that the different lengths do not need to coincide anymore. @@ -305,7 +305,7 @@ The same alignment can be used when ``others`` is a ``DataFrame``: Concatenating a Series and many objects into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -All list-likes (including iterators, ``dict``-views, etc.) can be arbitrarily combined in a list-like container: +All list-likes (including iterators, ``dict``-views, etc.) can be arbitrarily combined in a list-like container: .. ipython:: python diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2b4bb4cb60656..77e70c9a2eb3b 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1962,20 +1962,23 @@ def _get_series_list(self, others, ignore_index=False): idx = self._orig if isinstance(self._orig, Index) else self._orig.index if isinstance(others, Series): + fut_warn = not others.index.equals(idx) los = [Series(others.values, index=idx) - if ignore_index else others] - return (los, True) + if ignore_index and fut_warn else others] + return (los, fut_warn) elif isinstance(others, Index): + fut_warn = not others.equals(idx) los = [Series(others.values, index=(idx if ignore_index else others))] - return (los, True) + return (los, fut_warn) elif isinstance(others, DataFrame): - if ignore_index: + fut_warn = not others.index.equals(idx) + if ignore_index and fut_warn: # without copy, this could change (the corresponding list # element of) "others" that was passed to str.cat others = others.copy() others.index = idx - return ([others[x] for x in others], True) + return ([others[x] for x in others], fut_warn) elif isinstance(others, np.ndarray) and others.ndim == 2: others = DataFrame(others, index=idx) return ([others[x] for x in others], False) @@ -1983,13 +1986,13 @@ def _get_series_list(self, others, ignore_index=False): others = list(others) # ensure iterators do 
not get read twice etc if all(is_list_like(x) for x in others): los = [] - fuwa = False + fut_warn = False while others: tmp = self._get_series_list(others.pop(0), ignore_index=ignore_index) los = los + tmp[0] - fuwa = fuwa or tmp[1] - return (los, fuwa) + fut_warn = fut_warn or tmp[1] + return (los, fut_warn) else: return ([Series(others, index=idx)], False) raise ValueError('others must be Series, Index, DataFrame, ' @@ -2145,7 +2148,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): else: # Series data = self._orig - # concatenate Series into itself if no "others" + # concatenate Series/Index with itself if no "others" if others is None: result = str_cat(data, others=others, sep=sep, na_rep=na_rep) return self._wrap_result(result, diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index e8fae1a47bafd..1c34d2b458744 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -19,10 +19,10 @@ import pandas.core.strings as strings -def assert_series_or_index_equal(left, right): +def assert_series_or_index_equal(left, right, expect_warn=False): if isinstance(left, Series): assert_series_equal(left, right) - else: + else: # Index assert_index_equal(left, right) @@ -158,20 +158,32 @@ def test_str_cat(self, series_or_index): exp = Index(['aa', 'a-', 'bb', 'bd', 'cfoo', '--']) if series_or_index == 'series': exp = Series(exp) + # s.index / s is different from t (as Index) -> warning with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) # Series/Index with Series t = Series(t) + # s as Series has same index as t -> no warning + # s as Index is different from t.index -> warning + if series_or_index == 'series': + assert_series_equal(s.str.cat(t, na_rep='-'), exp) + else: + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + 
assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) + + # Series/Index with Series: warning if different indexes + t.index = t.index + 1 with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) - # Series/Index with array (no warning necessary) + # Series/Index with array assert_series_or_index_equal(s.str.cat(t.values, na_rep='-'), exp) - # Series/Index with list (no warning necessary) + # Series/Index with list assert_series_or_index_equal(s.str.cat(list(t), na_rep='-'), exp) # errors for incorrect lengths @@ -219,15 +231,29 @@ def test_str_cat_categorical(self, series_or_index, exp = Index(['ab', 'aa', 'bb', 'ac'], dtype=dtype_caller) if series_or_index == 'series': exp = Series(exp) + # Series/Index with Index + # s.index / s is different from t (as Index) -> warning with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(t), exp) # Series/Index with Series + t = Series(t) + # s as Series has same index as t -> no warning + # s as Index is different from t.index -> warning + if series_or_index == 'series': + assert_series_equal(s.str.cat(t), exp) + else: + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(t), exp) + + # Series/Index with Series: warning if different indexes + t.index = t.index + 1 with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default - assert_series_or_index_equal(s.str.cat(Series(t)), exp) + assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) @pytest.mark.parametrize('series_or_index', ['series', 'index']) def test_str_cat_mixed_inputs(self, series_or_index): @@ -240,28 +266,62 @@ def test_str_cat_mixed_inputs(self, series_or_index): exp = 
Index(['aAa', 'bBb', 'cCc', 'dDd']) if series_or_index == 'series': exp = Series(exp) + # Series/Index with DataFrame + # s as Series has same index as d -> no warning + # s as Index is different from d.index -> warning + if series_or_index == 'series': + assert_series_equal(s.str.cat(d), exp) + else: + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(d), exp) + + # Series/Index with DataFrame: warning if different indexes + d.index = d.index + 1 with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default assert_series_or_index_equal(s.str.cat(d), exp) - # Series/Index with two-dimensional ndarray (no warning necessary) + # Series/Index with two-dimensional ndarray assert_series_or_index_equal(s.str.cat(d.values), exp) # Series/Index with list of Series + # s as Series has same index as t, s -> no warning + # s as Index is different from t.index -> warning + if series_or_index == 'series': + assert_series_equal(s.str.cat([t, s]), exp) + else: + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat([t, s]), exp) + + # Series/Index with list of Series: warning if different indexes + tt = t.copy() + tt.index = tt.index + 1 with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default - assert_series_or_index_equal(s.str.cat([t, s]), exp) + assert_series_or_index_equal(s.str.cat([tt, s]), exp) - # Series/Index with list of list-likes (no warning necessary) + # Series/Index with list of list-likes assert_series_or_index_equal(s.str.cat([t.values, list(s)]), exp) # Series/Index with mixed list of Series/list-like + # s as Series has same index as t -> no warning + # s as Index is different from t.index -> warning + if series_or_index == 'series': + 
assert_series_equal(s.str.cat([t, s.values]), exp) + else: + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat([t, s.values]), exp) + + # Series/Index with mixed list: warning if different indexes with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default - assert_series_or_index_equal(s.str.cat([t.values, s]), exp) + assert_series_or_index_equal(s.str.cat([tt, s.values]), exp) - # Series/Index with iterator of list-likes (no warning necessary) + # Series/Index with iterator of list-likes assert_series_or_index_equal(s.str.cat(iter([t.values, list(s)])), exp) # errors for incorrect lengths @@ -300,13 +360,13 @@ def test_str_cat_align_indexed(self, series_or_index, join): s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd']) t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b']) sa, ta = s.align(t, join=join) + # result after manual alignment of inputs + exp = sa.str.cat(ta, na_rep='-') + if series_or_index == 'index': s = Index(s) sa = Index(sa) - - with tm.assert_produces_warning(expected_warning=FutureWarning): - # result of mamnual alignmnent of inputs - exp = sa.str.cat(ta, na_rep='-') + exp = Index(exp) assert_series_or_index_equal(s.str.cat(t, join=join, na_rep='-'), exp) @@ -329,6 +389,7 @@ def test_str_cat_align_mixed_inputs(self, join): # mixed list of indexed/unindexed u = ['A', 'B', 'C', 'D'] exp_outer = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-']) + # u will be forced have index of s -> use s here as placeholder e = concat([t, s], axis=1, join=(join if join == 'inner' else 'outer')) sa, ea = s.align(e, join=join) exp = exp_outer.loc[ea.index] @@ -355,6 +416,7 @@ def test_str_cat_special_cases(self): mix = [t, t.values, ['A', 'B', 'C', 'D'], d, d.values] exp = Series(['addAdddd', 'baaBaaaa', 'ceeCeeee', 'dbbDbbbb']) with tm.assert_produces_warning(expected_warning=FutureWarning): + 
# FutureWarning to switch to alignment by default tm.assert_series_equal(s.str.cat(mix, join=None), exp) # lists of elements with different types - aligned with na_rep @@ -367,7 +429,7 @@ def test_str_cat_special_cases(self): tm.assert_series_equal(s.str.cat(iter(mix), join='outer', na_rep='-'), exp) - # right-align with different indexes in other + # right-align with different indexes in others exp = Series(['aa--', 'd-dd'], index=[0, 3]) tm.assert_series_equal(s.str.cat([t.loc[[0]], d.loc[[3]]], join='right', na_rep='-'), exp) From 7c9735fc9075c66d953efbd61c05d8625144d59e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 28 Apr 2018 01:07:53 +0200 Subject: [PATCH 04/10] Restrict legal argument combinations; no nesting --- doc/source/text.rst | 12 +++++---- doc/source/whatsnew/v0.23.0.txt | 14 +++++----- pandas/core/strings.py | 48 ++++++++++++++++++++++++--------- pandas/tests/test_strings.py | 46 ++++++++++++++++++------------- 4 files changed, 76 insertions(+), 44 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 7c2c733e7667c..ea5aed69987c9 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -305,18 +305,20 @@ The same alignment can be used when ``others`` is a ``DataFrame``: Concatenating a Series and many objects into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -All list-likes (including iterators, ``dict``-views, etc.) can be arbitrarily combined in a list-like container: +All one-dimensional list-likes can be arbitrarily combined in a list-like container (including iterators, ``dict``-views, etc.): .. 
ipython:: python - s.str.cat([u, t.values, ['A', 'B', 'C', 'D'], d.values, f], na_rep='-') + s + u + s.str.cat([u, pd.Index(u.values), ['A', 'B', 'C', 'D']], na_rep='-') -All elements must match in length to the calling ``Series``, except those having an index if ``join`` is not None: +All elements must match in length to the calling ``Series`` (or ``Index``), except those having an index if ``join`` is not None: .. ipython:: python - s.str.cat([u, v, ['A', 'B', 'C', 'D'], d.values, f.loc[[1]]], - join='outer', na_rep='-') + v + s.str.cat([u, v, ['A', 'B', 'C', 'D']], join='outer', na_rep='-') If using ``join='right'`` on a list of ``others`` that contains different indexes, the union of these indexes will be used as the basis for the final concatenation: diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 29085a56dbf69..f274123285ee7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -314,8 +314,9 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously, :meth:`Series.str.cat` did not -- in contrast to most of ``pandas`` -- align :class:`Series` on their index before concatenation (see :issue:`18657`). -The method has now gained a keyword ``join`` to control the manner of alignment. In v.0.23 it will default to None (meaning no alignment), but this default will change -to ``'left'`` in a future version of pandas. +The method has now gained a keyword ``join`` to control the manner of alignment, see examples below and in :ref:`here `. + +In v.0.23 ``join`` will default to None (meaning no alignment), but this default will change to ``'left'`` in a future version of pandas. .. ipython:: python @@ -324,12 +325,9 @@ to ``'left'`` in a future version of pandas.
s.str.cat(t) s.str.cat(t, join='left', na_rep='-') -In particular, ``others`` does not need to be of the same length as the calling ``Series`` (if both have an index and ``join is not None``). -For more examples, see :ref:`here `. - -For categorical data, it is now possible to call :meth:`Series.str.cat` for ``CategoricalIndex`` as well (previously raised a ``ValueError``). -Finally, if ``others is not None``, the resulting ``Series``/``Index`` will now remain categorical if the calling -``Series``/``Index`` is categorical. +Furthermore: +- :meth:`Series.str.cat` now works for ``CategoricalIndex`` as well (previously raised a ``ValueError``; see :issue:`20842`) +- If concatenating with something (i.e. ``others is not None``) the resulting ``Series``/``Index`` will now remain categorical if the calling ``Series``/``Index`` is categorical (see :issue:`20843`) .. _whatsnew_0230.enhancements.astype_category: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 77e70c9a2eb3b..91ef772f48597 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1938,19 +1938,20 @@ def cons_row(x): def _get_series_list(self, others, ignore_index=False): """ Auxiliary function for :meth:`str.cat`. Turn potentially mixed input - into list of Series (elements without an index must match the length of - the calling Series/Index). + into a list of Series (elements without an index must match the length + of the calling Series/Index).
Parameters ---------- - input : Series, DataFrame, np.ndarrary, list-like or list-like of those + input : Series, DataFrame, np.ndarray, list-like or list-like of + objects that are either Series, np.ndarray (1-dim) or list-like ignore_index : boolean, default False Determines whether to forcefully align with index of the caller Returns ------- - tuple : first element: input transformed into list of Series - second element: Boolean whether FutureWarning should be raised + tuple : (input transformed into list of Series, + Boolean whether FutureWarning should be raised) """ # once str.cat defaults to alignment, this function can be simplified; @@ -1961,6 +1962,10 @@ def _get_series_list(self, others, ignore_index=False): # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, Index) else self._orig.index + err_msg = ('others must be Series, Index, DataFrame, np.ndarrary or ' + 'list-like (either containing only strings or containing ' + 'only objects of type Series/Index/list-like/np.ndarray') + if isinstance(others, Series): fut_warn = not others.index.equals(idx) los = [Series(others.values, index=idx) @@ -1988,15 +1993,32 @@ def _get_series_list(self, others, ignore_index=False): los = [] fut_warn = False while others: - tmp = self._get_series_list(others.pop(0), - ignore_index=ignore_index) + nxt = others.pop(0) + # safety for iterators etc.; exclude indexed objects + if (is_list_like(nxt) and + not isinstance(nxt, (DataFrame, Series, Index))): + nxt = list(nxt) + + # nested list-likes are forbidden - content must be strings + is_legal = (is_list_like(nxt) and + all(isinstance(x, compat.string_types) + for x in nxt)) + # DataFrame is false positive of is_legal + # because "x in df" returns column names + if isinstance(nxt, DataFrame) or not is_legal: + raise TypeError(err_msg) + + tmp = self._get_series_list(nxt, ignore_index=ignore_index) los = los + tmp[0] fut_warn = fut_warn or tmp[1] return (los, fut_warn) + # test if there is a mix 
of list-like and string/NaN/None + elif (any(is_list_like(x) for x in others) + and any(not is_list_like(x) for x in others)): + raise TypeError(err_msg) else: return ([Series(others, index=idx)], False) - raise ValueError('others must be Series, Index, DataFrame, ' - 'np.ndarrary or list-like') + raise TypeError(err_msg) def cat(self, others=None, sep=None, na_rep=None, join=None): """ @@ -2015,9 +2037,9 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): calling Series/Index, with the exception of indexed objects (i.e. Series/Index/DataFrame) if `join` is not None. - If others is a list-like that contains an arbitrary combination of - the above, then all elements will be unpacked and must satisfy the - above criteria individually. + If others is a list-like that contains a combination of Series, + np.ndarray (1-dim) or list-like, then all elements will be unpacked + and must satisfy the above criteria individually. If others is None, the method returns the concatenation of all strings in the calling Series/Index. 
@@ -2158,7 +2180,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # turn anything in "others" into lists of Series tmp = self._get_series_list(others, ignore_index=(join is None)) others, fut_warn = tmp - except ValueError: + except ValueError: # let TypeError raised by _get_series_list pass if join is None: # legacy warning raise ValueError('All arrays must be same length') diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1c34d2b458744..d7268372eedc6 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -349,6 +349,29 @@ def test_str_cat_mixed_inputs(self, series_or_index): with tm.assert_raises_regex(ValueError, rgx): s.str.cat([z, list(s)]) + # errors for incorrect arguments in list-like + rgx = 'others must be Series, Index, DataFrame,.*' + + # mix of string and Series + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat([s, 's']) + + # DataFrame in list + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat([s, d]) + + # 2-dim ndarray in list + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat([s, d.values]) + + # nested lists + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat([s, [s, d]]) + + # forbidden input type, e.g. 
int + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat(1) + @pytest.mark.parametrize('series_or_index, join', [ ('series', 'left'), ('series', 'outer'), ('series', 'inner'), ('series', 'right'), @@ -410,28 +433,15 @@ def test_str_cat_align_mixed_inputs(self, join): def test_str_cat_special_cases(self): s = Series(['a', 'b', 'c', 'd']) t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) - d = concat([t, t], axis=1) - - # lists of elements with different types - unaligned - mix = [t, t.values, ['A', 'B', 'C', 'D'], d, d.values] - exp = Series(['addAdddd', 'baaBaaaa', 'ceeCeeee', 'dbbDbbbb']) - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - tm.assert_series_equal(s.str.cat(mix, join=None), exp) - - # lists of elements with different types - aligned with na_rep - exp = Series(['aadAaadd', 'bbaBbbaa', 'c-eC--ee', 'ddbDddbb']) - tm.assert_series_equal(s.str.cat(mix, join='left', na_rep='-'), exp) # iterator of elements with different types - exp = Series(['aadAaadd', 'bbaBbbaa', 'c-eC--ee', - 'ddbDddbb', '-e--ee--']) - tm.assert_series_equal(s.str.cat(iter(mix), join='outer', na_rep='-'), - exp) + exp = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-']) + tm.assert_series_equal(s.str.cat(iter([t, ['A', 'B', 'C', 'D']]), + join='outer', na_rep='-'), exp) # right-align with different indexes in others - exp = Series(['aa--', 'd-dd'], index=[0, 3]) - tm.assert_series_equal(s.str.cat([t.loc[[0]], d.loc[[3]]], + exp = Series(['aa-', 'd-d'], index=[0, 3]) + tm.assert_series_equal(s.str.cat([t.loc[[0]], t.loc[[3]]], join='right', na_rep='-'), exp) def test_cat_on_filtered_index(self): From 1aedf724171171fbf3709fcd6dfdd1f826b609e0 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 30 Apr 2018 08:46:38 +0200 Subject: [PATCH 05/10] Fix edge case for NaN/None; adapted tests; fixes --- pandas/core/strings.py | 24 ++++++++++++++---------- pandas/tests/test_strings.py | 10 ++++++---- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 91ef772f48597..cbfc744247a99 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1957,7 +1957,7 @@ def _get_series_list(self, others, ignore_index=False): # once str.cat defaults to alignment, this function can be simplified; # will not need `ignore_index` and the second boolean output anymore - from pandas import Index, Series, DataFrame + from pandas import Index, Series, DataFrame, isnull # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, Index) else self._orig.index @@ -1994,15 +1994,19 @@ def _get_series_list(self, others, ignore_index=False): fut_warn = False while others: nxt = others.pop(0) - # safety for iterators etc.; exclude indexed objects - if (is_list_like(nxt) and - not isinstance(nxt, (DataFrame, Series, Index))): + # safety for iterators etc.; nxt is list-like as per above + # do not map indexed objects, which would lose their index + if not isinstance(nxt, (DataFrame, Series, Index)): nxt = list(nxt) - # nested list-likes are forbidden - content must be strings - is_legal = (is_list_like(nxt) and - all(isinstance(x, compat.string_types) - for x in nxt)) + # Nested list-likes are forbidden - content of nxt must be + # strings/NaN/None. Need to robustify check against + # x in nxt being list-like (otherwise ambiguous boolean). 
+ is_legal = all((isinstance(x, compat.string_types) + or (is_list_like(x) and any(isnull(x))) + or (not is_list_like(x) and isnull(x)) + or x is None) + for x in nxt) # DataFrame is false positive of is_legal # because "x in df" returns column names if isinstance(nxt, DataFrame) or not is_legal: @@ -2012,11 +2016,11 @@ def _get_series_list(self, others, ignore_index=False): los = los + tmp[0] fut_warn = fut_warn or tmp[1] return (los, fut_warn) - # test if there is a mix of list-like and string/NaN/None + # test if there is a mix of list-like and non-list-like (e.g. str) elif (any(is_list_like(x) for x in others) and any(not is_list_like(x) for x in others)): raise TypeError(err_msg) - else: + else: # all elements in others are _not_ list-like return ([Series(others, index=idx)], False) raise TypeError(err_msg) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d7268372eedc6..91424ccc3b0c8 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -351,22 +351,24 @@ def test_str_cat_mixed_inputs(self, series_or_index): # errors for incorrect arguments in list-like rgx = 'others must be Series, Index, DataFrame,.*' + # make sure None/Nan also work as string-replacements + u = Series(['a', np.nan, 'c', None]) # mix of string and Series with tm.assert_raises_regex(TypeError, rgx): - s.str.cat([s, 's']) + s.str.cat([u, 'u']) # DataFrame in list with tm.assert_raises_regex(TypeError, rgx): - s.str.cat([s, d]) + s.str.cat([u, d]) # 2-dim ndarray in list with tm.assert_raises_regex(TypeError, rgx): - s.str.cat([s, d.values]) + s.str.cat([u, d.values]) # nested lists with tm.assert_raises_regex(TypeError, rgx): - s.str.cat([s, [s, d]]) + s.str.cat([u, [u, d]]) # forbidden input type, e.g. int with tm.assert_raises_regex(TypeError, rgx): From 2143f190bf5c0a6cc1c5800bc52fc96bcbead156 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 30 Apr 2018 10:02:59 +0200 Subject: [PATCH 06/10] Revert cat-output-for-cat-caller proposal --- doc/source/whatsnew/v0.23.0.txt | 4 +--- pandas/core/strings.py | 5 ++--- pandas/tests/series/test_api.py | 4 ++-- pandas/tests/test_strings.py | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f274123285ee7..d3746d9e0b61e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -325,9 +325,7 @@ In v.0.23 `join` will default to None (meaning no alignment), but this default w s.str.cat(t) s.str.cat(t, join='left', na_rep='-') -Furthermore: -- meth:`Series.str.cat` now works as well for ``CategoricalIndex`` as well (previously raised a ``ValueError``; see :issue:`20842`) -- If concatenating with something (i.e. `others is not None`) the resulting ``Series``/``Index`` will now remain categorical if the calling ``Series``/``Index`` is categorical (see :issue:`20843`) +Furthermore, :meth:`Series.str.cat` now works for ``CategoricalIndex`` as well (previously raised a ``ValueError``; see :issue:`20842`). .. 
_whatsnew_0230.enhancements.astype_category: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index cbfc744247a99..ac247f8a2ff00 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2217,11 +2217,10 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # str_cat discards index res = str_cat(data, others=others, sep=sep, na_rep=na_rep) - dtype = 'category' if self._is_categorical else None if isinstance(self._orig, Index): - res = Index(res, dtype=dtype) + res = Index(res) else: # Series - res = Series(res, index=data.index, dtype=dtype) + res = Series(res, index=data.index) return res @copy(str_split) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 65b33f0682e6f..f7f1ea019a3f0 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -608,6 +608,7 @@ def test_str_accessor_api_for_categorical(self): # str functions, which need special arguments special_func_defs = [ + ('cat', (list("zyxw"),), {"sep": ","}), ('center', (10,), {}), ('contains', ("a",), {}), ('count', ("a",), {}), @@ -643,12 +644,11 @@ def test_str_accessor_api_for_categorical(self): ] _special_func_names = [f[0] for f in special_func_defs] - # * cat tested extensively with categorical data in test_strings.py # * get, join: they need a individual elements of type lists, but # we can't make a categorical with lists as individual categories. # -> `s.str.split(" ").astype("category")` will error! # * `translate` has different interfaces for py2 vs. 
py3 - _ignore_names = ["cat", "get", "join", "translate"] + _ignore_names = ["get", "join", "translate"] str_func_names = [f for f in dir(s.str) if not ( f.startswith("_") or diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 91424ccc3b0c8..aaf3c99b97a73 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -228,7 +228,7 @@ def test_str_cat_categorical(self, series_or_index, s = Series(s) t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target) - exp = Index(['ab', 'aa', 'bb', 'ac'], dtype=dtype_caller) + exp = Index(['ab', 'aa', 'bb', 'ac']) if series_or_index == 'series': exp = Series(exp) From fc9aa67b0eddc9e9175eabc04c4db574c03ebf28 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 30 Apr 2018 10:22:11 +0200 Subject: [PATCH 07/10] Removed duplicate tests --- pandas/tests/test_strings.py | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index aaf3c99b97a73..8c9b3eb0db4b2 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -166,13 +166,9 @@ def test_str_cat(self, series_or_index): # Series/Index with Series t = Series(t) # s as Series has same index as t -> no warning - # s as Index is different from t.index -> warning + # s as Index is different from t.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat(t, na_rep='-'), exp) - else: - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) # Series/Index with Series: warning if different indexes t.index = t.index + 1 @@ -241,13 +237,9 @@ def test_str_cat_categorical(self, series_or_index, # Series/Index with Series t = Series(t) # s as Series has same index as t -> no warning - # s as Index is different from t.index -> warning + # s as Index is different from 
t.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat(t), exp) - else: - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - assert_series_or_index_equal(s.str.cat(t), exp) # Series/Index with Series: warning if different indexes t.index = t.index + 1 @@ -269,13 +261,9 @@ def test_str_cat_mixed_inputs(self, series_or_index): # Series/Index with DataFrame # s as Series has same index as d -> no warning - # s as Index is different from d.index -> warning + # s as Index is different from d.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat(d), exp) - else: - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - assert_series_or_index_equal(s.str.cat(d), exp) # Series/Index with DataFrame: warning if different indexes d.index = d.index + 1 @@ -288,13 +276,9 @@ def test_str_cat_mixed_inputs(self, series_or_index): # Series/Index with list of Series # s as Series has same index as t, s -> no warning - # s as Index is different from t.index -> warning + # s as Index is different from t.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat([t, s]), exp) - else: - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - assert_series_or_index_equal(s.str.cat([t, s]), exp) # Series/Index with list of Series: warning if different indexes tt = t.copy() @@ -308,13 +292,9 @@ def test_str_cat_mixed_inputs(self, series_or_index): # Series/Index with mixed list of Series/list-like # s as Series has same index as t -> no warning - # s as Index is different from t.index -> warning + # s as Index is different from t.index -> warning (tested below) if series_or_index == 'series': assert_series_equal(s.str.cat([t, s.values]), exp) - else: - with 
tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - assert_series_or_index_equal(s.str.cat([t, s.values]), exp) # Series/Index with mixed list: warning if different indexes with tm.assert_produces_warning(expected_warning=FutureWarning): From fcdb57b5965f88eda29fe58e4dc3c0507a24dfc1 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 30 Apr 2018 14:08:54 +0200 Subject: [PATCH 08/10] Improve tests/errors for str_cat; fix is_legal --- pandas/core/strings.py | 9 ++++----- pandas/tests/test_strings.py | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ac247f8a2ff00..e132f4f54cb75 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -141,7 +141,7 @@ def _length_check(others): elif len(x) != n: raise ValueError('All arrays must be same length') except TypeError: - raise ValueError("Did you mean to supply a `sep` keyword?") + raise ValueError('Must pass arrays containing strings to str_cat') return n @@ -2003,7 +2003,6 @@ def _get_series_list(self, others, ignore_index=False): # strings/NaN/None. Need to robustify check against # x in nxt being list-like (otherwise ambiguous boolean). 
is_legal = all((isinstance(x, compat.string_types) - or (is_list_like(x) and any(isnull(x))) or (not is_list_like(x) and isnull(x)) or x is None) for x in nxt) @@ -2184,10 +2183,10 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # turn anything in "others" into lists of Series tmp = self._get_series_list(others, ignore_index=(join is None)) others, fut_warn = tmp - except ValueError: # let TypeError raised by _get_series_list pass + except ValueError: # do not catch TypeError raised by _get_series_list if join is None: - # legacy warning - raise ValueError('All arrays must be same length') + raise ValueError('All arrays must be same length, except ' + 'those having an index if `join` is not None') else: raise ValueError('If `others` contains arrays or lists (or ' 'other list-likes without an index), these ' diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 8c9b3eb0db4b2..68764eb72378d 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -132,6 +132,18 @@ def test_cat(self): exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) tm.assert_almost_equal(result, exp) + # error for incorrect lengths + rgx = 'All arrays must be same length' + three = Series(['1', '2', '3']) + + with tm.assert_raises_regex(ValueError, rgx): + strings.str_cat(one, three) + + # error for incorrect type + rgx = "Must pass arrays containing strings to str_cat" + with tm.assert_raises_regex(ValueError, rgx): + strings.str_cat(one, 'three') + @pytest.mark.parametrize('series_or_index', ['series', 'index']) def test_str_cat(self, series_or_index): # test_cat above tests "str_cat" from ndarray to ndarray; @@ -183,7 +195,7 @@ def test_str_cat(self, series_or_index): assert_series_or_index_equal(s.str.cat(list(t), na_rep='-'), exp) # errors for incorrect lengths - rgx = 'All arrays must be same length' + rgx = 'All arrays must be same length, except.*' z = Series(['1', '2', '3']) with 
tm.assert_raises_regex(ValueError, rgx): @@ -305,7 +317,7 @@ def test_str_cat_mixed_inputs(self, series_or_index): assert_series_or_index_equal(s.str.cat(iter([t.values, list(s)])), exp) # errors for incorrect lengths - rgx = 'All arrays must be same length' + rgx = 'All arrays must be same length, except.*' z = Series(['1', '2', '3']) e = concat([z, z], axis=1) @@ -331,7 +343,7 @@ def test_str_cat_mixed_inputs(self, series_or_index): # errors for incorrect arguments in list-like rgx = 'others must be Series, Index, DataFrame,.*' - # make sure None/Nan also work as string-replacements + # make sure None/NaN do not crash checks in _get_series_list u = Series(['a', np.nan, 'c', None]) # mix of string and Series From 5a237ea61a4d7690966bb5dc8630c18b9fa35542 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 1 May 2018 14:49:15 +0200 Subject: [PATCH 09/10] Avoid deep inspection for known types in _get_series_list --- doc/source/text.rst | 2 +- pandas/core/strings.py | 33 +++++++++++++++++++-------------- pandas/tests/test_strings.py | 2 +- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index ea5aed69987c9..3486b58c82cf2 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -311,7 +311,7 @@ All one-dimensional list-likes can be arbitrarily combined in a list-like contai s u - s.str.cat([u, pd.Index(u.values), ['A', 'B', 'C', 'D']], na_rep='-') + s.str.cat([u, pd.Index(u.values), ['A', 'B', 'C', 'D'], map(int, u.index)], na_rep='-') All elements must match in length to the calling ``Series`` (or ``Index``), except those having an index if ``join`` is not None: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index e132f4f54cb75..34ecc859fa8dd 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1979,8 +1979,8 @@ def _get_series_list(self, others, ignore_index=False): elif isinstance(others, DataFrame): fut_warn = not others.index.equals(idx) if ignore_index and fut_warn: - 
# without copy, this could change (the corresponding list - # element of) "others" that was passed to str.cat + # without copy, this could change "others" + # that was passed to str.cat others = others.copy() others.index = idx return ([others[x] for x in others], fut_warn) @@ -1993,22 +1993,27 @@ def _get_series_list(self, others, ignore_index=False): los = [] fut_warn = False while others: - nxt = others.pop(0) - # safety for iterators etc.; nxt is list-like as per above - # do not map indexed objects, which would lose their index - if not isinstance(nxt, (DataFrame, Series, Index)): + nxt = others.pop(0) # list-like as per check above + # safety for iterators and other non-persistent list-likes + # do not map indexed/typed objects; would lose information + if not isinstance(nxt, (DataFrame, Series, + Index, np.ndarray)): nxt = list(nxt) - # Nested list-likes are forbidden - content of nxt must be - # strings/NaN/None. Need to robustify check against - # x in nxt being list-like (otherwise ambiguous boolean). - is_legal = all((isinstance(x, compat.string_types) - or (not is_list_like(x) and isnull(x)) - or x is None) - for x in nxt) + # known types without deep inspection + no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1) + or isinstance(nxt, (Series, Index))) + # Nested list-likes are forbidden - elements of nxt must be + # strings/NaN/None. 
Need to robustify NaN-check against + # x in nxt being list-like (otherwise ambiguous boolean) + is_legal = ((no_deep and nxt.dtype == object) + or all((isinstance(x, compat.string_types) + or (not is_list_like(x) and isnull(x)) + or x is None) + for x in nxt)) # DataFrame is false positive of is_legal # because "x in df" returns column names - if isinstance(nxt, DataFrame) or not is_legal: + if not is_legal or isinstance(nxt, DataFrame): raise TypeError(err_msg) tmp = self._get_series_list(nxt, ignore_index=ignore_index) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 68764eb72378d..1a978cbf6363f 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -19,7 +19,7 @@ import pandas.core.strings as strings -def assert_series_or_index_equal(left, right, expect_warn=False): +def assert_series_or_index_equal(left, right): if isinstance(left, Series): assert_series_equal(left, right) else: # Index From 3f77b80b4ca7c8b04894b00bc08e980b62b97b97 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 2 May 2018 06:49:31 +0200 Subject: [PATCH 10/10] Incorporate review feedback --- doc/source/text.rst | 6 +++--- pandas/core/strings.py | 35 ++++++++++++++++++----------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 3486b58c82cf2..02fa2d882f8b1 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -266,8 +266,8 @@ Concatenating a Series and an indexed object into a Series, with alignment .. versionadded:: 0.23.0 -For concatenation with a ``Series`` or ``DataFrame``, it is possible to align the respective indexes before concatenation by setting -the ``join``-keyword, which controls the manner of alignment. +For concatenation with a ``Series`` or ``DataFrame``, it is possible to align the indexes before concatenation by setting +the ``join``-keyword. .. 
ipython:: python @@ -282,7 +282,7 @@ the ``join``-keyword, which controls the manner of alignment. If the ``join`` keyword is not passed, the method :meth:`~Series.str.cat` will currently fall back to the behavior before version 0.23.0 (i.e. no alignment), but a ``FutureWarning`` will be raised if any of the involved indexes differ, since this default will change to ``join='left'`` in a future version. -To usual options are available for ``join`` (one of ``'left', 'outer', 'inner', 'right'``). +The usual options are available for ``join`` (one of ``'left', 'outer', 'inner', 'right'``). In particular, alignment also means that the different lengths do not need to coincide anymore. .. ipython:: python diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 34ecc859fa8dd..da4845fcdfc8e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1964,26 +1964,26 @@ def _get_series_list(self, others, ignore_index=False): err_msg = ('others must be Series, Index, DataFrame, np.ndarrary or ' 'list-like (either containing only strings or containing ' - 'only objects of type Series/Index/list-like/np.ndarray') + 'only objects of type Series/Index/list-like/np.ndarray)') if isinstance(others, Series): - fut_warn = not others.index.equals(idx) + fu_wrn = not others.index.equals(idx) los = [Series(others.values, index=idx) - if ignore_index and fut_warn else others] - return (los, fut_warn) + if ignore_index and fu_wrn else others] + return (los, fu_wrn) elif isinstance(others, Index): - fut_warn = not others.equals(idx) + fu_wrn = not others.equals(idx) los = [Series(others.values, index=(idx if ignore_index else others))] - return (los, fut_warn) + return (los, fu_wrn) elif isinstance(others, DataFrame): - fut_warn = not others.index.equals(idx) - if ignore_index and fut_warn: + fu_wrn = not others.index.equals(idx) + if ignore_index and fu_wrn: # without copy, this could change "others" # that was passed to str.cat others = others.copy() others.index = 
idx - return ([others[x] for x in others], fut_warn) + return ([others[x] for x in others], fu_wrn) elif isinstance(others, np.ndarray) and others.ndim == 2: others = DataFrame(others, index=idx) return ([others[x] for x in others], False) @@ -1991,7 +1991,7 @@ def _get_series_list(self, others, ignore_index=False): others = list(others) # ensure iterators do not get read twice etc if all(is_list_like(x) for x in others): los = [] - fut_warn = False + fu_wrn = False while others: nxt = others.pop(0) # list-like as per check above # safety for iterators and other non-persistent list-likes @@ -2016,10 +2016,11 @@ def _get_series_list(self, others, ignore_index=False): if not is_legal or isinstance(nxt, DataFrame): raise TypeError(err_msg) - tmp = self._get_series_list(nxt, ignore_index=ignore_index) - los = los + tmp[0] - fut_warn = fut_warn or tmp[1] - return (los, fut_warn) + nxt, fwn = self._get_series_list(nxt, + ignore_index=ignore_index) + los = los + nxt + fu_wrn = fu_wrn or fwn + return (los, fu_wrn) # test if there is a mix of list-like and non-list-like (e.g. str) elif (any(is_list_like(x) for x in others) and any(not is_list_like(x) for x in others)): @@ -2186,8 +2187,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): try: # turn anything in "others" into lists of Series - tmp = self._get_series_list(others, ignore_index=(join is None)) - others, fut_warn = tmp + others, fu_wrn = self._get_series_list(others, + ignore_index=(join is None)) except ValueError: # do not catch TypeError raised by _get_series_list if join is None: raise ValueError('All arrays must be same length, except ' @@ -2198,7 +2199,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 'must all be of the same length as the ' 'calling Series/Index.') - if join is None and fut_warn: + if join is None and fu_wrn: warnings.warn("A future version of pandas will perform index " "alignment when `others` is a Series/Index/" "DataFrame (or a list-like containing one). 
To "