From b5ab7f2202a26014d81092fa370e847e18273b19 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 27 May 2020 18:26:50 +0100 Subject: [PATCH 01/47] Categorical (to|from)_dummies methods Simplistic implementation to go between dummy variables and Categoricals. --- pandas/core/arrays/categorical.py | 109 +++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ef69d6565cfeb..6847224ccfc8d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2,7 +2,7 @@ from functools import partial import operator from shutil import get_terminal_size -from typing import Dict, Hashable, List, Type, Union, cast +from typing import TYPE_CHECKING, Dict, Hashable, List, Type, Union, cast from warnings import warn import numpy as np @@ -55,6 +55,9 @@ from pandas.io.formats import console +if TYPE_CHECKING: + from pandas._typing import DataFrame # noqa: F401 + def _cat_compare_op(op): opname = f"__{op.__name__}__" @@ -370,6 +373,110 @@ def __init__( self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) + @classmethod + def from_dummies(cls, dummies: "DataFrame", ordered=None): + """ + Create a `Categorical` using a ``DataFrame`` encoding those categories + as dummy/ one-hot encoded variables. + + The ``DataFrame`` must be coercible to boolean, + and have no more than one truthy value per row. + The columns of the ``DataFrame`` become the categories of the `Categorical`. + A column whose header is NA will be dropped. + + Parameters + ---------- + dummies : DataFrame of bool-like + ordered : bool + Whether or not this Categorical is ordered. + + Raises + ------ + ValueError + If a sample belongs to >1 category + + Returns + ------- + Categorical + + Examples + -------- + >>> df = pd.DataFrame( + ... [[1, 0, 0], [0, 1, 0], [0, 0, 1]], columns=["a", "b", "c"] + ... ) + >>> Categorical.from_dummies(df) + [a, b, c] + Categories (3, object): [a, b, c] + """ + # GH 8745 + from pandas import Series + + df = dummies.drop(columns=np.nan, errors="ignore").astype(bool) + + if (df.sum(axis=1) > 1).any(): + raise ValueError("Some rows belong to >1 category") + + index_into = Series([np.nan] + list(df.columns)) + mult_by = np.arange(1, len(index_into)) + + codes = (df.astype(int) * mult_by).sum(axis=1) - 1 + codes[codes.isna()] = -1 + return cls.from_codes(codes, df.columns.values, ordered=ordered) + + def to_dummies(self, na_column=None) -> "DataFrame": + """ + Create a ``DataFrame`` representing this `Categorical` + as dummy/ one-hot encoded variables. + + For more power over column names or to use a sparse matrix, + see :func:`pandas.get_dummies`. + + Parameters + ---------- + na_column : Optional + If None, NA values will be represented as a row of zeros. + Otherwise, this is the name of a new column representing + those NA values. + + Returns + ------- + DataFrame + + Examples + -------- + >>> Categorical(["a", "b", "c"]).to_dummies() + a b c + 0 True False False + 1 False True False + 2 False False True + + >>> Categorical(["a", "b", np.nan]).to_dummies() + a b + 0 True False + 1 False True + 2 False False + + >>> Categorical(["a", "b", np.nan]).to_dummies("c") + a b c + 0 True False False + 1 False True False + 2 False False True + + See Also + -------- + :func:`pandas.get_dummies` + """ + from pandas import DataFrame, CategoricalIndex + + eye = np.eye(len(self.categories) + 1, dtype=bool) + arr = eye[self.codes, :] + + if na_column is None: + return DataFrame(arr[:, :-1], columns=CategoricalIndex(self.categories)) + else: + cat_lst = list(self.categories) + [na_column] + return DataFrame(arr, columns=CategoricalIndex(cat_lst)) + @property def dtype(self) -> CategoricalDtype: """ From f937c96a5d61bbe7369284d5111a4403d9e045b8 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 28 May 2020 11:02:52 +0100 Subject: [PATCH 02/47] Tests: Categorical.(to|from)_dummies --- pandas/tests/arrays/categorical/test_api.py | 15 +++++- .../arrays/categorical/test_constructors.py | 51 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 6fce4b4145ff2..8e435f47e84ea 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series +from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series, get_dummies import pandas._testing as tm from pandas.core.arrays.categorical import recode_for_categories from pandas.tests.arrays.categorical.common import TestCategorical @@ -399,6 +399,19 @@ def test_remove_unused_categories(self): out = cat.remove_unused_categories() assert out.tolist() == val.tolist() + @pytest.mark.parametrize( + "vals", + [ + ["a", "b", "b", "a"], + ["a", "b", "b", "a", np.nan], + [1, 1.5, "a", (1, "b")], + [1, 1.5, "a", (1, "b"), np.nan], + ], + ) + def test_to_dummies(self, vals): + cats = Categorical(vals) + tm.assert_equal(cats.to_dummies(), get_dummies(cats).astype(bool)) + class TestCategoricalAPIWithFactor(TestCategorical): def test_describe(self): diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index e200f13652a84..b296838290360 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p16 + from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype from pandas.core.dtypes.dtypes import CategoricalDtype @@ -10,6 +12,7 @@ from pandas import ( Categorical, CategoricalIndex, + DataFrame, DatetimeIndex, Index, Interval, @@ -19,6 +22,7 @@ Series, Timestamp, date_range, + get_dummies, period_range, timedelta_range, ) @@ -635,6 +639,7 @@ def test_constructor_imaginary(self): tm.assert_index_equal(c1.categories, Index(values)) tm.assert_numpy_array_equal(np.array(c1), np.array(values)) + @pytest.mark.skipif(_np_version_under1p16, reason="Skipping for NumPy <1.16") def test_constructor_string_and_tuples(self): # GH 21416 c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) @@ -682,3 +687,49 @@ def test_interval(self): expected_codes = np.array([0, 1], dtype="int8") tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) + + def test_from_dummies(self): + # GH 8745 + raw = ["a", "a", "b", "c", "c", "a"] + dummies = get_dummies(raw) + cats = Categorical.from_dummies(dummies) + assert list(cats) == raw + + def test_from_dummies_nan(self): + raw = ["a", "a", "b", "c", "c", "a", np.nan] + dummies = get_dummies(raw) + cats = Categorical.from_dummies(dummies) + assert list(cats)[:-1] == raw[:-1] + assert pd.isna(list(cats)[-1]) + + def test_from_dummies_gt1(self): + dummies = DataFrame([[1, 0, 1], [0, 1, 0], [0, 0, 1]], columns=["a", "b", "c"]) + with pytest.raises(ValueError): + Categorical.from_dummies(dummies) + + @pytest.mark.parametrize("ordered", [None, False, True]) + def test_from_dummies_ordered(self, ordered): + raw = ["a", "a", "b", "c", "c", "a"] + dummies = get_dummies(raw) + cats = Categorical.from_dummies(dummies, ordered) + assert cats.ordered == bool(ordered) + + def test_from_dummies_types(self): + cols = ["a", 1, 1.5, ("a", "b"), (1, "c")] + dummies = DataFrame(np.eye(len(cols)), columns=cols) + cats = Categorical.from_dummies(dummies) + assert list(cats) == cols + + def test_from_dummies_drops_na(self): + cols = ["a", "b", np.nan] + dummies = DataFrame(np.eye(len(cols)), columns=cols) + cats = Categorical.from_dummies(dummies) + assert list(cats.categories) == cols[:-1] + assert pd.isna(cats[-1]) + + def test_from_dummies_multiindex(self): + tups = [("a", 1), ("a", 2), ("b", 1), ("b", 2)] + cols = MultiIndex.from_tuples(tups) + dummies = DataFrame(np.eye(len(cols)), columns=cols) + cats = Categorical.from_dummies(dummies) + assert list(cats.categories) == tups From dd141320549ba4e2ee3cfbdee5cd1903ba0ca5bb Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 28 May 2020 11:06:54 +0100 Subject: [PATCH 03/47] Add reference to Categorical.to_dummies to get_dummies --- pandas/core/reshape/reshape.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 18ebe14763797..e848f968b64e4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -768,6 +768,7 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. + Categorical.to_dummies : Simply create dummy variables from a Categorical. Examples -------- From 9dc9da5b7f333ffc5f5e85615f1052e9a199931d Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 28 May 2020 14:08:32 +0100 Subject: [PATCH 04/47] whatsnew: add issue number to Categorical.(to|from)_dummies --- doc/source/whatsnew/v1.1.0.rst | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a49b29d691692..f2df6579ced70 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -318,29 +318,10 @@ Other enhancements compression library. Compression was also added to the low-level Stata-file writers :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). -- :meth:`HDFStore.put` now accepts a ``track_times`` parameter. This parameter is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`). -- :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts `xlabel` and `ylabel` parameters to present labels on x and y axis (:issue:`9093`). -- Made :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable(:issue:`11704`) -- Made ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). -- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) -- :meth:`~pandas.core.groupby.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). -- :func:`read_json` now accepts an ``nrows`` parameter. (:issue:`33916`). -- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) -- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example - combining a nullable integer column with a numpy integer column will no longer - result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`). -- :func:`read_gbq` now allows to disable progress bar (:issue:`33360`). -- :func:`read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). -- :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ``ddof`` to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). -- :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`). -- :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) -- :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similar to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). -- :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` now accept ``index`` argument as an alias for tabulate's ``showindex`` (:issue:`32667`) -- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable Boolean dtype (:issue:`34859`) -- :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) -- :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) -- ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) -- :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) +- :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). +- Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) +- Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). +- :class:`~pandas.core.arrays.categorical.Categorical` now has methods for converting to and from dummy/ one-hot encoded variables: :meth:`Categorical.to_dummies` and :meth:`Categorical.from_dummies` respectively. :meth:`Categorical.to_dummies` is smaller in scope than :func:`~pandas.core.reshape.reshape.get_dummies`, which can still be used if you require the extra flexibility. .. --------------------------------------------------------------------------- From ac9cec26bad2959dda4e3592bacde87efaee9436 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 28 May 2020 14:15:39 +0100 Subject: [PATCH 05/47] Review comments for dummies tests --- pandas/tests/arrays/categorical/test_api.py | 1 + pandas/tests/arrays/categorical/test_constructors.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 8e435f47e84ea..b65cabe735c5b 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -409,6 +409,7 @@ def test_remove_unused_categories(self): ], ) def test_to_dummies(self, vals): + # GH 8745 cats = Categorical(vals) tm.assert_equal(cats.to_dummies(), get_dummies(cats).astype(bool)) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index b296838290360..44ee6eddae628 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -696,6 +696,7 @@ def test_from_dummies(self): assert list(cats) == raw def test_from_dummies_nan(self): + # GH 8745 raw = ["a", "a", "b", "c", "c", "a", np.nan] dummies = get_dummies(raw) cats = Categorical.from_dummies(dummies) @@ -703,24 +704,28 @@ def test_from_dummies_nan(self): assert pd.isna(list(cats)[-1]) def test_from_dummies_gt1(self): + # GH 8745 dummies = DataFrame([[1, 0, 1], [0, 1, 0], [0, 0, 1]], columns=["a", "b", "c"]) with pytest.raises(ValueError): Categorical.from_dummies(dummies) @pytest.mark.parametrize("ordered", [None, False, True]) def test_from_dummies_ordered(self, ordered): + # GH 8745 raw = ["a", "a", "b", "c", "c", "a"] dummies = get_dummies(raw) cats = Categorical.from_dummies(dummies, ordered) assert cats.ordered == bool(ordered) def test_from_dummies_types(self): + # GH 8745 cols = ["a", 1, 1.5, ("a", "b"), (1, "c")] dummies = DataFrame(np.eye(len(cols)), columns=cols) cats = Categorical.from_dummies(dummies) assert list(cats) == cols def test_from_dummies_drops_na(self): + # GH 8745 cols = ["a", "b", np.nan] dummies = DataFrame(np.eye(len(cols)), columns=cols) cats = Categorical.from_dummies(dummies) @@ -728,6 +733,7 @@ def test_from_dummies_drops_na(self): assert pd.isna(cats[-1]) def test_from_dummies_multiindex(self): + # GH 8745 tups = [("a", 1), ("a", 2), ("b", 1), ("b", 2)] cols = MultiIndex.from_tuples(tups) dummies = DataFrame(np.eye(len(cols)), columns=cols) From 0459cb130207d3f0eac7d2fa625ba94e553e6769 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 28 May 2020 14:16:18 +0100 Subject: [PATCH 06/47] Review comments for dummies implementation --- pandas/core/arrays/categorical.py | 39 ++++++++++++++----------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6847224ccfc8d..237d4f582a4f8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2,7 +2,7 @@ from functools import partial import operator from shutil import get_terminal_size -from typing import TYPE_CHECKING, Dict, Hashable, List, Type, Union, cast +from typing import TYPE_CHECKING, Dict, Hashable, List, Optional, Type, Union, cast from warnings import warn import numpy as np @@ -374,26 +374,27 @@ def __init__( self._codes = coerce_indexer_dtype(codes, dtype.categories) @classmethod - def from_dummies(cls, dummies: "DataFrame", ordered=None): - """ - Create a `Categorical` using a ``DataFrame`` encoding those categories - as dummy/ one-hot encoded variables. + def from_dummies( + cls, dummies: "DataFrame", ordered: Optional[bool] = None + ) -> "Categorical": + """Create a `Categorical` using a ``DataFrame`` of dummy variables. The ``DataFrame`` must be coercible to boolean, and have no more than one truthy value per row. The columns of the ``DataFrame`` become the categories of the `Categorical`. - A column whose header is NA will be dropped. + A column whose header is NA will be dropped; + any row with a NA value will be uncategorised. Parameters ---------- - dummies : DataFrame of bool-like - ordered : bool - Whether or not this Categorical is ordered. + dummies : DataFrame of bool-like + ordered : bool + Whether or not this Categorical is ordered. Raises ------ - ValueError - If a sample belongs to >1 category + ValueError + If a sample belongs to >1 category Returns ------- @@ -409,15 +410,12 @@ def from_dummies(cls, dummies: "DataFrame", ordered=None): Categories (3, object): [a, b, c] """ # GH 8745 - from pandas import Series - df = dummies.drop(columns=np.nan, errors="ignore").astype(bool) if (df.sum(axis=1) > 1).any(): raise ValueError("Some rows belong to >1 category") - index_into = Series([np.nan] + list(df.columns)) - mult_by = np.arange(1, len(index_into)) + mult_by = np.arange(1, df.shape[1] + 1) codes = (df.astype(int) * mult_by).sum(axis=1) - 1 codes[codes.isna()] = -1 @@ -425,18 +423,17 @@ def from_dummies(cls, dummies: "DataFrame", ordered=None): def to_dummies(self, na_column=None) -> "DataFrame": """ - Create a ``DataFrame`` representing this `Categorical` - as dummy/ one-hot encoded variables. + Create a ``DataFrame`` of boolean dummy variables representing this object. For more power over column names or to use a sparse matrix, see :func:`pandas.get_dummies`. Parameters ---------- - na_column : Optional - If None, NA values will be represented as a row of zeros. - Otherwise, this is the name of a new column representing - those NA values. + na_column : Optional + If None, NA values will be represented as a row of zeros. + Otherwise, this is the name of a new column representing + those NA values. Returns ------- From 65e68c26c61374d26078b3a419f81e72ae62d3cd Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 10:04:13 +0100 Subject: [PATCH 07/47] dummies review comments --- pandas/core/arrays/categorical.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 237d4f582a4f8..1e8f59b0797c6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -409,15 +409,17 @@ def from_dummies( [a, b, c] Categories (3, object): [a, b, c] """ - # GH 8745 df = dummies.drop(columns=np.nan, errors="ignore").astype(bool) if (df.sum(axis=1) > 1).any(): raise ValueError("Some rows belong to >1 category") - mult_by = np.arange(1, df.shape[1] + 1) - - codes = (df.astype(int) * mult_by).sum(axis=1) - 1 + mult_by = np.arange(df.shape[1]) + 1 + # 000 000 0 -1 + # 010 020 2 1 + # 001 * 1,2,3 => 003 -> 3 -> 2 = correct codes + # 100 100 1 0 + codes = (df * mult_by).sum(axis=1) - 1 codes[codes.isna()] = -1 return cls.from_codes(codes, df.columns.values, ordered=ordered) @@ -453,8 +455,8 @@ def to_dummies(self, na_column=None) -> "DataFrame": 1 False True 2 False False - >>> Categorical(["a", "b", np.nan]).to_dummies("c") - a b c + >>> Categorical(["a", "b", np.nan]).to_dummies("other") + a b other 0 True False False 1 False True False 2 False False True From 133402619cd57bdb1e0dfa66d466aff8f52c9b42 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 10:40:31 +0100 Subject: [PATCH 08/47] User guide: Describe Categorical.(to|from)_dummies --- doc/source/user_guide/categorical.rst | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index b7475ae7bb132..e5203ef8d3c3f 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -127,6 +127,48 @@ This conversion is likewise done column by column: df_cat['A'] df_cat['B'] +Dummy / indicator / one-hot encoded variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some operations, like regression and classification, +encodes a single categorical variable as a column for each category, +with each row having False in all but one column (True). +These are called dummy variables, or one-hot encoding. +:class:`pandas.Categorical`s can easily be converted to and from such an encoding: + +.. ipython:: python + + cat = pd.Categorical(["a", "b", "b", "c"]) + cat + + dummies = cat.to_dummies() + dummies + + pd.Categorical.from_dummies(dummies) + +The :meth:`pandas.Categorical.from_dummies` class method accepts a dataframe +whose dtypes are coercible to boolean, and an ``ordered`` argument +for whether the resulting ``Categorical`` should be considered ordered +(like the ``Categorical`` constructor). +A column with a NA index will be ignored. +Any row which is entirely falsey, or has a missing value, +will be uncategorised. + +:meth:`pandas.Categorical.to_dummies` produces a boolean dataframe of dummy variables. +If the ``na_column`` argument is ``None`` (default), +missing items will result in a row of ``False``. +Otherwise, the value of ``na_column`` will be used as the index +of an extra column representing these items: + +.. ipython:: python + + cat = pd.Categorical(["a", "b", np.nan]) + cat.to_dummies(na_column="other") + +For more control over data types and column names, +see :func:`pandas.get_dummies`. + +.. versionadded:: 1.1.0 Controlling behavior ~~~~~~~~~~~~~~~~~~~~ From c2240b67451c3588df4ddf9caca32a2da6fabaa0 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 11:21:53 +0100 Subject: [PATCH 09/47] Fix user guide errors --- doc/source/user_guide/categorical.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index e5203ef8d3c3f..adf9d4f551d72 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -128,13 +128,13 @@ This conversion is likewise done column by column: df_cat['B'] Dummy / indicator / one-hot encoded variables -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Some operations, like regression and classification, encodes a single categorical variable as a column for each category, with each row having False in all but one column (True). These are called dummy variables, or one-hot encoding. -:class:`pandas.Categorical`s can easily be converted to and from such an encoding: +:class:`pandas.Categorical` objects can easily be converted to and from such an encoding: .. ipython:: python From 66771bfe0f4d85484d10e6be000468cae4ca01f1 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 13:00:16 +0100 Subject: [PATCH 10/47] Fix numpy element from sequence error --- pandas/core/arrays/categorical.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1e8f59b0797c6..fc16a96b710de 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -465,7 +465,7 @@ def to_dummies(self, na_column=None) -> "DataFrame": -------- :func:`pandas.get_dummies` """ - from pandas import DataFrame, CategoricalIndex + from pandas import DataFrame, CategoricalIndex, Series eye = np.eye(len(self.categories) + 1, dtype=bool) arr = eye[self.codes, :] @@ -473,8 +473,8 @@ def to_dummies(self, na_column=None) -> "DataFrame": if na_column is None: return DataFrame(arr[:, :-1], columns=CategoricalIndex(self.categories)) else: - cat_lst = list(self.categories) + [na_column] - return DataFrame(arr, columns=CategoricalIndex(cat_lst)) + cats = CategoricalIndex(Series(list(self.categories) + [na_column])) + return DataFrame(arr, columns=cats) @property def dtype(self) -> CategoricalDtype: From 4e769da521736e8c033dff801ff345a8fbed7995 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 13:00:37 +0100 Subject: [PATCH 11/47] Test to_dummies column type cast --- pandas/tests/arrays/categorical/test_api.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index b65cabe735c5b..0122fd581f43d 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -410,9 +410,18 @@ def test_remove_unused_categories(self): ) def test_to_dummies(self, vals): # GH 8745 - cats = Categorical(vals) + cats = Categorical(Series(vals)) tm.assert_equal(cats.to_dummies(), get_dummies(cats).astype(bool)) + def test_to_dummies_na_dtype(self): + # when dtype of NA column name != dtype of categories, + # check the cast to object + # GH 8745 + cats = Categorical([1, 2, 2, 1, np.nan]) + assert cats.dtype != object + dummies = cats.to_dummies(na_column="other") + assert dummies.columns.categories.dtype == object + class TestCategoricalAPIWithFactor(TestCategorical): def test_describe(self): From fe002af13443a7d9e1b86ba492abbc9fe7254f26 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 15:56:40 +0100 Subject: [PATCH 12/47] Test review comments - Test Categories.from_dummies from sparse - Test that all NA-valued headers are dropped - Test for informative error message --- .../tests/arrays/categorical/test_constructors.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 44ee6eddae628..849e0819fe241 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -688,25 +688,27 @@ def test_interval(self): tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) - def test_from_dummies(self): + @pytest.mark.parametrize("sparse", [True, False]) + def test_from_dummies(self, sparse): # GH 8745 raw = ["a", "a", "b", "c", "c", "a"] - dummies = get_dummies(raw) + dummies = get_dummies(raw, sparse=sparse) cats = Categorical.from_dummies(dummies) assert list(cats) == raw - def test_from_dummies_nan(self): + @pytest.mark.parametrize("na_val", [np.nan, pd.NA, None, pd.NaT]) + def test_from_dummies_nan(self, na_val): # GH 8745 - raw = ["a", "a", "b", "c", "c", "a", np.nan] + raw = ["a", "a", "b", "c", "c", "a", na_val] dummies = get_dummies(raw) cats = Categorical.from_dummies(dummies) assert list(cats)[:-1] == raw[:-1] assert pd.isna(list(cats)[-1]) - def test_from_dummies_gt1(self): + def test_from_dummies_multiple(self): # GH 8745 dummies = DataFrame([[1, 0, 1], [0, 1, 0], [0, 0, 1]], columns=["a", "b", "c"]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="multiple categories"): Categorical.from_dummies(dummies) @pytest.mark.parametrize("ordered", [None, False, True]) From 097f2c6bcf9e46596560b1f2ddb75e06d7dbc380 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 15:58:06 +0100 Subject: [PATCH 13/47] Review comments for implementation - Handle more NA types - More examples - More informative error message --- pandas/core/arrays/categorical.py | 54 +++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index fc16a96b710de..3219e1c8762bb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -379,15 +379,15 @@ def from_dummies( ) -> "Categorical": """Create a `Categorical` using a ``DataFrame`` of dummy variables. - The ``DataFrame`` must be coercible to boolean, - and have no more than one truthy value per row. + The ``DataFrame`` must have no more than one truthy value per row. The columns of the ``DataFrame`` become the categories of the `Categorical`. - A column whose header is NA will be dropped; - any row with a NA value will be uncategorised. + A column whose header is NA will be dropped: + any row containing a NA value will be uncategorised. Parameters ---------- - dummies : DataFrame of bool-like + dummies : DataFrame + dtypes of columns with non-NA headers must be coercible to bool. ordered : bool Whether or not this Categorical is ordered. @@ -402,17 +402,45 @@ def from_dummies( Examples -------- - >>> df = pd.DataFrame( - ... [[1, 0, 0], [0, 1, 0], [0, 0, 1]], columns=["a", "b", "c"] - ... ) - >>> Categorical.from_dummies(df) + >>> simple = pd.DataFrame(np.eye(3), columns=["a", "b", "c"]) + >>> Categorical.from_dummies(simple) [a, b, c] Categories (3, object): [a, b, c] - """ - df = dummies.drop(columns=np.nan, errors="ignore").astype(bool) - if (df.sum(axis=1) > 1).any(): - raise ValueError("Some rows belong to >1 category") + >>> nan_col = pd.DataFrame(np.eye(4), columns=["a", "b", np.nan, None]) + >>> Categorical.from_dummies(nan_col) + [a, b, NaN, NaN] + Categories (2, object): [a, b] + + >>> nan_cell = pd.DataFrame( + ... [[1, 0, np.nan], [0, 1, 0], [0, 0, 1]], + ... columns=["a", "b", "c"], + ... ) + >>> Categorical.from_dummies(nan_cell) + [NaN, b, c] + Categories (3, object): [a, b, c] + + >>> multi = pd.DataFrame( + ... [[1, 0, 1], [0, 1, 0], [0, 0, 1]], + ... columns=["a", "b", "c"], + ... ) + >>> Categorical.from_dummies(multi) + Traceback (most recent call last): + ... + ValueError: 1 record(s) belongs to multiple categories: [0] + """ + to_drop = dummies.columns[dummies.columns.isna()] + if len(to_drop): + dummies = dummies.drop(columns=to_drop) + df = dummies.astype(bool) + + multicat_rows = df.sum(axis=1) > 1 + if multicat_rows.any(): + raise ValueError( + "{} record(s) belongs to multiple categories: {}".format( + multicat_rows.sum(), list(df.index[multicat_rows]), + ) + ) mult_by = np.arange(df.shape[1]) + 1 # 000 000 0 -1 From afe8eda71f7c9ddff9d68ad85dfeef8ddbe0b901 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 16:28:35 +0100 Subject: [PATCH 14/47] Fix doctest for missing values --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3219e1c8762bb..307f1413e7d5e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -432,9 +432,9 @@ def from_dummies( to_drop = dummies.columns[dummies.columns.isna()] if len(to_drop): dummies = dummies.drop(columns=to_drop) - df = dummies.astype(bool) + df = dummies.astype("boolean") - multicat_rows = df.sum(axis=1) > 1 + multicat_rows = df.sum(axis=1, skipna=False) > 1 if multicat_rows.any(): raise ValueError( "{} record(s) belongs to multiple categories: {}".format( From e78158ea4aec01ebdbcf6314b6e2430860cb135c Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 16:33:29 +0100 Subject: [PATCH 15/47] xfail for Categorical from sparse --- pandas/tests/arrays/categorical/test_constructors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 849e0819fe241..0b1f7819d9505 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -690,6 +690,8 @@ def test_interval(self): @pytest.mark.parametrize("sparse", [True, False]) def test_from_dummies(self, sparse): + if sparse: + pytest.xfail("from sparse is not supported") # GH 8745 raw = ["a", "a", "b", "c", "c", "a"] dummies = get_dummies(raw, sparse=sparse) From 4fb1e5ea1d90207bd7ee7265a7d8ff6c93ecc2cf Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 16:33:51 +0100 Subject: [PATCH 16/47] Fix tests --- pandas/core/arrays/categorical.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 307f1413e7d5e..57ffdf560426f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -388,6 +388,7 @@ def from_dummies( ---------- dummies : DataFrame dtypes of columns with non-NA headers must be coercible to bool. + Sparse dataframes are not supported. ordered : bool Whether or not this Categorical is ordered. @@ -429,7 +430,7 @@ def from_dummies( ... ValueError: 1 record(s) belongs to multiple categories: [0] """ - to_drop = dummies.columns[dummies.columns.isna()] + to_drop = dummies.columns[dummies.columns.values.isna()] if len(to_drop): dummies = dummies.drop(columns=to_drop) df = dummies.astype("boolean") From 9fa549411b24b9e3e722b0d1b09230cfd0ff6775 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 17:15:47 +0100 Subject: [PATCH 17/47] fix isna --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 57ffdf560426f..1de927eada9ca 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -430,7 +430,7 @@ def from_dummies( ... ValueError: 1 record(s) belongs to multiple categories: [0] """ - to_drop = dummies.columns[dummies.columns.values.isna()] + to_drop = dummies.columns[isna(dummies.columns.values)] if len(to_drop): dummies = dummies.drop(columns=to_drop) df = dummies.astype("boolean") From 61567fd24a3807d154c5622298d01296d312c30d Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 17:44:49 +0100 Subject: [PATCH 18/47] Explicit integer type cast --- pandas/core/arrays/categorical.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1de927eada9ca..bd19c3f629882 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -448,9 +448,8 @@ def from_dummies( # 010 020 2 1 # 001 * 1,2,3 => 003 -> 3 -> 2 = correct codes # 100 100 1 0 - codes = (df * mult_by).sum(axis=1) - 1 - codes[codes.isna()] = -1 - return cls.from_codes(codes, df.columns.values, ordered=ordered) + codes = ((df * mult_by).sum(axis=1) - 1).astype("Int64") + return cls.from_codes(codes.fillna(-1), df.columns.values, ordered=ordered) def to_dummies(self, na_column=None) -> "DataFrame": """ From 5d724ccc85d771c4f30c47d018f737b4a353b5f2 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 17:46:14 +0100 Subject: [PATCH 19/47] Test categorical <-> dummies roundtrip --- pandas/tests/arrays/categorical/test_api.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 0122fd581f43d..56aa03ee82a3e 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -422,6 +422,22 @@ def test_to_dummies_na_dtype(self): dummies = cats.to_dummies(na_column="other") assert dummies.columns.categories.dtype == object + @pytest.mark.parametrize( + "vals", + [ + ["a", "b", "b", "a"], + ["a", "b", "b", "a", np.nan], + [1, 1.5, "a", (1, "b")], + [1, 1.5, "a", (1, "b"), np.nan], + ], + ) + def test_dummies_roundtrip(self, vals): + # GH 8745 + cats = Categorical(Series(vals)) + dummies = cats.to_dummies() + cats2 = Categorical.from_dummies(dummies) + tm.assert_equal(cats, cats2) + class TestCategoricalAPIWithFactor(TestCategorical): def test_describe(self): From 1182ce52933e63df66b52d1b91c13ec8c46a8919 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 29 May 2020 18:22:13 +0100 Subject: [PATCH 20/47] more type casts --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bd19c3f629882..9704a990400d2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -448,7 +448,7 @@ def from_dummies( # 010 020 2 1 # 001 * 1,2,3 => 003 -> 3 -> 2 = correct codes # 100 100 1 0 - codes = ((df * mult_by).sum(axis=1) - 1).astype("Int64") + codes = ((df * mult_by).sum(axis=1, skipna=False) - 1).astype("Int64") return cls.from_codes(codes.fillna(-1), df.columns.values, ordered=ordered) def to_dummies(self, na_column=None) -> "DataFrame": From 04ca72a3f423390fc2ea69e6e578a48dc3105dd4 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 10:59:12 +0100 Subject: [PATCH 21/47] Add wiki link for dummy variables --- doc/source/user_guide/categorical.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index adf9d4f551d72..fe5fc3d734fe7 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -133,7 +133,7 @@ Dummy / indicator / one-hot encoded variables Some operations, like regression and classification, encodes a single categorical variable as a column for each category, with each row having False in all but one column (True). -These are called dummy variables, or one-hot encoding. +These are called `dummy variables `_, or one-hot encoding. :class:`pandas.Categorical` objects can easily be converted to and from such an encoding: .. ipython:: python From 6e4f71adafa5bbf3dfbffdde4e988707cc3405c2 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 10:59:47 +0100 Subject: [PATCH 22/47] Remove deprecated numpy Date: Thu, 17 Sep 2020 11:12:47 +0100 Subject: [PATCH 23/47] isort fix --- pandas/tests/series/indexing/test_datetime.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 088f8681feb99..19d5dfa3b3900 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -11,6 +11,7 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas._testing as tm + """ Also test support for datetime64[ns] in Series / DataFrame """ From ed58c7799a423e60888a22ef0d1378807a897f4a Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 12:04:06 +0100 Subject: [PATCH 24/47] undo changes to whatsnew v1.1.0 --- doc/source/whatsnew/v1.1.0.rst | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f2df6579ced70..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -318,10 +318,29 @@ Other enhancements compression library. Compression was also added to the low-level Stata-file writers :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). -- :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). -- Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) -- Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). -- :class:`~pandas.core.arrays.categorical.Categorical` now has methods for converting to and from dummy/ one-hot encoded variables: :meth:`Categorical.to_dummies` and :meth:`Categorical.from_dummies` respectively. :meth:`Categorical.to_dummies` is smaller in scope than :func:`~pandas.core.reshape.reshape.get_dummies`, which can still be used if you require the extra flexibility. +- :meth:`HDFStore.put` now accepts a ``track_times`` parameter. This parameter is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`). +- :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts `xlabel` and `ylabel` parameters to present labels on x and y axis (:issue:`9093`). +- Made :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable(:issue:`11704`) +- Made ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) +- :meth:`~pandas.core.groupby.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). +- :func:`read_json` now accepts an ``nrows`` parameter. (:issue:`33916`). +- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) +- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example + combining a nullable integer column with a numpy integer column will no longer + result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`). +- :func:`read_gbq` now allows to disable progress bar (:issue:`33360`). +- :func:`read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). +- :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ``ddof`` to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). +- :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`). +- :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) +- :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similar to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). +- :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` now accept ``index`` argument as an alias for tabulate's ``showindex`` (:issue:`32667`) +- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable Boolean dtype (:issue:`34859`) +- :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) +- :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) +- ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) +- :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) .. --------------------------------------------------------------------------- From 741cf8f4f4006743ddcd8c2bc2c590c84ef8811c Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 12:04:46 +0100 Subject: [PATCH 25/47] whatsnew/v1.2.0: Categorical get/from dummies --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6a5b4b3b9ff16..c3ac951eb51b1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -120,6 +120,7 @@ Other enhancements - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) +- :meth:`Categorical.from_dummies` and :meth:`Categorical.get_dummies` convert between :class:`Categorical` and :class:`DataFrame` objects of dummy variables. .. _whatsnew_120.api_breaking.python: From 6f199b6013a8ed79e07f271ccfbc277c4f04e06e Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 12:05:44 +0100 Subject: [PATCH 26/47] Update user_guide/categorical docs --- doc/source/user_guide/categorical.rst | 32 +++++++++------------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index fe5fc3d734fe7..892c20bc64c51 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -134,17 +134,7 @@ Some operations, like regression and classification, encodes a single categorical variable as a column for each category, with each row having False in all but one column (True). These are called `dummy variables `_, or one-hot encoding. -:class:`pandas.Categorical` objects can easily be converted to and from such an encoding: - -.. ipython:: python - - cat = pd.Categorical(["a", "b", "b", "c"]) - cat - - dummies = cat.to_dummies() - dummies - - pd.Categorical.from_dummies(dummies) +:class:`pandas.Categorical` objects can easily be converted to and from such an encoding. The :meth:`pandas.Categorical.from_dummies` class method accepts a dataframe whose dtypes are coercible to boolean, and an ``ordered`` argument @@ -154,21 +144,21 @@ A column with a NA index will be ignored. Any row which is entirely falsey, or has a missing value, will be uncategorised. -:meth:`pandas.Categorical.to_dummies` produces a boolean dataframe of dummy variables. -If the ``na_column`` argument is ``None`` (default), -missing items will result in a row of ``False``. -Otherwise, the value of ``na_column`` will be used as the index -of an extra column representing these items: +:meth:`pandas.Categorical.get_dummies` produces a dataframe of dummy variables. +It works in the same way and supports most of the same arguments as :func:`pandas.get_dummies`. .. ipython:: python - cat = pd.Categorical(["a", "b", np.nan]) - cat.to_dummies(na_column="other") + cat = pd.Categorical(["a", "b", "b", "c"]) + cat + + dummies = cat.get_dummies() + dummies + + pd.Categorical.from_dummies(dummies) -For more control over data types and column names, -see :func:`pandas.get_dummies`. -.. versionadded:: 1.1.0 +.. versionadded:: 1.2.0 Controlling behavior ~~~~~~~~~~~~~~~~~~~~ From 034f8e1efc15c6974ade669248ffaf5d008b9479 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 12:06:09 +0100 Subject: [PATCH 27/47] Reference Categorical.get_dummies in reshape.py --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e848f968b64e4..be5a39a9f90d4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -768,7 +768,7 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. - Categorical.to_dummies : Simply create dummy variables from a Categorical. + Categorical.get_dummies : Convert a Categorical array to dummy codes. Examples -------- From b80f089978ed92aba023f77ddcd3ab3a4a88a432 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 12:06:59 +0100 Subject: [PATCH 28/47] Categorical->dummies more like get_dummies --- pandas/core/arrays/categorical.py | 127 ++++++++++++++++++++---------- 1 file changed, 86 insertions(+), 41 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9704a990400d2..4db55ee1871a8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -451,58 +451,103 @@ def from_dummies( codes = ((df * mult_by).sum(axis=1, skipna=False) - 1).astype("Int64") return cls.from_codes(codes.fillna(-1), df.columns.values, ordered=ordered) - def to_dummies(self, na_column=None) -> "DataFrame": - """ - Create a ``DataFrame`` of boolean dummy variables representing this object. - - For more power over column names or to use a sparse matrix, - see :func:`pandas.get_dummies`. + def get_dummies( + self, + prefix=None, + prefix_sep="_", + dummy_na=False, + sparse=False, + drop_first=False, + dtype=None, + ) -> "DataFrame": + """ + Convert into dummy/indicator variables. Parameters ---------- - na_column : Optional - If None, NA values will be represented as a row of zeros. - Otherwise, this is the name of a new column representing - those NA values. + prefix : str, default None + String to append DataFrame column names. + prefix_sep : str, default '_' + If appending prefix, separator/delimiter to use. + dummy_na : bool, default False + Add a column to indicate NaNs, if False NaNs are ignored. + sparse : bool, default False + Whether the dummy-encoded columns should be backed by + a :class:`SparseArray` (True) or a regular NumPy array (False). + drop_first : bool, default False + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + dtype : dtype, default np.uint8 + Data type for new columns. Only a single dtype is allowed. Returns ------- DataFrame - - Examples - -------- - >>> Categorical(["a", "b", "c"]).to_dummies() - a b c - 0 True False False - 1 False True False - 2 False False True - - >>> Categorical(["a", "b", np.nan]).to_dummies() - a b - 0 True False - 1 False True - 2 False False - - >>> Categorical(["a", "b", np.nan]).to_dummies("other") - a b other - 0 True False False - 1 False True False - 2 False False True + Dummy-coded data. See Also -------- - :func:`pandas.get_dummies` - """ - from pandas import DataFrame, CategoricalIndex, Series + Series.str.get_dummies : Convert Series to dummy codes. + pandas.get_dummies : Convert categorical variable to dummy/indicator variables. - eye = np.eye(len(self.categories) + 1, dtype=bool) - arr = eye[self.codes, :] - - if na_column is None: - return DataFrame(arr[:, :-1], columns=CategoricalIndex(self.categories)) - else: - cats = CategoricalIndex(Series(list(self.categories) + [na_column])) - return DataFrame(arr, columns=cats) + Examples + -------- + >>> s = pd.Categorical(list('abca')) + + >>> s.get_dummies() + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + + >>> s1 = pd.Categorical(['a', 'b', np.nan]) + + >>> s1.get_dummies() + a b + 0 1 0 + 1 0 1 + 2 0 0 + + >>> s1.get_dummies(dummy_na=True) + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + + >>> pd.Categorical(list('abcaa)).get_dummies() + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + 4 1 0 0 + + >>> pd.Categorical(list('abcaa)).get_dummies(drop_first=True) + b c + 0 0 0 + 1 1 0 + 2 0 1 + 3 0 0 + 4 0 0 + + >>> pd.Categorical(list('abc')).get_dummies(dtype=float) + a b c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas import _get_dummies_1d + + return _get_dummies_1d( + self, + prefix=prefix, + prefix_sep=prefix_sep, + dummy_na=dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) @property def dtype(self) -> CategoricalDtype: From 0eb936f1c04813a0c5573e88b68969ea07f7d07d Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 12:07:19 +0100 Subject: [PATCH 29/47] categorical tests --- pandas/tests/arrays/categorical/test_api.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 56aa03ee82a3e..7fcb568c665ed 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -408,19 +408,10 @@ def test_remove_unused_categories(self): [1, 1.5, "a", (1, "b"), np.nan], ], ) - def test_to_dummies(self, vals): + def test_get_dummies(self, vals): # GH 8745 cats = Categorical(Series(vals)) - tm.assert_equal(cats.to_dummies(), get_dummies(cats).astype(bool)) - - def test_to_dummies_na_dtype(self): - # when dtype of NA column name != dtype of categories, - # check the cast to object - # GH 8745 - cats = Categorical([1, 2, 2, 1, np.nan]) - assert cats.dtype != object - dummies = cats.to_dummies(na_column="other") - assert dummies.columns.categories.dtype == object + tm.assert_equal(cats.get_dummies(), get_dummies(cats)) @pytest.mark.parametrize( "vals", From 8f212e1d999937538ae7043a639a62a7f0aca13f Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 12:08:05 +0100 Subject: [PATCH 30/47] isort pandas_web --- web/pandas_web.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/web/pandas_web.py b/web/pandas_web.py index 7dd63175e69ac..e62deaa8cdc7f 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -34,12 +34,13 @@ import time import typing -import feedparser import jinja2 -import markdown import requests import yaml +import feedparser +import markdown + class Preprocessors: """ From bda526512be6c16cececfb00eff4f1912795bde4 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 13:24:45 +0100 Subject: [PATCH 31/47] fix _get_dummies_1d import path --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4db55ee1871a8..3a241fca60eb2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -537,7 +537,7 @@ def get_dummies( 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - from pandas import _get_dummies_1d + from pandas.core.reshape.reshape import _get_dummies_1d return _get_dummies_1d( self, From 6e6ddda82e0c79dd3afaa201e88d10b1ae6de118 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 13:25:38 +0100 Subject: [PATCH 32/47] categorical.test_api: to->get dummies --- pandas/tests/arrays/categorical/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 7fcb568c665ed..d47841618d6f0 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -425,7 +425,7 @@ def test_get_dummies(self, vals): def test_dummies_roundtrip(self, vals): # GH 8745 cats = Categorical(Series(vals)) - dummies = cats.to_dummies() + dummies = cats.get_dummies() cats2 = Categorical.from_dummies(dummies) tm.assert_equal(cats, cats2) From 9fcebf066d487fabffda6a20afa9027fe17c9bc3 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 13:37:57 +0100 Subject: [PATCH 33/47] isort pandas_web --- web/pandas_web.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/web/pandas_web.py b/web/pandas_web.py index e62deaa8cdc7f..7dd63175e69ac 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -34,13 +34,12 @@ import time import typing +import feedparser import jinja2 +import markdown import requests import yaml -import feedparser -import markdown - class Preprocessors: """ From b9908c419f9e8d6c1f88ef4ce59097c145ccb4da Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 13:51:34 +0100 Subject: [PATCH 34/47] fix typos in categorical doctests --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3a241fca60eb2..cf8b6c00e5a53 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -515,7 +515,7 @@ def get_dummies( 1 0 1 0 2 0 0 1 - >>> pd.Categorical(list('abcaa)).get_dummies() + >>> pd.Categorical(list('abcaa')).get_dummies() a b c 0 1 0 0 1 0 1 0 @@ -523,7 +523,7 @@ def get_dummies( 3 1 0 0 4 1 0 0 - >>> pd.Categorical(list('abcaa)).get_dummies(drop_first=True) + >>> pd.Categorical(list('abcaa')).get_dummies(drop_first=True) b c 0 0 0 1 1 0 From faeec4170e0de9fdb344d905c0d55ec45bea1905 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 14:06:11 +0100 Subject: [PATCH 35/47] isort test_datetime --- pandas/tests/series/indexing/test_datetime.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 19d5dfa3b3900..088f8681feb99 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -11,7 +11,6 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas._testing as tm - """ Also test support for datetime64[ns] in Series / DataFrame """ From e11f28e0f2b6c52c29f7121600e527d64a6bd856 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 14:06:21 +0100 Subject: [PATCH 36/47] use get_dummies instead of _get_dummies_1d --- pandas/core/arrays/categorical.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cf8b6c00e5a53..aaf34006a9657 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -537,9 +537,11 @@ def get_dummies( 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - from pandas.core.reshape.reshape import _get_dummies_1d + # Would be better to use pandas.core.reshape.reshape._get_dummies_1d + # but that's internal and fails lints + from pandas import get_dummies - return _get_dummies_1d( + return get_dummies( self, prefix=prefix, prefix_sep=prefix_sep, From 742c940dd87c5979eadee429e2036a293177760b Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 14:34:48 +0100 Subject: [PATCH 37/47] Reference get_dummies/ from_dummies in reshaping docs --- doc/source/user_guide/reshaping.rst | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 1b90aeb00cf9c..a666bbd885baf 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -606,7 +606,7 @@ This function is often used along with discretization functions like ``cut``: pd.get_dummies(pd.cut(values, bins)) -See also :func:`Series.str.get_dummies `. +See also :func:`Series.str.get_dummies ` and :func:`Categorical.get_dummies `. :func:`get_dummies` also accepts a ``DataFrame``. By default all categorical variables (categorical in the statistical sense, those with `object` or @@ -679,6 +679,15 @@ To choose another dtype, use the ``dtype`` argument: pd.get_dummies(df, dtype=bool).dtypes +A :class:`~pandas.Categorical` can be recovered from a :class:`~pandas.DataFrame` of such dummy variables using :meth:`~pandas.Categorical.from_dummies`. +Use the ``prefix`` and ``prefix_sep`` arguments to select and rename columns which have had a prefix applied in the same way as :class:`~pandas.get_dummies` does. + +.. ipython:: python + + df = pd.get_dummies(list("abca")) + + pd.Categorical.from_dummies(df) + .. _reshaping.factorize: From 722137d374639fbd5fc14c1b12b39b92a74f1640 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 14:35:05 +0100 Subject: [PATCH 38/47] use prefix in from_dummies --- pandas/core/arrays/categorical.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index aaf34006a9657..6546b07b3c834 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -375,10 +375,17 @@ def __init__( @classmethod def from_dummies( - cls, dummies: "DataFrame", ordered: Optional[bool] = None + cls, + dummies: "DataFrame", + ordered: Optional[bool] = None, + prefix=None, + prefix_sep="_", ) -> "Categorical": """Create a `Categorical` using a ``DataFrame`` of dummy variables. + Can use a subset of columns based on the ``prefix`` + and ``prefix_sep`` parameters. + The ``DataFrame`` must have no more than one truthy value per row. The columns of the ``DataFrame`` become the categories of the `Categorical`. A column whose header is NA will be dropped: @@ -391,6 +398,13 @@ def from_dummies( Sparse dataframes are not supported. ordered : bool Whether or not this Categorical is ordered. + prefix : optional str + Only take columns whose names are strings starting + with this prefix and ``prefix_sep``, + stripping those elements from the resulting category names. + prefix_sep : str, default "_" + If ``prefix`` is not ``None``, use as the separator + between the prefix and the final name of the category. Raises ------ @@ -433,6 +447,17 @@ def from_dummies( to_drop = dummies.columns[isna(dummies.columns.values)] if len(to_drop): dummies = dummies.drop(columns=to_drop) + + if prefix is not None: + pref = prefix + (prefix_sep or "") + name_map = dict() + to_keep = [] + for c in dummies.columns: + if isinstance(c, str) and c.startswith(pref): + to_keep.append(c) + name_map[c] = c[len(pref) :] + dummies = dummies[to_keep].rename(columns=name_map) + df = dummies.astype("boolean") multicat_rows = df.sum(axis=1, skipna=False) > 1 From 4945ba8722d3e7ea365253cd43172538c17a73ba Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 14:53:47 +0100 Subject: [PATCH 39/47] document prefix handling in categorical.rst --- doc/source/user_guide/categorical.rst | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 892c20bc64c51..c8bdc2394ddf9 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -136,6 +136,16 @@ with each row having False in all but one column (True). These are called `dummy variables `_, or one-hot encoding. :class:`pandas.Categorical` objects can easily be converted to and from such an encoding. +:meth:`pandas.Categorical.get_dummies` produces a dataframe of dummy variables. +It works in the same way and supports most of the same arguments as :func:`pandas.get_dummies`. + +.. ipython:: python + + cat = pd.Categorical(["a", "b", "b", "c"]) + cat + + cat.get_dummies() + The :meth:`pandas.Categorical.from_dummies` class method accepts a dataframe whose dtypes are coercible to boolean, and an ``ordered`` argument for whether the resulting ``Categorical`` should be considered ordered @@ -143,19 +153,16 @@ for whether the resulting ``Categorical`` should be considered ordered A column with a NA index will be ignored. Any row which is entirely falsey, or has a missing value, will be uncategorised. - -:meth:`pandas.Categorical.get_dummies` produces a dataframe of dummy variables. -It works in the same way and supports most of the same arguments as :func:`pandas.get_dummies`. +In the same way that :func:`pandas.get_dummies` can add a prefix to string category names, +:meth:`~pandas.Categorical.from_dummies` can filter a dataframe for columns with a prefix: +the resulting ``Categorical`` will have the prefix stripped from its categories. .. ipython:: python - cat = pd.Categorical(["a", "b", "b", "c"]) - cat - - dummies = cat.get_dummies() + dummies = pd.get_dummies(["a", "b", "b", "c"], prefix="cat") dummies - pd.Categorical.from_dummies(dummies) + pd.Categorical.from_dummies(dummies, prefix="cat") .. versionadded:: 1.2.0 From 1f98233ba0dac0b6558ddc20cfd281a16f853165 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 16:12:06 +0100 Subject: [PATCH 40/47] Lower-memory impl for Categorical.from_dummies --- pandas/core/arrays/categorical.py | 39 +++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6546b07b3c834..cd0ccc2cdce76 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -380,6 +380,7 @@ def from_dummies( ordered: Optional[bool] = None, prefix=None, prefix_sep="_", + fillna=None, ) -> "Categorical": """Create a `Categorical` using a ``DataFrame`` of dummy variables. @@ -405,6 +406,9 @@ def from_dummies( prefix_sep : str, default "_" If ``prefix`` is not ``None``, use as the separator between the prefix and the final name of the category. + fillna : optional bool, default None + How to handle NA values. If ``True`` or ``False``, NA is filled with that value. + If ``None``, raise a ValueError if there are any NA values. Raises ------ @@ -444,23 +448,35 @@ def from_dummies( ... ValueError: 1 record(s) belongs to multiple categories: [0] """ + from pandas import Series + + copied = False to_drop = dummies.columns[isna(dummies.columns.values)] if len(to_drop): dummies = dummies.drop(columns=to_drop) + copied = True - if prefix is not None: + if prefix is None: + cats = dummies.columns + else: pref = prefix + (prefix_sep or "") - name_map = dict() + cats = [] to_keep = [] for c in dummies.columns: if isinstance(c, str) and c.startswith(pref): to_keep.append(c) - name_map[c] = c[len(pref) :] - dummies = dummies[to_keep].rename(columns=name_map) + cats.append(c[len(pref) :]) + dummies = dummies[to_keep] df = dummies.astype("boolean") + if fillna is not None: + df = df.fillna(fillna, inplace=copied) - multicat_rows = df.sum(axis=1, skipna=False) > 1 + row_totals = df.sum(axis=1, skipna=False) + if row_totals.isna().any(): + raise ValueError("Unhandled NA values in dummy array") + + multicat_rows = row_totals > 1 if multicat_rows.any(): raise ValueError( "{} record(s) belongs to multiple categories: {}".format( @@ -468,13 +484,12 @@ def from_dummies( ) ) - mult_by = np.arange(df.shape[1]) + 1 - # 000 000 0 -1 - # 010 020 2 1 - # 001 * 1,2,3 => 003 -> 3 -> 2 = correct codes - # 100 100 1 0 - codes = ((df * mult_by).sum(axis=1, skipna=False) - 1).astype("Int64") - return cls.from_codes(codes.fillna(-1), df.columns.values, ordered=ordered) + codes = Series(np.full(len(row_totals), np.nan), index=df.index, dtype="Int64") + codes[row_totals == 0] = -1 + row_idx, code = np.nonzero(df) + codes[row_idx] = code + + return cls.from_codes(codes.fillna(-1), cats, ordered=ordered) def get_dummies( self, From ff01048275e900a07e3e73fba4d3cd2ebfd1032b Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 22 Sep 2020 15:57:24 +0100 Subject: [PATCH 41/47] remove comment about use of _get_dummies_1d --- pandas/core/arrays/categorical.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cd0ccc2cdce76..ba4c8a9552b44 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -577,8 +577,6 @@ def get_dummies( 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - # Would be better to use pandas.core.reshape.reshape._get_dummies_1d - # but that's internal and fails lints from pandas import get_dummies return get_dummies( From 604b8397b54baea3f8b1dec1be0eb4a397dfc1f8 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 22 Sep 2020 16:02:40 +0100 Subject: [PATCH 42/47] type-annotate get/from_dummies --- pandas/core/arrays/categorical.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ba4c8a9552b44..6002dc83c780e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -378,9 +378,9 @@ def from_dummies( cls, dummies: "DataFrame", ordered: Optional[bool] = None, - prefix=None, - prefix_sep="_", - fillna=None, + prefix: Optional[str] = None, + prefix_sep: str = "_", + fillna: Optional[bool] = None, ) -> "Categorical": """Create a `Categorical` using a ``DataFrame`` of dummy variables. @@ -493,12 +493,12 @@ def from_dummies( def get_dummies( self, - prefix=None, - prefix_sep="_", - dummy_na=False, - sparse=False, - drop_first=False, - dtype=None, + prefix: Optional[str] = None, + prefix_sep: str = "_", + dummy_na: bool = False, + sparse: bool = False, + drop_first: bool = False, + dtype: Dtype = None, ) -> "DataFrame": """ Convert into dummy/indicator variables. From c71e8076f65fd092dc17edef9c28dedcab17dd26 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 22 Sep 2020 16:03:16 +0100 Subject: [PATCH 43/47] split overlong line --- pandas/core/arrays/categorical.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6002dc83c780e..b22f48cdda78a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -407,7 +407,8 @@ def from_dummies( If ``prefix`` is not ``None``, use as the separator between the prefix and the final name of the category. fillna : optional bool, default None - How to handle NA values. If ``True`` or ``False``, NA is filled with that value. + How to handle NA values. + If ``True`` or ``False``, NA is filled with that value. If ``None``, raise a ValueError if there are any NA values. Raises From 6f9272a3722577da8df11e940bab8034b05dec86 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 22 Sep 2020 16:07:42 +0100 Subject: [PATCH 44/47] blacken --- pandas/core/arrays/categorical.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b22f48cdda78a..676c10eed6f02 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -451,11 +451,9 @@ def from_dummies( """ from pandas import Series - copied = False to_drop = dummies.columns[isna(dummies.columns.values)] if len(to_drop): dummies = dummies.drop(columns=to_drop) - copied = True if prefix is None: cats = dummies.columns @@ -471,7 +469,7 @@ def from_dummies( df = dummies.astype("boolean") if fillna is not None: - df = df.fillna(fillna, inplace=copied) + df = df.fillna(fillna) row_totals = df.sum(axis=1, skipna=False) if row_totals.isna().any(): @@ -481,7 +479,8 @@ def from_dummies( if multicat_rows.any(): raise ValueError( "{} record(s) belongs to multiple categories: {}".format( - multicat_rows.sum(), list(df.index[multicat_rows]), + multicat_rows.sum(), + list(df.index[multicat_rows]), ) ) From 8fd4b721f3ed5f5c96d4dc07327ca5a26fd93340 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 22 Sep 2020 16:16:31 +0100 Subject: [PATCH 45/47] use f-strings --- pandas/core/arrays/categorical.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 676c10eed6f02..e65bd6e8d9f92 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -478,10 +478,8 @@ def from_dummies( multicat_rows = row_totals > 1 if multicat_rows.any(): raise ValueError( - "{} record(s) belongs to multiple categories: {}".format( - multicat_rows.sum(), - list(df.index[multicat_rows]), - ) + f"{multicat_rows.sum()} record(s) belongs to multiple categories: " + f"{list(df.index[multicat_rows])}" ) codes = Series(np.full(len(row_totals), np.nan), index=df.index, dtype="Int64") From 534bc332ebfbf42047609b98c3faaaee59508bca Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 22 Sep 2020 16:18:01 +0100 Subject: [PATCH 46/47] add some typing --- pandas/core/arrays/categorical.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e65bd6e8d9f92..d018968274b75 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2,7 +2,7 @@ from functools import partial import operator from shutil import get_terminal_size -from typing import TYPE_CHECKING, Dict, Hashable, List, Optional, Type, Union, cast +from typing import TYPE_CHECKING, Any, Dict, Hashable, List, Optional, Type, Union, cast from warnings import warn import numpy as np @@ -455,12 +455,13 @@ def from_dummies( if len(to_drop): dummies = dummies.drop(columns=to_drop) + cats: List[Any] if prefix is None: - cats = dummies.columns + cats = list(dummies.columns) else: pref = prefix + (prefix_sep or "") cats = [] - to_keep = [] + to_keep: List[str] = [] for c in dummies.columns: if isinstance(c, str) and c.startswith(pref): to_keep.append(c) From 0facec6e6b01a89ff8573ad25b1575baeb4f9225 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 22 Sep 2020 16:22:00 +0100 Subject: [PATCH 47/47] remove unnecessary .values --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d018968274b75..224e336fae9dd 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -451,7 +451,7 @@ def from_dummies( """ from pandas import Series - to_drop = dummies.columns[isna(dummies.columns.values)] + to_drop = dummies.columns[isna(dummies.columns)] if len(to_drop): dummies = dummies.drop(columns=to_drop)