pandas-dev · clbarnes · May 27, 2020 · May 28, 2020 · May 28, 2020 · May 28, 2020
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
@@ -127,6 +127,45 @@ This conversion is likewise done column by column:
     df_cat['A']
     df_cat['B']
 
+Dummy / indicator / one-hot encoded variables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some operations, like regression and classification,
+require a single categorical variable to be encoded as one column per category,
+with each row being True in exactly one column and False in all the others.
+Such columns are called `dummy variables <https://en.wikipedia.org/wiki/Dummy_variable_(statistics)>`_, and the scheme is known as one-hot encoding.
+:class:`pandas.Categorical` objects can easily be converted to and from such an encoding.
+
+:meth:`pandas.Categorical.get_dummies` produces a dataframe of dummy variables.
+It works in the same way and supports most of the same arguments as :func:`pandas.get_dummies`.
+
+.. ipython:: python
+
+    cat = pd.Categorical(["a", "b", "b", "c"])
+    cat
+
+    cat.get_dummies()
+
+The :meth:`pandas.Categorical.from_dummies` class method accepts a dataframe
+whose dtypes are coercible to boolean, and an ``ordered`` argument
+for whether the resulting ``Categorical`` should be considered ordered
+(like the ``Categorical`` constructor).
+A column whose label is NA will be ignored.
+Any row which is entirely falsey will be uncategorised; missing values raise
+a ``ValueError`` unless the ``fillna`` argument says how to fill them.
+In the same way that :func:`pandas.get_dummies` can add a prefix to string category names,
+:meth:`~pandas.Categorical.from_dummies` can filter a dataframe for columns with a prefix:
+the resulting ``Categorical`` will have the prefix stripped from its categories.
+
+.. ipython:: python
+
+    dummies = pd.get_dummies(["a", "b", "b", "c"], prefix="cat")
+    dummies
+
+    pd.Categorical.from_dummies(dummies, prefix="cat")
+
+
+.. versionadded:: 1.2.0
 
 Controlling behavior
 ~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
@@ -606,7 +606,7 @@ This function is often used along with discretization functions like ``cut``:
 
    pd.get_dummies(pd.cut(values, bins))
 
-See also :func:`Series.str.get_dummies <pandas.Series.str.get_dummies>`.
+See also :func:`Series.str.get_dummies <pandas.Series.str.get_dummies>` and :meth:`Categorical.get_dummies <pandas.Categorical.get_dummies>`.
 
 :func:`get_dummies` also accepts a ``DataFrame``. By default all categorical
 variables (categorical in the statistical sense, those with `object` or
@@ -679,6 +679,15 @@ To choose another dtype, use the ``dtype`` argument:
 
     pd.get_dummies(df, dtype=bool).dtypes
 
+A :class:`~pandas.Categorical` can be recovered from a :class:`~pandas.DataFrame` of such dummy variables using :meth:`~pandas.Categorical.from_dummies`.
+Use the ``prefix`` and ``prefix_sep`` arguments to select and rename columns which have had a prefix applied in the same way as :func:`~pandas.get_dummies` does.
+
+.. ipython:: python
+
+    df = pd.get_dummies(list("abca"))
+
+    pd.Categorical.from_dummies(df)
+
 
 .. _reshaping.factorize:
 

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -120,6 +120,7 @@ Other enhancements
 - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
 - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
 - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
+- :meth:`Categorical.from_dummies` and :meth:`Categorical.get_dummies` convert between :class:`Categorical` and :class:`DataFrame` objects of dummy variables (:issue:`8745`)
 
 .. _whatsnew_120.api_breaking.python:
 

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2,7 +2,7 @@
 from functools import partial
 import operator
 from shutil import get_terminal_size
-from typing import Dict, Hashable, List, Type, Union, cast
+from typing import TYPE_CHECKING, Any, Dict, Hashable, List, Optional, Type, Union, cast
 from warnings import warn
 
 import numpy as np
@@ -55,6 +55,9 @@
 
 from pandas.io.formats import console
 
+if TYPE_CHECKING:
+    from pandas._typing import DataFrame  # noqa: F401
+
 
 def _cat_compare_op(op):
     opname = f"__{op.__name__}__"
@@ -370,6 +373,221 @@ def __init__(
         self._dtype = self._dtype.update_dtype(dtype)
         self._codes = coerce_indexer_dtype(codes, dtype.categories)
 
+    @classmethod
+    def from_dummies(
+        cls,
+        dummies: "DataFrame",
+        ordered: Optional[bool] = None,
+        prefix: Optional[str] = None,
+        prefix_sep: str = "_",
+        fillna: Optional[bool] = None,
+    ) -> "Categorical":
+        """Create a `Categorical` using a ``DataFrame`` of dummy variables.
+
+        Can use a subset of columns based on the ``prefix``
+        and ``prefix_sep`` parameters.
+
+        The ``DataFrame`` must have no more than one truthy value per row.
+        The columns of the ``DataFrame`` become the categories of the `Categorical`.
+        A column whose header is NA will be dropped:
+        any row containing a NA value will be uncategorised.
+
+        Parameters
+        ----------
+        dummies : DataFrame
+            dtypes of columns with non-NA headers must be coercible to bool.
+            Sparse dataframes are not supported.
+        ordered : bool
+            Whether or not this Categorical is ordered.
+        prefix : optional str
+            Only take columns whose names are strings starting
+            with this prefix and ``prefix_sep``,
+            stripping those elements from the resulting category names.
+        prefix_sep : str, default "_"
+            If ``prefix`` is not ``None``, use as the separator
+            between the prefix and the final name of the category.
+        fillna : optional bool, default None
+            How to handle NA values.
+            If ``True`` or ``False``, NA is filled with that value.
+            If ``None``, raise a ValueError if there are any NA values.
+
+        Raises
+        ------
+        ValueError
+            If a sample belongs to >1 category
+
+        Returns
+        -------
+        Categorical
+
+        Examples
+        --------
+        >>> simple = pd.DataFrame(np.eye(3), columns=["a", "b", "c"])
+        >>> Categorical.from_dummies(simple)
+        [a, b, c]
+        Categories (3, object): [a, b, c]
+
+        >>> nan_col = pd.DataFrame(np.eye(4), columns=["a", "b", np.nan, None])
+        >>> Categorical.from_dummies(nan_col)
+        [a, b, NaN, NaN]
+        Categories (2, object): [a, b]
+
+        >>> nan_cell = pd.DataFrame(
+        ...     [[1, 0, np.nan], [0, 1, 0], [0, 0, 1]],
+        ...     columns=["a", "b", "c"],
+        ... )
+        >>> Categorical.from_dummies(nan_cell)
+        [NaN, b, c]
+        Categories (3, object): [a, b, c]
+
+        >>> multi = pd.DataFrame(
+        ...     [[1, 0, 1], [0, 1, 0], [0, 0, 1]],
+        ...     columns=["a", "b", "c"],
+        ... )
+        >>> Categorical.from_dummies(multi)
+        Traceback (most recent call last):
+            ...
+        ValueError: 1 record(s) belongs to multiple categories: [0]
+        """
+        from pandas import Series
+
+        to_drop = dummies.columns[isna(dummies.columns)]
+        if len(to_drop):
+            dummies = dummies.drop(columns=to_drop)
+
+        cats: List[Any]
+        if prefix is None:
+            cats = list(dummies.columns)
+        else:
+            pref = prefix + (prefix_sep or "")
+            cats = []
+            to_keep: List[str] = []
+            for c in dummies.columns:
+                if isinstance(c, str) and c.startswith(pref):
+                    to_keep.append(c)
+                    cats.append(c[len(pref) :])
+            dummies = dummies[to_keep]
+
+        df = dummies.astype("boolean")
+        if fillna is not None:
+            df = df.fillna(fillna)
+
+        row_totals = df.sum(axis=1, skipna=False)
+        if row_totals.isna().any():
+            raise ValueError("Unhandled NA values in dummy array")
+
+        multicat_rows = row_totals > 1
+        if multicat_rows.any():
+            raise ValueError(
+                f"{multicat_rows.sum()} record(s) belongs to multiple categories: "
+                f"{list(df.index[multicat_rows])}"
+            )
+
+        codes = Series(np.full(len(row_totals), np.nan), index=df.index, dtype="Int64")
+        codes[row_totals == 0] = -1
+        row_idx, code = np.nonzero(df)
+        codes[row_idx] = code
+
+        return cls.from_codes(codes.fillna(-1), cats, ordered=ordered)
+
+    def get_dummies(
+        self,
+        prefix: Optional[str] = None,
+        prefix_sep: str = "_",
+        dummy_na: bool = False,
+        sparse: bool = False,
+        drop_first: bool = False,
+        dtype: Dtype = None,
+    ) -> "DataFrame":
+        """
+        Convert into dummy/indicator variables.
+
+        Parameters
+        ----------
+        prefix : str, default None
+            String to append DataFrame column names.
+        prefix_sep : str, default '_'
+            If appending prefix, separator/delimiter to use.
+        dummy_na : bool, default False
+            Add a column to indicate NaNs, if False NaNs are ignored.
+        sparse : bool, default False
+            Whether the dummy-encoded columns should be backed by
+            a :class:`SparseArray` (True) or a regular NumPy array (False).
+        drop_first : bool, default False
+            Whether to get k-1 dummies out of k categorical levels by removing the
+            first level.
+        dtype : dtype, default np.uint8
+            Data type for new columns. Only a single dtype is allowed.
+
+        Returns
+        -------
+        DataFrame
+            Dummy-coded data.
+
+        See Also
+        --------
+        Series.str.get_dummies : Convert Series to dummy codes.
+        pandas.get_dummies : Convert categorical variable to dummy/indicator variables.
+
+        Examples
+        --------
+        >>> s = pd.Categorical(list('abca'))
+
+        >>> s.get_dummies()
+        a  b  c
+        0  1  0  0
+        1  0  1  0
+        2  0  0  1
+        3  1  0  0
+
+        >>> s1 = pd.Categorical(['a', 'b', np.nan])
+
+        >>> s1.get_dummies()
+        a  b
+        0  1  0
+        1  0  1
+        2  0  0
+
+        >>> s1.get_dummies(dummy_na=True)
+        a  b  NaN
+        0  1  0    0
+        1  0  1    0
+        2  0  0    1
+
+        >>> pd.Categorical(list('abcaa')).get_dummies()
+        a  b  c
+        0  1  0  0
+        1  0  1  0
+        2  0  0  1
+        3  1  0  0
+        4  1  0  0
+
+        >>> pd.Categorical(list('abcaa')).get_dummies(drop_first=True)
+        b  c
+        0  0  0
+        1  1  0
+        2  0  1
+        3  0  0
+        4  0  0
+
+        >>> pd.Categorical(list('abc')).get_dummies(dtype=float)
+            a    b    c
+        0  1.0  0.0  0.0
+        1  0.0  1.0  0.0
+        2  0.0  0.0  1.0
+        """
+        from pandas import get_dummies
+
+        return get_dummies(
+            self,
+            prefix=prefix,
+            prefix_sep=prefix_sep,
+            dummy_na=dummy_na,
+            sparse=sparse,
+            drop_first=drop_first,
+            dtype=dtype,
+        )
+
     @property
     def dtype(self) -> CategoricalDtype:
         """

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -768,6 +768,7 @@ def get_dummies(
     See Also
     --------
     Series.str.get_dummies : Convert Series to dummy codes.
+    Categorical.get_dummies : Convert a Categorical array to dummy codes.
 
     Examples
     --------

diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 
-from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
+from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series, get_dummies
 import pandas._testing as tm
 from pandas.core.arrays.categorical import recode_for_categories
 from pandas.tests.arrays.categorical.common import TestCategorical
@@ -399,6 +399,36 @@ def test_remove_unused_categories(self):
         out = cat.remove_unused_categories()
         assert out.tolist() == val.tolist()
 
+    @pytest.mark.parametrize(
+        "vals",
+        [
+            ["a", "b", "b", "a"],
+            ["a", "b", "b", "a", np.nan],
+            [1, 1.5, "a", (1, "b")],
+            [1, 1.5, "a", (1, "b"), np.nan],
+        ],
+    )
+    def test_get_dummies(self, vals):
+        # GH 8745
+        cats = Categorical(Series(vals))
+        tm.assert_equal(cats.get_dummies(), get_dummies(cats))
+
+    @pytest.mark.parametrize(
+        "vals",
+        [
+            ["a", "b", "b", "a"],
+            ["a", "b", "b", "a", np.nan],
+            [1, 1.5, "a", (1, "b")],
+            [1, 1.5, "a", (1, "b"), np.nan],
+        ],
+    )
+    def test_dummies_roundtrip(self, vals):
+        # GH 8745
+        cats = Categorical(Series(vals))
+        dummies = cats.get_dummies()
+        cats2 = Categorical.from_dummies(dummies)
+        tm.assert_equal(cats, cats2)
+
 
 class TestCategoricalAPIWithFactor(TestCategorical):
     def test_describe(self):