From 022e0b7f67f7d883e766cc6e188d1a7e91ff46d4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 30 Jun 2017 22:49:12 +0300 Subject: [PATCH] Bug in pd.merge() when merge/join with multiple categorical columns (#16786) closes #16767 (cherry picked from commit 5e776fb6cf4e7b3ae0f36b480e1f4e5da154b313) --- doc/source/whatsnew/v0.20.3.txt | 5 +++-- pandas/core/reshape/merge.py | 9 +++++---- pandas/tests/reshape/test_merge.py | 23 +++++++++++++++++++++++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index f8916f5464276..0fac6367fd3a5 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -55,8 +55,8 @@ Indexing I/O ^^^ -- Bug in :func:`read_csv`` in which files weren't opened as binary files by the C engine on Windows, causing EOF characters mid-field, which would fail (:issue:`16039`, :issue:`16559`, :issue:`16675`) -- Bug in :func:`read_hdf`` in which reading a ``Series`` saved to an HDF file in 'fixed' format fails when an explicit ``mode='r'`` argument is supplied (:issue:`16583`) +- Bug in :func:`read_csv` in which files weren't opened as binary files by the C engine on Windows, causing EOF characters mid-field, which would fail (:issue:`16039`, :issue:`16559`, :issue:`16675`) +- Bug in :func:`read_hdf` in which reading a ``Series`` saved to an HDF file in 'fixed' format fails when an explicit ``mode='r'`` argument is supplied (:issue:`16583`) Plotting ^^^^^^^^ @@ -79,6 +79,7 @@ Reshaping ^^^^^^^^^ - Bug in joining on a ``MultiIndex`` with a ``category`` dtype for a level (:issue:`16627`). +- Bug in :func:`merge` when merging/joining with multiple categorical columns (:issue:`16767`) Numeric diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c55f4b5bf935f..7c2301e3666d4 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1384,13 +1384,14 @@ def _factorize_keys(lk, rk, sort=True): lk = lk.values rk = rk.values - # if we exactly match in categories, allow us to use codes + # if we exactly match in categories, allow us to factorize on codes if (is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)): - return lk.codes, rk.codes, len(lk.categories) - - if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): + klass = libhashtable.Int64Factorizer + lk = _ensure_int64(lk.codes) + rk = _ensure_int64(rk.codes) + elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): klass = libhashtable.Int64Factorizer lk = _ensure_int64(com._values_from_object(lk)) rk = _ensure_int64(com._values_from_object(rk)) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index d3257243d7a2c..29a6b8a2ae818 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1356,6 +1356,29 @@ def test_dtype_on_merged_different(self, change, how, left, right): index=['X', 'Y', 'Z']) assert_series_equal(result, expected) + def test_self_join_multiple_categories(self): + # GH 16767 + # non-duplicates should work with multiple categories + m = 5 + df = pd.DataFrame({ + 'a': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] * m, + 'b': ['t', 'w', 'x', 'y', 'z'] * 2 * m, + 'c': [letter + for each in ['m', 'n', 'u', 'p', 'o'] + for letter in [each] * 2 * m], + 'd': [letter + for each in ['aa', 'bb', 'cc', 'dd', 'ee', + 'ff', 'gg', 'hh', 'ii', 'jj'] + for letter in [each] * m]}) + + # change them all to categorical variables + df = df.apply(lambda x: x.astype('category')) + + # self-join should equal ourselves + result = pd.merge(df, df, on=list(df.columns)) + + assert_frame_equal(result, df) + @pytest.fixture def left_df():