From 7c83e55792ce29a4c1b4317776271002771961c5 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 11 Jan 2019 12:04:34 -0800 Subject: [PATCH 1/5] BUG-24212 fix usage of Index.take in pd.merge --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/reshape/merge.py | 14 ++++++++++++++ pandas/tests/reshape/merge/test_merge.py | 10 ++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 960b205c49c61..2d548e3696e9a 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1815,6 +1815,7 @@ Reshaping - Bug in :func:`DataFrame.unstack` where a ``ValueError`` was raised when unstacking timezone aware values (:issue:`18338`) - Bug in :func:`DataFrame.stack` where timezone aware values were converted to timezone naive values (:issue:`19420`) - Bug in :func:`merge_asof` where a ``TypeError`` was raised when ``by_col`` were timezone aware values (:issue:`21184`) +- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7861a122afdb6..6a3a0e4f6ca31 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -758,12 +758,26 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: join_index = self.left.index.take(left_indexer) + if (self.how == 'right' and -1 in left_indexer + and not isinstance(self.right.index, MultiIndex)): + join_list = join_index.to_numpy() + absent = left_indexer == -1 + join_list[absent] = self.right.index.to_numpy()[absent] + join_index = Index(join_list, dtype=join_index.dtype, + name=join_index.name) else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: join_index = self.right.index.take(right_indexer) + if (self.how == 'left' and -1 in right_indexer + and not isinstance(self.left.index, MultiIndex)): + join_list = join_index.to_numpy() + absent = right_indexer == -1 + join_list[absent] = self.left.index.to_numpy()[absent] + join_index = Index(join_list, dtype=join_index.dtype, + name=join_index.name) else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 2080fc542bc61..3408161641831 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1102,6 +1102,16 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): with pytest.raises(ValueError, match=msg): pd.merge(df2, df1, on=['A']) + def test_merge_on_index_with_more_values(self): # GH 24212 + df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], + columns=['a', 'b']) + df2 = pd.DataFrame([[3, 30], [4, 40]], + columns=['a', 'c']) + df1.set_index('a', drop=False, inplace=True) + df2.set_index('a', inplace=True) + result = pd.merge(df1, df2, left_index=True, right_on='a', how='left') + assert 1 in result.index + @pytest.fixture def left(): From 977704157a243deaa360b5b58961fb4f100779cc Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 11 Jan 2019 13:57:05 -0800 Subject: [PATCH 2/5] BUG-24212 add comment --- pandas/core/reshape/merge.py | 4 ++++ pandas/tests/reshape/merge/test_merge.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6a3a0e4f6ca31..4886fc3a71d06 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -760,6 +760,8 @@ def _get_join_info(self): join_index = self.left.index.take(left_indexer) if (self.how == 'right' and -1 in left_indexer and not isinstance(self.right.index, MultiIndex)): + # if values missing (-1) from left index, + # take from right index instead join_list = join_index.to_numpy() absent = left_indexer == -1 join_list[absent] = self.right.index.to_numpy()[absent] @@ -773,6 +775,8 @@ def _get_join_info(self): join_index = self.right.index.take(right_indexer) if (self.how == 'left' and -1 in right_indexer and not isinstance(self.left.index, MultiIndex)): + # if values missing (-1) from right index, + # take from left index instead join_list = join_index.to_numpy() absent = right_indexer == -1 join_list[absent] = self.left.index.to_numpy()[absent] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 3408161641831..95135443a0230 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1102,7 +1102,8 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): with pytest.raises(ValueError, match=msg): pd.merge(df2, df1, on=['A']) - def test_merge_on_index_with_more_values(self): # GH 24212 + def test_merge_on_index_with_more_values(self): + # GH 24212 df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], columns=['a', 'b']) df2 = pd.DataFrame([[3, 30], [4, 40]], From d81b596deb920ea35e854de004e68a468c6e07ee Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 11 Jan 2019 18:45:26 -0800 Subject: [PATCH 3/5] BUG-24212 clarify test --- pandas/core/reshape/merge.py | 1 + pandas/tests/reshape/merge/test_merge.py | 30 +++++++++++++++--------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4886fc3a71d06..ed857fa520b82 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -777,6 +777,7 @@ def _get_join_info(self): and not isinstance(self.left.index, MultiIndex)): # if values missing (-1) from right index, # take from left index instead + print(right_indexer) join_list = join_index.to_numpy() absent = right_indexer == -1 join_list[absent] = self.left.index.to_numpy()[absent] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 95135443a0230..61ed442b14368 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -908,6 +908,25 @@ def test_merge_two_empty_df_no_division_error(self): with np.errstate(divide='raise'): merge(a, a, on=('a', 'b')) + def test_merge_on_index_with_more_values(self): + # GH 24212 + # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is + # interpreted as a missing value instead of the last element + df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], + columns=['a', 'b']) + df2 = pd.DataFrame([[3, 30], [4, 40]], + columns=['a', 'c']) + df1.set_index('a', drop=False, inplace=True) + df2.set_index('a', inplace=True) + result = pd.merge(df1, df2, left_index=True, right_on='a', how='left') + expected = pd.DataFrame([[1, 2, np.nan], + [2, 4, np.nan], + [3, 6, 30.0], + [4, 8, 40.0]], + columns=['a', 'b', 'c']) + expected.set_index('a', drop=False, inplace=True) + assert_frame_equal(result, expected) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: @@ -1102,17 +1121,6 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): with pytest.raises(ValueError, match=msg): pd.merge(df2, df1, on=['A']) - def test_merge_on_index_with_more_values(self): - # GH 24212 - df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], - columns=['a', 'b']) - df2 = pd.DataFrame([[3, 30], [4, 40]], - columns=['a', 'c']) - df1.set_index('a', drop=False, inplace=True) - df2.set_index('a', inplace=True) - result = pd.merge(df1, df2, left_index=True, right_on='a', how='left') - assert 1 in result.index - @pytest.fixture def left(): From d72427a370f2e52045b13a4b9526daa02d51a3fa Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 14 Jan 2019 15:37:45 -0800 Subject: [PATCH 4/5] BUG-24212 make _create_join_index function --- pandas/core/reshape/merge.py | 47 ++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ed857fa520b82..77c38ee7d7338 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,32 +757,15 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - join_index = self.left.index.take(left_indexer) - if (self.how == 'right' and -1 in left_indexer - and not isinstance(self.right.index, MultiIndex)): - # if values missing (-1) from left index, - # take from right index instead - join_list = join_index.to_numpy() - absent = left_indexer == -1 - join_list[absent] = self.right.index.to_numpy()[absent] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) + join_index = self._create_join_index(left_indexer, + using_left=True) else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self.right.index.take(right_indexer) - if (self.how == 'left' and -1 in right_indexer - and not isinstance(self.left.index, MultiIndex)): - # if values missing (-1) from right index, - # take from left index instead - print(right_indexer) - join_list = join_index.to_numpy() - absent = right_indexer == -1 - join_list[absent] = self.left.index.to_numpy()[absent] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) + join_index = self._create_join_index(right_indexer, + using_left=False) else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -793,6 +776,28 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer + def _create_join_index(self, indexer, using_left=True): + if using_left: + index = self.left.index + other_index = self.right.index + how_check = 'right' + else: + index = self.right.index + other_index = self.left.index + how_check = 'left' + + join_index = index.take(indexer) + if self.how == how_check and not isinstance(other_index, MultiIndex): + absent = indexer == -1 + if any(absent): + # if values missing (-1) from target index, + # take from other index instead + join_list = join_index.to_numpy() + join_list[absent] = other_index.to_numpy()[absent] + join_index = Index(join_list, dtype=join_index.dtype, + name=join_index.name) + return join_index + def _get_merge_keys(self): """ Note: has side effects (copy/delete key columns) From 9e49ff6ed15f2b1bc20406c7b6e926f65a644cbd Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 16 Jan 2019 16:16:16 -0800 Subject: [PATCH 5/5] BUG-24212 add docstring and comments --- pandas/core/reshape/merge.py | 49 +++++++++++++++--------- pandas/tests/reshape/merge/test_merge.py | 5 ++- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 77c38ee7d7338..167bc246fe108 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,15 +757,19 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - join_index = self._create_join_index(left_indexer, - using_left=True) + join_index = self._create_join_index(self.left.index, + self.right.index, + left_indexer, + how='right') else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self._create_join_index(right_indexer, - using_left=False) + join_index = self._create_join_index(self.right.index, + self.left.index, + right_indexer, + how='left') else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -776,24 +780,33 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer - def _create_join_index(self, indexer, using_left=True): - if using_left: - index = self.left.index - other_index = self.right.index - how_check = 'right' - else: - index = self.right.index - other_index = self.left.index - how_check = 'left' + def _create_join_index(self, index, other_index, indexer, how='left'): + """ + Create a join index by rearranging one index to match another + + Parameters + ---------- + index: Index being rearranged + other_index: Index used to supply values not found in index + indexer: how to rearrange index + how: replacement is only necessary if indexer based on other_index + Returns + ------- + join_index + """ join_index = index.take(indexer) - if self.how == how_check and not isinstance(other_index, MultiIndex): - absent = indexer == -1 - if any(absent): + if (self.how in (how, 'outer') and + not isinstance(other_index, MultiIndex)): + # if final index requires values in other_index but not target + # index, indexer may hold missing (-1) values, causing Index.take + # to take the final value in target index + mask = indexer == -1 + if np.any(mask): # if values missing (-1) from target index, - # take from other index instead + # take from other_index instead join_list = join_index.to_numpy() - join_list[absent] = other_index.to_numpy()[absent] + join_list[mask] = other_index.to_numpy()[mask] join_index = Index(join_list, dtype=join_index.dtype, name=join_index.name) return join_index diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 61ed442b14368..b852215fad3a0 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -908,7 +908,8 @@ def test_merge_two_empty_df_no_division_error(self): with np.errstate(divide='raise'): merge(a, a, on=('a', 'b')) - def test_merge_on_index_with_more_values(self): + @pytest.mark.parametrize('how', ['left', 'outer']) + def test_merge_on_index_with_more_values(self, how): # GH 24212 # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is # interpreted as a missing value instead of the last element @@ -918,7 +919,7 @@ def test_merge_on_index_with_more_values(self): columns=['a', 'c']) df1.set_index('a', drop=False, inplace=True) df2.set_index('a', inplace=True) - result = pd.merge(df1, df2, left_index=True, right_on='a', how='left') + result = pd.merge(df1, df2, left_index=True, right_on='a', how=how) expected = pd.DataFrame([[1, 2, np.nan], [2, 4, np.nan], [3, 6, 30.0],