From d8a2e7483e1c60f1ba11441ecfad2ac8f4c95f5f Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Thu, 26 Nov 2020 01:31:46 +0100 Subject: [PATCH] ENH: Implement cross method for Merge Operations (#37864) --- asv_bench/benchmarks/join_merge.py | 6 ++ doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/frame.py | 56 ++++++++++- pandas/core/reshape/merge.py | 63 +++++++++++- pandas/tests/reshape/merge/test_join.py | 12 +++ .../tests/reshape/merge/test_merge_cross.py | 95 +++++++++++++++++++ 6 files changed, 230 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/reshape/merge/test_merge_cross.py diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 1333b3a0f0560..a572b8a70a680 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort): def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): self.df_shuf.join(self.df_key2, on="key2", sort=sort) + def time_join_dataframes_cross(self, sort): + self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort) + class JoinIndex: def setup(self): @@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, sort): def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, on="key1", sort=sort) + def time_merge_dataframes_cross(self, sort): + merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) + class I8Merge: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0c7cd31a10acb..dec0122e12a98 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -255,6 +255,7 @@ Other enhancements - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) +- Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a5ba803897fc6..bca6e255d7a2b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -205,12 +205,14 @@ The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. Parameters ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -221,6 +223,11 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + + .. versionadded:: 1.2.0 + on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -341,6 +348,44 @@ ... ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 NaN + +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 """ @@ -8083,6 +8128,15 @@ def _join_compat( other = DataFrame({other.name: other}) if isinstance(other, DataFrame): + if how == "cross": + return merge( + self, + other, + how=how, + on=on, + suffixes=(lsuffix, rsuffix), + sort=sort, + ) return merge( self, other, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index cdcd6b19704c4..3b755c40721fb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -5,6 +5,7 @@ import copy import datetime from functools import partial +import hashlib import string from typing import TYPE_CHECKING, Optional, Tuple, cast import warnings @@ -643,6 +644,17 @@ def __init__( self._validate_specification() + cross_col = None + if self.how == "cross": + ( + self.left, + self.right, + self.how, + cross_col, + ) = self._create_cross_configuration(self.left, self.right) + self.left_on = self.right_on = [cross_col] + self._cross = cross_col + # note this function has side effects ( self.left_join_keys, @@ -690,8 +702,14 @@ def get_result(self): self._maybe_restore_index_levels(result) + self._maybe_drop_cross_column(result, self._cross) + return result.__finalize__(self, method="merge") + def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]): + if cross_col is not None: + result.drop(columns=cross_col, inplace=True) + def _indicator_pre_merge( self, left: "DataFrame", right: "DataFrame" ) -> Tuple["DataFrame", "DataFrame"]: @@ -1200,9 +1218,50 @@ def _maybe_coerce_merge_keys(self): typ = rk.categories.dtype if rk_is_cat else object self.right = self.right.assign(**{name: self.right[name].astype(typ)}) + def _create_cross_configuration( + self, left, right + ) -> Tuple["DataFrame", "DataFrame", str, str]: + """ + Creates the configuration to dispatch the cross operation to inner join, + e.g. adding a join column and resetting parameters. Join column is added + to a new object, no inplace modification + + Parameters + ---------- + left: DataFrame + right DataFrame + + Returns + ------- + a tuple (left, right, how, cross_col) representing the adjusted + DataFrames with cross_col, the merge operation set to inner and the column + to join over. + """ + cross_col = f"_cross_{hashlib.md5().hexdigest()}" + how = "inner" + return ( + left.assign(**{cross_col: 1}), + right.assign(**{cross_col: 1}), + how, + cross_col, + ) + def _validate_specification(self): + if self.how == "cross": + if ( + self.left_index + or self.right_index + or self.right_on is not None + or self.left_on is not None + or self.on is not None + ): + raise MergeError( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + return # Hm, any way to make this logic less complicated?? - if self.on is None and self.left_on is None and self.right_on is None: + elif self.on is None and self.left_on is None and self.right_on is None: if self.left_index and self.right_index: self.left_on, self.right_on = (), () @@ -1266,7 +1325,7 @@ def _validate_specification(self): 'of levels in the index of "left"' ) self.left_on = [None] * n - if len(self.right_on) != len(self.left_on): + if self.how != "cross" and len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") def _validate(self, validate: str): diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 7db92eb55fa0b..00ef7a05f5902 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -803,3 +803,15 @@ def test_join_inner_multiindex_deterministic_order(): index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])] +) +def test_join_cross(input_col, output_cols): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({input_col: [3, 4]}) + result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y") + expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py new file mode 100644 index 0000000000000..d6c29ea129027 --- /dev/null +++ b/pandas/tests/reshape/merge/test_merge_cross.py @@ -0,0 +1,95 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.core.reshape.merge import MergeError, merge + + +@pytest.mark.parametrize( + ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])] +) +def test_merge_cross(input_col, output_cols): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({input_col: [3, 4]}) + left_copy = left.copy() + right_copy = right.copy() + result = merge(left, right, how="cross") + expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(left, left_copy) + tm.assert_frame_equal(right, right_copy) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"left_index": True}, + {"right_index": True}, + {"on": "a"}, + {"left_on": "a"}, + {"right_on": "b"}, + ], +) +def test_merge_cross_error_reporting(kwargs): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"b": [3, 4]}) + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + with pytest.raises(MergeError, match=msg): + merge(left, right, how="cross", **kwargs) + + +def test_merge_cross_mixed_dtypes(): + # GH#5401 + left = DataFrame(["a", "b", "c"], columns=["A"]) + right = DataFrame(range(2), columns=["B"]) + result = merge(left, right, how="cross") + expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_cross_more_than_one_column(): + # GH#5401 + left = DataFrame({"A": list("ab"), "B": [2, 1]}) + right = DataFrame({"C": range(2), "D": range(4, 6)}) + result = merge(left, right, how="cross") + expected = DataFrame( + { + "A": ["a", "a", "b", "b"], + "B": [2, 2, 1, 1], + "C": [0, 1, 0, 1], + "D": [4, 5, 4, 5], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_merge_cross_null_values(nulls_fixture): + # GH#5401 + left = DataFrame({"a": [1, nulls_fixture]}) + right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]}) + result = merge(left, right, how="cross") + expected = DataFrame( + { + "a": [1, 1, nulls_fixture, nulls_fixture], + "b": ["a", "b", "a", "b"], + "c": [1.0, 2.0, 1.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_join_cross_error_reporting(): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"a": [3, 4]}) + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + with pytest.raises(MergeError, match=msg): + left.join(right, how="cross", on="a")