Skip to content

Commit

Permalink
ENH: Implement cross method for Merge Operations (#37864)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Nov 26, 2020
1 parent 6ef5b4b commit d8a2e74
Show file tree
Hide file tree
Showing 6 changed files with 230 additions and 3 deletions.
6 changes: 6 additions & 0 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort):
def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
self.df_shuf.join(self.df_key2, on="key2", sort=sort)

def time_join_dataframes_cross(self, sort):
self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort)


class JoinIndex:
def setup(self):
Expand Down Expand Up @@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, sort):
def time_merge_dataframe_integer_key(self, sort):
merge(self.df, self.df2, on="key1", sort=sort)

def time_merge_dataframes_cross(self, sort):
merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort)


class I8Merge:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ Other enhancements
- Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`)
- Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
- Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
- Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`)

.. ---------------------------------------------------------------------------
Expand Down
56 changes: 55 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,14 @@
The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.
Parameters
----------%s
right : DataFrame or named Series
Object to merge with.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
Type of merge to be performed.
* left: use only keys from left frame, similar to a SQL left outer join;
Expand All @@ -221,6 +223,11 @@
join; sort keys lexicographically.
* inner: use intersection of keys from both frames, similar to a SQL inner
join; preserve the order of the left keys.
* cross: creates the cartesian product from both frames, preserves the order
of the left keys.
.. versionadded:: 1.2.0
on : label or list
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
Expand Down Expand Up @@ -341,6 +348,44 @@
...
ValueError: columns overlap but no suffix specified:
Index(['value'], dtype='object')
>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1
a b
0 foo 1
1 bar 2
>>> df2
a c
0 foo 3
1 baz 4
>>> df1.merge(df2, how='inner', on='a')
a b c
0 foo 1 3
>>> df1.merge(df2, how='left', on='a')
a b c
0 foo 1 3.0
1 bar 2 NaN
>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
>>> df2 = pd.DataFrame({'right': [7, 8]})
>>> df1
left
0 foo
1 bar
>>> df2
right
0 7
1 8
>>> df1.merge(df2, how='cross')
left right
0 foo 7
1 foo 8
2 bar 7
3 bar 8
"""


Expand Down Expand Up @@ -8083,6 +8128,15 @@ def _join_compat(
other = DataFrame({other.name: other})

if isinstance(other, DataFrame):
if how == "cross":
return merge(
self,
other,
how=how,
on=on,
suffixes=(lsuffix, rsuffix),
sort=sort,
)
return merge(
self,
other,
Expand Down
63 changes: 61 additions & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import copy
import datetime
from functools import partial
import hashlib
import string
from typing import TYPE_CHECKING, Optional, Tuple, cast
import warnings
Expand Down Expand Up @@ -643,6 +644,17 @@ def __init__(

self._validate_specification()

cross_col = None
if self.how == "cross":
(
self.left,
self.right,
self.how,
cross_col,
) = self._create_cross_configuration(self.left, self.right)
self.left_on = self.right_on = [cross_col]
self._cross = cross_col

# note this function has side effects
(
self.left_join_keys,
Expand Down Expand Up @@ -690,8 +702,14 @@ def get_result(self):

self._maybe_restore_index_levels(result)

self._maybe_drop_cross_column(result, self._cross)

return result.__finalize__(self, method="merge")

def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]):
if cross_col is not None:
result.drop(columns=cross_col, inplace=True)

def _indicator_pre_merge(
self, left: "DataFrame", right: "DataFrame"
) -> Tuple["DataFrame", "DataFrame"]:
Expand Down Expand Up @@ -1200,9 +1218,50 @@ def _maybe_coerce_merge_keys(self):
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(**{name: self.right[name].astype(typ)})

def _create_cross_configuration(
self, left, right
) -> Tuple["DataFrame", "DataFrame", str, str]:
"""
Creates the configuration to dispatch the cross operation to inner join,
e.g. adding a join column and resetting parameters. Join column is added
to a new object, no inplace modification
Parameters
----------
left: DataFrame
right DataFrame
Returns
-------
a tuple (left, right, how, cross_col) representing the adjusted
DataFrames with cross_col, the merge operation set to inner and the column
to join over.
"""
cross_col = f"_cross_{hashlib.md5().hexdigest()}"
how = "inner"
return (
left.assign(**{cross_col: 1}),
right.assign(**{cross_col: 1}),
how,
cross_col,
)

def _validate_specification(self):
if self.how == "cross":
if (
self.left_index
or self.right_index
or self.right_on is not None
or self.left_on is not None
or self.on is not None
):
raise MergeError(
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)
return
# Hm, any way to make this logic less complicated??
if self.on is None and self.left_on is None and self.right_on is None:
elif self.on is None and self.left_on is None and self.right_on is None:

if self.left_index and self.right_index:
self.left_on, self.right_on = (), ()
Expand Down Expand Up @@ -1266,7 +1325,7 @@ def _validate_specification(self):
'of levels in the index of "left"'
)
self.left_on = [None] * n
if len(self.right_on) != len(self.left_on):
if self.how != "cross" and len(self.right_on) != len(self.left_on):
raise ValueError("len(right_on) must equal len(left_on)")

def _validate(self, validate: str):
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,3 +803,15 @@ def test_join_inner_multiindex_deterministic_order():
index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")),
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_join_cross(input_col, output_cols):
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({input_col: [3, 4]})
result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y")
expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
tm.assert_frame_equal(result, expected)
95 changes: 95 additions & 0 deletions pandas/tests/reshape/merge/test_merge_cross.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import pytest

from pandas import DataFrame
import pandas._testing as tm
from pandas.core.reshape.merge import MergeError, merge


@pytest.mark.parametrize(
("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_merge_cross(input_col, output_cols):
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({input_col: [3, 4]})
left_copy = left.copy()
right_copy = right.copy()
result = merge(left, right, how="cross")
expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(left, left_copy)
tm.assert_frame_equal(right, right_copy)


@pytest.mark.parametrize(
"kwargs",
[
{"left_index": True},
{"right_index": True},
{"on": "a"},
{"left_on": "a"},
{"right_on": "b"},
],
)
def test_merge_cross_error_reporting(kwargs):
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({"b": [3, 4]})
msg = (
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)
with pytest.raises(MergeError, match=msg):
merge(left, right, how="cross", **kwargs)


def test_merge_cross_mixed_dtypes():
# GH#5401
left = DataFrame(["a", "b", "c"], columns=["A"])
right = DataFrame(range(2), columns=["B"])
result = merge(left, right, how="cross")
expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]})
tm.assert_frame_equal(result, expected)


def test_merge_cross_more_than_one_column():
# GH#5401
left = DataFrame({"A": list("ab"), "B": [2, 1]})
right = DataFrame({"C": range(2), "D": range(4, 6)})
result = merge(left, right, how="cross")
expected = DataFrame(
{
"A": ["a", "a", "b", "b"],
"B": [2, 2, 1, 1],
"C": [0, 1, 0, 1],
"D": [4, 5, 4, 5],
}
)
tm.assert_frame_equal(result, expected)


def test_merge_cross_null_values(nulls_fixture):
# GH#5401
left = DataFrame({"a": [1, nulls_fixture]})
right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]})
result = merge(left, right, how="cross")
expected = DataFrame(
{
"a": [1, 1, nulls_fixture, nulls_fixture],
"b": ["a", "b", "a", "b"],
"c": [1.0, 2.0, 1.0, 2.0],
}
)
tm.assert_frame_equal(result, expected)


def test_join_cross_error_reporting():
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({"a": [3, 4]})
msg = (
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)
with pytest.raises(MergeError, match=msg):
left.join(right, how="cross", on="a")

0 comments on commit d8a2e74

Please sign in to comment.