Skip to content

Commit

Permalink
Review feedback; refactor tests; add whatsnew
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari committed Jun 27, 2018
1 parent f1cff7f commit 740df13
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 140 deletions.
34 changes: 33 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,44 @@ v0.24.0
New features
~~~~~~~~~~~~

- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
.. _whatsnew_0240.enhancements.duplicated_inverse:

``DataFrame.duplicated`` has gained the ``return_inverse`` kwarg
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, there was no way to determine how duplicate rows in a ``DataFrame`` were mapped onto the deduplicated, unique subset. This made it hard to propagate
information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique`` method provides such a
``return_inverse`` keyword, it fails to work with ``object`` data.

The method has now gained a ``return_inverse`` keyword -- specifying ``return_inverse=True`` will change the output from a single Series to a tuple of two Series:

.. ipython:: python

df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
index=[1, 4, 9, 16, 25])
df
isdup, inv = df.duplicated(return_inverse=True)
isdup
inv

This makes it possible to reconstruct the original DataFrame as follows:

.. ipython:: python

unique = df.loc[~isdup] # same as df.drop_duplicates()
unique
reconstruct = unique.reindex(inv.values).set_index(inv.index)
reconstruct.equals(df)

The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
to construct an inverse).

.. _whatsnew_0240.enhancements.other:

Other Enhancements
^^^^^^^^^^^^^^^^^^

- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
Expand Down
13 changes: 11 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4362,10 +4362,12 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
last occurrence.
- False : Mark all duplicates as ``True``. This option is not
compatible with ``return_inverse``.
return_inverse boolean, default False
return_inverse : boolean, default False
Determines whether the mapping from unique elements to the original
index should be returned. If true, the output is a tuple.
.. versionadded:: 0.24.0
Returns
-------
duplicated : Series or tuple of Series if return_inverse is True
Expand Down Expand Up @@ -4413,9 +4415,16 @@ def f(vals):
return_index=True)
inv = Series(self.index[o2u][u2o], index=self.index)
elif keep == 'last':
ids = ids[::-1] # np.unique takes first occurrence as unique value
# np.unique takes first occurrence as unique value,
# so we flip ids that first becomes last
ids = ids[::-1]
_, o2u, u2o = np.unique(ids, return_inverse=True,
return_index=True)
# the values in the ids-array correspond(ed) to self.index -
# by flipping ids around, we need to do the same for self.index,
# ___because o2u and u2o are relative to that order___.
# Finally, to fit with 'index=self.index' in the constructor,
# we need to flip the values around one last time
inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index)
return isdup, inv

Expand Down
163 changes: 132 additions & 31 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from numpy.random import randn
import numpy as np

from pandas.compat import lrange, PY35
from pandas.compat import lrange, PY35, string_types
from pandas import (compat, isna, notna, DataFrame, Series,
MultiIndex, date_range, Timestamp, Categorical,
_np_version_under1p12,
Expand Down Expand Up @@ -1523,6 +1523,137 @@ def test_isin_empty_datetimelike(self):
# ----------------------------------------------------------------------
# Row deduplication

@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    # GH 19730
    # every parametrized subset contains the lowercase label 'a', which
    # is absent from the frame (its columns are uppercase), so both
    # deduplication entry points must raise KeyError
    df = pd.DataFrame({'A': [0, 0, 1],
                       'B': [0, 0, 1],
                       'C': [0, 0, 1]})

    for method in (df.duplicated, df.drop_duplicates):
        with pytest.raises(KeyError):
            method(subset)

@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    # gh-21524
    # Given a wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicated() produces a bool pd.Series as a result
    # and doesn't fail during calculation.
    # Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, pd.Series)
    # compare against np.bool_ (the numpy boolean scalar type) instead of
    # np.bool, which is just a deprecated alias for the builtin bool
    # (deprecated in numpy 1.20, removed in 1.24)
    assert result.dtype == np.bool_

@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True]))
])
def test_duplicated_keep(self, keep, expected):
    # rows 1/2 and rows 0/4 are duplicate pairs; which side of each
    # pair gets flagged depends on `keep`
    data = {'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}
    result = DataFrame(data).duplicated(keep=keep)
    tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, True, True])),
    ('last', Series([True, True, False, True, False])),
    (False, Series([True] * 5))
])
def test_duplicated_nan_none(self, keep, expected):
    # np.nan and None are considered equal, so every row here has a twin
    values = [np.nan, 3, 3, None, np.nan]
    result = DataFrame({'C': values}, dtype=object).duplicated(keep=keep)
    tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('keep', ['first', 'last', False])
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
def test_duplicated_subset(self, subset, keep):
    # restricting duplicated() to a subset of columns must match
    # running duplicated() on the projection onto those columns
    df = DataFrame({'A': [0, 1, 1, 2, 0],
                    'B': ['a', 'b', 'b', 'c', 'a'],
                    'C': [np.nan, 3, 3, None, np.nan]})

    if subset is None:
        # subset=None means "all columns"; spell it out so df[subset]
        # below selects the whole frame
        subset = list(df.columns)

    # when subset is the string 'A', df[subset] is a Series whose
    # duplicated() result carries the name 'A'; clear the name so it
    # compares equal to the unnamed result of df.duplicated().
    # NOTE(review): `name` is not an accepted kwarg of Series.rename —
    # this works only because the kwarg is silently ignored and the
    # scalar index=None path clears the name; `.rename(None)` would be
    # the explicit spelling — TODO confirm and simplify
    expected = df[subset].duplicated(keep=keep).rename(name=None)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)

def test_duplicated_inverse(self):
    # check that the return_inverse kwarg does not affect the isdup
    # outcome, and that the inverse carries the (non-default) index
    idx = [1, 4, 9, 16, 25]
    df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                   index=idx)

    # expected inverse mappings (original index -> index of the kept
    # occurrence) for both keep modes
    cases = [('first', Series([1, 4, 4, 16, 1], index=idx)),
             ('last', Series([25, 9, 9, 16, 25], index=idx))]

    for keep, expected_inv in cases:
        expected_isdup = df.duplicated(keep=keep)
        result_isdup, result_inv = df.duplicated(keep=keep,
                                                 return_inverse=True)
        tm.assert_series_equal(result_isdup, expected_isdup)
        tm.assert_series_equal(result_inv, expected_inv)

        # the inverse must reconstruct the original frame from the
        # deduplicated rows (and fit together with expected_isdup)
        unique = df.loc[~expected_isdup]
        reconstr = unique.reindex(result_inv).set_index(result_inv.index)
        tm.assert_frame_equal(reconstr, df)

def test_duplicated_inverse_raises(self):
    df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})

    # keep=False discards every duplicated row, so no inverse mapping
    # can exist and the combination must raise
    msg = 'The parameters return_inverse=True and keep=False cannot be.*'
    with tm.assert_raises_regex(ValueError, msg):
        df.duplicated(keep=False, return_inverse=True)

@pytest.mark.parametrize('keep', ['first', 'last'])
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
def test_duplicated_inverse_large(self, subset, keep):
    # sampling gives an unsorted index, which is important to exercise
    # the 'first'/'last' functionality on a large frame
    df = DataFrame(np.random.randint(0, 10, (10000, 3)),
                   columns=list('ABC')).sample(5000)

    expected_isdup = df.duplicated(keep=keep, subset=subset)
    result_isdup, inv = df.duplicated(keep=keep, subset=subset,
                                      return_inverse=True)
    tm.assert_series_equal(result_isdup, expected_isdup)

    # normalize subset to a list of column labels so df.loc[...] below
    # always yields a DataFrame (a bare string would give a Series)
    if subset is None:
        cols = list(df.columns)
    elif isinstance(subset, string_types):
        cols = [subset]
    else:
        cols = subset

    # the inverse must reconstruct the original (sub-)frame from the
    # deduplicated rows
    unique = df.loc[~expected_isdup, cols]
    reconstr = unique.reindex(inv.values).set_index(inv.index)
    tm.assert_frame_equal(reconstr, df[cols])

def test_drop_duplicates(self):
df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
Expand Down Expand Up @@ -1618,36 +1749,6 @@ def test_drop_duplicates(self):
for keep in ['first', 'last', False]:
assert df.duplicated(keep=keep).sum() == 0

@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    # GH 19730
    # every parametrized subset contains the lowercase label 'a', which
    # is absent from the frame (its columns are uppercase), so both
    # deduplication entry points must raise KeyError
    df = pd.DataFrame({'A': [0, 0, 1],
                       'B': [0, 0, 1],
                       'C': [0, 0, 1]})

    with pytest.raises(KeyError):
        df.duplicated(subset)

    with pytest.raises(KeyError):
        df.drop_duplicates(subset)

@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    # gh-21524
    # Given a wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicated() produces a bool pd.Series as a result
    # and doesn't fail during calculation.
    # Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, pd.Series)
    # compare against np.bool_ (the numpy boolean scalar type) instead of
    # np.bool, which is just a deprecated alias for the builtin bool
    # (deprecated in numpy 1.20, removed in 1.24)
    assert result.dtype == np.bool_

def test_drop_duplicates_with_duplicate_column_names(self):
# GH17836
df = DataFrame([
Expand Down
106 changes: 0 additions & 106 deletions pandas/tests/frame/test_duplicates.py

This file was deleted.

0 comments on commit 740df13

Please sign in to comment.