Incorporate review feedback
h-vetinari committed Jun 28, 2018
1 parent 36b03a1 commit b08dc3d
Showing 4 changed files with 24 additions and 42 deletions.
15 changes: 3 additions & 12 deletions asv_bench/benchmarks/frame_methods.py
@@ -431,22 +431,13 @@ def setup(self, keep, return_inverse):
         self.df3 = df3

     def time_frame_duplicated(self, keep, return_inverse):
-        if return_inverse:
-            self.df.duplicated(keep=keep, return_inverse=return_inverse)
-        else:
-            self.df.duplicated(keep=keep)
+        self.df.duplicated(keep=keep, return_inverse=return_inverse)

     def time_frame_duplicated_wide(self, keep, return_inverse):
-        if return_inverse:
-            self.df2.duplicated(keep=keep, return_inverse=return_inverse)
-        else:
-            self.df2.duplicated(keep=keep)
+        self.df2.duplicated(keep=keep, return_inverse=return_inverse)

     def time_frame_duplicated_mixed(self, keep, return_inverse):
-        if return_inverse:
-            self.df3.duplicated(keep=keep, return_inverse=return_inverse)
-        else:
-            self.df3.duplicated(keep=keep)
+        self.df3.duplicated(keep=keep, return_inverse=return_inverse)


 class XS(object):
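Note on the change above: asv calls each time_* method with every combination of the class-level params, so return_inverse is always supplied explicitly, which is why the if/else branches could be collapsed into a single call per review feedback. Below is a minimal sketch of how such a parametrized asv benchmark class is typically laid out; the class name, frame contents, and setup body are assumptions (the real setup is collapsed in the diff), and return_inverse itself is the keyword proposed by this PR, not part of released pandas.

    import numpy as np
    import pandas as pd


    class Duplicated(object):
        # asv passes each combination of these values to setup() and to the
        # time_* methods; the values here are illustrative assumptions
        params = (['first', 'last', False], [True, False])
        param_names = ['keep', 'return_inverse']

        def setup(self, keep, return_inverse):
            if keep is False and return_inverse:
                # the proposed kwarg cannot be combined with keep=False;
                # raising NotImplementedError makes asv skip this combination
                raise NotImplementedError
            n = 100000
            self.df = pd.DataFrame({'a': np.random.randint(0, 1000, n),
                                    'b': np.random.randint(0, 1000, n)})

        def time_frame_duplicated(self, keep, return_inverse):
            # on the PR branch, passing return_inverse=False is equivalent
            # to not passing it at all, so no branching is needed here
            self.df.duplicated(keep=keep, return_inverse=return_inverse)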
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.23.1.txt
@@ -1,7 +1,7 @@
 .. _whatsnew_0231:

-v0.23.1 (June 12, 2018)
------------------------
+v0.23.1
+-------

 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
 and bug fixes. We recommend that all users upgrade to this version.
18 changes: 7 additions & 11 deletions doc/source/whatsnew/v0.24.0.txt
@@ -13,29 +13,25 @@ New features
 ``DataFrame.duplicated`` has gained the ``return_inverse`` kwarg
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Previously, there was no way to determine how duplicate rows in a ``DataFrame`` got mapped to the deduplicated, unique subset. This made it hard to push back
-information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique``-method provides such a
-``return_inverse``-kwarg, it fails to work with ``object`` data.
-
-Therefore, the ``duplicated``-method has now gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
-to a tuple of two Series (in the following example, the index is not just a simple order, to illustrate that the inverse correctly takes it into account):
+The ``duplicated``-method has gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
+to a tuple of two Series, where the second Series contains the mapping from the indices of the deduplicated, unique subset back to the original index:

 .. ipython:: python

     df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                       index=[1, 4, 9, 16, 25])
     df
-    isdup, inv = df.duplicated(return_inverse=True)  # default: keep='first'
-    isdup
-    inv
+    isduplicate, inverse = df.duplicated(return_inverse=True)  # default: keep='first'
+    isduplicate
+    inverse

 This allows to reconstruct the original DataFrame as follows:

 .. ipython:: python

-    unique = df.loc[~isdup]  # same as df.drop_duplicates()
+    unique = df.loc[~isduplicate]  # same as df.drop_duplicates()
     unique
-    reconstruct = unique.reindex(inv.values).set_index(inv.index)
+    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
     reconstruct.equals(df)

 The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
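Note on the whatsnew example above: since return_inverse only exists on this PR branch and not in a released pandas version, the same inverse mapping can be illustrated with the public API alone. This is a minimal sketch, assuming the same example frame; the groupby(...).transform('first') route is an illustration chosen here, not the PR's implementation.

    import pandas as pd

    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                      index=[1, 4, 9, 16, 25])

    isdup = df.duplicated(keep='first')
    unique = df.loc[~isdup]               # same as df.drop_duplicates()

    # for every row, look up the index label of the first occurrence of its
    # value combination; this plays the role of the "inverse" Series above
    inverse = df.index.to_series().groupby([df['A'], df['B']]).transform('first')

    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
    assert reconstruct.equals(df)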
29 changes: 12 additions & 17 deletions pandas/tests/frame/test_analytics.py
@@ -1585,35 +1585,30 @@ def test_duplicated_subset(self, subset, keep):

         if subset is None:
             subset = list(df.columns)
+        elif isinstance(subset, string_types):
+            # need to have a DataFrame, not a Series
+            # -> select columns with singleton list, not string
+            subset = [subset]

         expected = df[subset].duplicated(keep=keep).rename(name=None)
         result = df.duplicated(keep=keep, subset=subset)
         tm.assert_series_equal(result, expected)

-    def test_duplicated_inverse(self):
+    @pytest.mark.parametrize('keep, expected_inv_values', [
+        ('first', [1, 4, 4, 16, 1]),
+        ('last', [25, 9, 9, 16, 25])
+    ])
+    def test_duplicated_inverse(self, keep, expected_inv_values):
         # check that return_inverse kwarg does not affect outcome;
         # index of inverse must be correctly transformed as well
         idx = [1, 4, 9, 16, 25]
         df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                        index=idx)

-        # keep = 'first'
-        expected_isdup = df.duplicated(keep='first')
-        expected_inv = Series([1, 4, 4, 16, 1], index=idx)
-        result_isdup, result_inv = df.duplicated(keep='first',
-                                                 return_inverse=True)
-        tm.assert_series_equal(result_isdup, expected_isdup)
-        tm.assert_series_equal(result_inv, expected_inv)
-
-        # test that inv works (and fits together with expected_isdup)
-        unique = df.loc[~expected_isdup]
-        reconstr = unique.reindex(result_inv).set_index(result_inv.index)
-        tm.assert_frame_equal(reconstr, df)
-
-        # keep = 'last'
-        expected_isdup = df.duplicated(keep='last')
-        expected_inv = Series([25, 9, 9, 16, 25], index=idx)
-        result_isdup, result_inv = df.duplicated(keep='last',
+        expected_isdup = df.duplicated(keep=keep)
+        expected_inv = Series(expected_inv_values, index=idx)
+        result_isdup, result_inv = df.duplicated(keep=keep,
                                                  return_inverse=True)
         tm.assert_series_equal(result_isdup, expected_isdup)
         tm.assert_series_equal(result_inv, expected_inv)
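Note on the parametrized test above: the expected_inv_values fixtures can be sanity-checked without the new keyword, since under keep='first' every row should map to the index label of its first occurrence and under keep='last' to that of its last. The short sketch below extends the earlier groupby/transform illustration to both cases; it is not the PR's code.

    import pandas as pd

    idx = [1, 4, 9, 16, 25]
    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
                      index=idx)
    labels = df.index.to_series()
    keys = [df['A'], df['B']]

    # index label of the first / last occurrence within each duplicate group
    print(labels.groupby(keys).transform('first').tolist())  # [1, 4, 4, 16, 1]
    print(labels.groupby(keys).transform('last').tolist())   # [25, 9, 9, 16, 25]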
