Skip to content

Commit

Permalink
Add tests to asv; doc changes; fastpath if no dups
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari committed Jun 27, 2018
1 parent 740df13 commit d3b5d0a
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 5 deletions.
21 changes: 20 additions & 1 deletion asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,14 +420,33 @@ def setup(self):
self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
'b': np.random.choice(t, n),
'c': np.random.choice(xs, n)})
self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
# df2 will not have any duplicates
self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))

df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
columns=list('ABCDE'))
df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
self.df3 = df3

def time_frame_duplicated(self):
    # Time DataFrame.duplicated() on self.df (constructed in setup()).
    frame = self.df
    frame.duplicated()

def time_frame_duplicated_wide(self):
    # Time DataFrame.duplicated() on the wide frame self.df2 (built in setup()).
    wide = self.df2
    wide.duplicated()

def time_frame_duplicated_wide_inverse(self):
    # self.df2 has no duplicate rows (see setup()), so this exercises
    # the no-duplicates fastpath of return_inverse.
    wide = self.df2
    wide.duplicated(return_inverse=True)

def time_frame_duplicated_mixed(self):
    # Time DataFrame.duplicated() on the int/str frame self.df3 (built in setup()).
    mixed = self.df3
    mixed.duplicated()

def time_frame_duplicated_mixed_inverse(self):
    # Time the return_inverse path (default keep='first') on the
    # mixed-dtype frame self.df3, which does contain duplicates.
    mixed = self.df3
    mixed.duplicated(return_inverse=True)

def time_frame_duplicated_mixed_inverse_last(self):
    # Same as the _inverse benchmark above, but with keep='last' to
    # cover the remapping branch for that keep-mode.
    mixed = self.df3
    mixed.duplicated(return_inverse=True, keep='last')


class XS(object):

Expand Down
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.23.1.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
.. _whatsnew_0231:

v0.23.1
-------
v0.23.1 (June 12, 2018)
-----------------------

This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
and bug fixes. We recommend that all users upgrade to this version.
Expand Down
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@ Previously, there was no way to determine how duplicate rows in a ``DataFrame``
information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. While the ``numpy.unique`` method provides such a
``return_inverse`` keyword, it does not work with ``object`` data.

The method has now gained a ``return_inverse`` keyword -- specifying ``return_inverse=True`` will change the output from a single Series to a tuple of two Series:
Therefore, the ``duplicated``-method has now gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
to a tuple of two Series (in the following example, the index is not just a simple order, to illustrate that the inverse correctly takes it into account):

.. ipython:: python

df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
index=[1, 4, 9, 16, 25])
df
isdup, inv = df.duplicated(return_inverse=True)
isdup, inv = df.duplicated(return_inverse=True) # default: keep='first'
isdup
inv

Expand Down
4 changes: 4 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4407,6 +4407,10 @@ def f(vals):
isdup = Series(duplicated_int64(ids, keep), index=self.index)
if not return_inverse:
return isdup
elif not isdup.any():
# no need to calculate inverse if no duplicates
inv = Series(self.index, index=self.index)
return isdup, inv

if keep == 'first':
# o2u: original indices to indices of ARRAY of unique values
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1654,6 +1654,18 @@ def test_duplicated_inverse_large(self, subset, keep):
reconstr = unique.reindex(inv.values).set_index(inv.index)
tm.assert_frame_equal(reconstr, df[subset])

@pytest.mark.parametrize('keep', ['first', 'last'])
def test_duplicated_inverse_fastpath(self, keep):
    # A frame with no duplicate rows takes the fastpath that returns
    # the index itself as the inverse, for either keep-mode.
    df = DataFrame({'A': range(10)})  # no duplicates

    isdup_expected = df.duplicated(keep=keep)
    isdup, inv = df.duplicated(keep=keep, return_inverse=True)
    tm.assert_series_equal(isdup, isdup_expected)

    # inverse of an all-unique frame maps every row to itself
    tm.assert_series_equal(inv, Series(range(10)))

def test_drop_duplicates(self):
df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
Expand Down

0 comments on commit d3b5d0a

Please sign in to comment.