diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 12e4824b2dd2a2..3c4ffb13b9ec29 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -420,7 +420,13 @@ def setup(self): self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), 'b': np.random.choice(t, n), 'c': np.random.choice(xs, n)}) - self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T + # df2 will not have any duplicates + self.df2 = DataFrame(np.random.randn(100, 1000).astype(str)) + + df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)), + columns=list('ABCDE')) + df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str)) + self.df3 = df3 def time_frame_duplicated(self): self.df.duplicated() @@ -428,6 +434,19 @@ def time_frame_duplicated(self): def time_frame_duplicated_wide(self): self.df2.duplicated() + def time_frame_duplicated_wide_inverse(self): + # will take fastpath for no duplicates + self.df2.duplicated(return_inverse=True) + + def time_frame_duplicated_mixed(self): + self.df3.duplicated() + + def time_frame_duplicated_mixed_inverse(self): + self.df3.duplicated(return_inverse=True) + + def time_frame_duplicated_mixed_inverse_last(self): + self.df3.duplicated(return_inverse=True, keep='last') + class XS(object): diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index af4eeffd87d01b..cf60e86553fe31 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -1,7 +1,7 @@ .. _whatsnew_0231: -v0.23.1 -------- +v0.23.1 (June 12, 2018) +----------------------- This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. 
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index bf6a181bd1bfc6..ccc53c8aaa128a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -17,14 +17,15 @@ Previously, there was no way to determine how duplicate rows in a ``DataFrame`` information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique``-method provides such a ``return_inverse``-kwarg, it fails to work with ``object`` data. -The method has now gained a ``return_inverse`` keyword -- specifying ``return_inverse=True`` will change the output from a single Series to a tuple of two Series: +Therefore, the ``duplicated``-method has now gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series +to a tuple of two Series (in the following example, the index is deliberately not a simple consecutive range, to illustrate that the inverse correctly takes it into account): .. 
ipython:: python df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}, index=[1, 4, 9, 16, 25]) df - isdup, inv = df.duplicated(return_inverse=True) + isdup, inv = df.duplicated(return_inverse=True) # default: keep='first' isdup inv diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 12bcf543fdd8e7..e46cc12028c79e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4407,6 +4407,10 @@ def f(vals): isdup = Series(duplicated_int64(ids, keep), index=self.index) if not return_inverse: return isdup + elif not isdup.any(): + # no need to calculate inverse if no duplicates + inv = Series(self.index, index=self.index) + return isdup, inv if keep == 'first': # o2u: original indices to indices of ARRAY of unique values diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index cd90a90dd2a87a..ebb585848be7a2 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1654,6 +1654,18 @@ def test_duplicated_inverse_large(self, subset, keep): reconstr = unique.reindex(inv.values).set_index(inv.index) tm.assert_frame_equal(reconstr, df[subset]) + @pytest.mark.parametrize('keep', ['first', 'last']) + def test_duplicated_inverse_fastpath(self, keep): + df = DataFrame({'A' : range(10)}) # no duplicates + + expected_isdup = df.duplicated(keep=keep) + result_isdup, result_inv = df.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + expected_inv = Series(range(10)) + tm.assert_series_equal(result_inv, expected_inv) + def test_drop_duplicates(self): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'],