From 2688cbe2a20b41d199e9f8257384b9d719480eb4 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 14 Nov 2018 07:26:24 -0700 Subject: [PATCH] BUG: Fix Series/DataFrame.rank(pct=True) with more than 2**24 rows (#23688) --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/_libs/algos_rank_helper.pxi.in | 4 ++-- pandas/tests/frame/test_rank.py | 7 +++++++ pandas/tests/series/test_rank.py | 7 +++++++ pandas/tests/test_algos.py | 9 +++++++++ 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 19af38954e282..f6a27e4c68ce0 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1205,6 +1205,7 @@ Numeric - Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the narray as ``timedelta64[ns]`` dtype (:issue:`23114`) - Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`). 
- :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`) +- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2\ :sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`) Strings ^^^^^^^ diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 4d144dcf2808a..329c368e13d6d 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -53,7 +53,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', int tiebreak = 0 bint keep_na = 0 bint isnan - float count = 0.0 + float64_t count = 0.0 tiebreak = tiebreakers[ties_method] {{if dtype == 'float64'}} @@ -228,7 +228,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 - float count = 0.0 + float64_t count = 0.0 tiebreak = tiebreakers[ties_method] diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index 078c48539de16..eaba5f7ec7790 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -309,3 +309,10 @@ def test_rank_pct_true(self, method, exp): expected = DataFrame(exp) tm.assert_frame_equal(result, expected) + + def test_pct_max_many_rows(self): + # GH 18271 + df = DataFrame({'A': np.arange(2**24 + 1), + 'B': np.arange(2**24 + 1, 0, -1)}) + result = df.rank(pct=True).max() + assert (result == 1).all() diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 9772ceecfc7b1..5b0ea37a0bfcf 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -495,3 +495,10 @@ def test_rank_first_pct(dtype, ser, exp): result = s.rank(method='first', pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) + + +def test_pct_max_many_rows(): + # GH 18271 + s = Series(np.arange(2**24 + 1)) + result = s.rank(pct=True).max() 
+ assert result == 1 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3642c4ee98a9e..ff505f2986b1a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1462,6 +1462,15 @@ def test_too_many_ndims(self): with pytest.raises(TypeError, match=msg): algos.rank(arr) + @pytest.mark.parametrize('values', [ + np.arange(2**24 + 1), + np.arange(2**25 + 2).reshape(2**24 + 1, 2)], + ids=['1d', '2d']) + def test_pct_max_many_rows(self, values): + # GH 18271 + result = algos.rank(values, pct=True).max() + assert result == 1 + def test_pad_backfill_object_segfault():