Skip to content

Commit

Permalink
PERF/REF: improve performance of Series.searchsorted, PandasArray.sea…
Browse files Browse the repository at this point in the history
…rchsorted, collect functionality (pandas-dev#22034)
  • Loading branch information
topper-123 authored and Pingviinituutti committed Feb 28, 2019
1 parent 1036bb0 commit 47c430f
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 20 deletions.
19 changes: 19 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,25 @@ def time_dropna(self, dtype):
self.s.dropna()


class SearchSorted(object):
    """Benchmark ``Series.searchsorted`` with a scalar key for several dtypes."""

    goal_time = 0.2
    params = [
        'int8', 'int16', 'int32', 'int64',
        'uint8', 'uint16', 'uint32', 'uint64',
        'float16', 'float32', 'float64',
        'str',
    ]
    param_names = ['dtype']

    def setup(self, dtype):
        # Sorted data made of three equally sized runs: 1..1, 2..2, 3..3,
        # cast to the dtype under test.
        N = 10**5
        values = np.repeat([1, 2, 3], N).astype(dtype)
        self.s = Series(values)

    def time_searchsorted(self, dtype):
        # The searched key has to match the kind of data held by the Series.
        if dtype == 'str':
            key = '2'
        else:
            key = 2
        self.s.searchsorted(key)


class Map(object):

params = ['dict', 'Series']
Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ Performance Improvements

- Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`)
- `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`)
-
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)


.. _whatsnew_0250.bug_fixes:
Expand Down
85 changes: 84 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
ensure_float64, ensure_int64, ensure_object, ensure_platform_int,
ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype,
is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype,
is_datetimelike, is_extension_array_dtype, is_float_dtype,
is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer,
is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype,
Expand Down Expand Up @@ -1724,6 +1724,89 @@ def func(arr, indexer, out, fill_value=np.nan):
return out


# ------------ #
# searchsorted #
# ------------ #

def searchsorted(arr, value, side="left", sorter=None):
    """
    Find indices where elements should be inserted to maintain order.

    .. versionadded:: 0.25.0

    Find the indices into a sorted array `arr` such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= arr[i]``
    right   ``arr[i-1] <= value < arr[i]``
    ======  ================================

    Parameters
    ----------
    arr : array-like
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array_like
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index. If there is no suitable
        index, return either 0 or N (where N is the length of `arr`).
    sorter : 1-D array_like, optional
        Optional array of integer indices that sort `arr` into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints
        Array of insertion points with the same shape as `value`.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    integer_search = (
        isinstance(arr, np.ndarray)
        and is_integer_dtype(arr)
        and (is_integer(value) or is_integer_dtype(value))
    )

    if integer_search:
        from .arrays.array_ import array

        # numpy recasts `arr` when `arr` and `value` have different dtypes,
        # which makes the search slow. So give `value` the dtype of `arr`
        # whenever that cannot overflow.
        scalar_key = is_scalar(value)
        limits = np.iinfo(arr.dtype.type)
        probe = np.array([value]) if scalar_key else np.array(value)

        fits = (probe >= limits.min).all() and (probe <= limits.max).all()
        # Within bounds -> safe to adopt the dtype of `arr`; otherwise keep
        # the dtype numpy inferred for `value`.
        target_dtype = arr.dtype if fits else probe.dtype

        if scalar_key:
            value = target_dtype.type(value)
        else:
            value = array(value, dtype=target_dtype)
    elif (not is_object_dtype(arr) and not is_numeric_dtype(arr) and
            not is_categorical_dtype(arr)):
        from pandas.core.series import Series

        # E.g. `arr` has dtype='datetime64[ns]' and `value` is a
        # pd.Timestamp: route `value` through Series so it may be converted.
        converted = Series(value)._values
        if is_scalar(value):
            value = converted[0]
        else:
            value = converted

    return arr.searchsorted(value, side=side, sorter=sorter)


# ---- #
# diff #
# ---- #
Expand Down
18 changes: 9 additions & 9 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,17 +555,17 @@ def searchsorted(self, value, side="left", sorter=None):
.. versionadded:: 0.24.0
Find the indices into a sorted array `self` (a) such that, if the
corresponding elements in `v` were inserted before the indices, the
order of `self` would be preserved.
corresponding elements in `value` were inserted before the indices,
the order of `self` would be preserved.
Assuming that `a` is sorted:
Assuming that `self` is sorted:
====== ============================
====== ================================
`side` returned index `i` satisfies
====== ============================
left ``self[i-1] < v <= self[i]``
right ``self[i-1] <= v < self[i]``
====== ============================
====== ================================
left ``self[i-1] < value <= self[i]``
right ``self[i-1] <= value < self[i]``
====== ================================
Parameters
----------
Expand All @@ -581,7 +581,7 @@ def searchsorted(self, value, side="left", sorter=None):
Returns
-------
indices : array of ints
array of ints
Array of insertion points with the same shape as `value`.
See Also
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from pandas._libs import lib
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.dtypes import ExtensionDtype
Expand All @@ -12,6 +13,7 @@

from pandas import compat
from pandas.core import nanops
from pandas.core.algorithms import searchsorted
from pandas.core.missing import backfill_1d, pad_1d

from .base import ExtensionArray, ExtensionOpsMixin
Expand Down Expand Up @@ -423,6 +425,11 @@ def to_numpy(self, dtype=None, copy=False):

return result

@Appender(ExtensionArray.searchsorted.__doc__)
def searchsorted(self, value, side='left', sorter=None):
    # Run the search on the array obtained from ``to_numpy`` by handing it
    # to the module-level ``searchsorted`` helper imported from
    # ``pandas.core.algorithms``.
    ndarray_values = self.to_numpy()
    return searchsorted(ndarray_values, value, side=side, sorter=sorter)

# ------------------------------------------------------------------------
# Ops

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1522,11 +1522,11 @@ def factorize(self, sort=False, na_sentinel=-1):
array([3])
""")

# NOTE(review): this span is a rendered diff hunk ("3 additions & 3 deletions")
# whose +/- markers were lost, so two versions of the method are interleaved.
# Presumably the first line of each pair below is the removed one -- confirm
# against the commit before treating this as runnable code.
# NOTE(review): presumably the removed decorator line.
@Substitution(klass='IndexOpsMixin')
# NOTE(review): presumably the added decorator line.
@Substitution(klass='Index')
@Appender(_shared_docs['searchsorted'])
def searchsorted(self, value, side='left', sorter=None):
# NOTE(review): presumably the removed body -- searches the underlying values
# directly through their own ``searchsorted`` method.
# needs coercion on the key (DatetimeIndex does already)
return self._values.searchsorted(value, side=side, sorter=sorter)
# NOTE(review): presumably the added body -- hands the underlying values to
# ``algorithms.searchsorted`` instead.
return algorithms.searchsorted(self._values, value,
side=side, sorter=sorter)

def drop_duplicates(self, keep='first', inplace=False):
inplace = validate_bool_kwarg(inplace, 'inplace')
Expand Down
8 changes: 2 additions & 6 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2392,12 +2392,8 @@ def __rmatmul__(self, other):
# NOTE(review): this span is a rendered diff hunk ("2 additions & 6 deletions")
# whose +/- markers were lost, so two versions of the body are interleaved.
# Presumably the six lines after the ``def`` line (through the first ``return``)
# are the removed ones and the final two lines are the added ones -- confirm
# against the commit before treating this as runnable code.
@Substitution(klass='Series')
@Appender(base._shared_docs['searchsorted'])
def searchsorted(self, value, side='left', sorter=None):
# NOTE(review): presumably the removed body -- converts ``sorter`` with
# ``ensure_platform_int``, wraps ``value`` in a Series before searching, and
# unwraps the first element of the result when ``value`` is a scalar.
if sorter is not None:
sorter = ensure_platform_int(sorter)
result = self._values.searchsorted(Series(value)._values,
side=side, sorter=sorter)

return result[0] if is_scalar(value) else result
# NOTE(review): presumably the added body -- hands the underlying values to
# ``algorithms.searchsorted`` instead.
return algorithms.searchsorted(self._values, value,
side=side, sorter=sorter)

# -------------------------------------------------------------------
# Combination
Expand Down
49 changes: 49 additions & 0 deletions pandas/tests/arrays/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import pandas as pd
from pandas.api.extensions import register_extension_dtype
from pandas.api.types import is_scalar
from pandas.core.arrays import PandasArray, integer_array, period_array
from pandas.tests.extension.decimal import (
DecimalArray, DecimalDtype, to_decimal)
Expand Down Expand Up @@ -254,3 +255,51 @@ def test_array_not_registered(registry_without_decimal):
result = pd.array(data, dtype=DecimalDtype)
expected = DecimalArray._from_sequence(data)
tm.assert_equal(result, expected)


class TestArrayAnalytics(object):
    """Checks for ``searchsorted`` on arrays built through ``pd.array``."""

    def test_searchsorted(self, string_dtype):
        # String data: a scalar key gives a scalar position for both sides.
        letters = pd.array(['a', 'b', 'c'], dtype=string_dtype)

        left_pos = letters.searchsorted('a', side='left')
        assert is_scalar(left_pos)
        assert left_pos == 0

        right_pos = letters.searchsorted('a', side='right')
        assert is_scalar(right_pos)
        assert right_pos == 1

    def test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype):
        numbers = pd.array([1, 3, 90], dtype=any_real_dtype)

        # A scalar key returns a scalar position ...
        position = numbers.searchsorted(30)
        assert is_scalar(position)
        assert position == 2

        # ... while a one-element list returns a one-element intp array.
        positions = numbers.searchsorted([30])
        tm.assert_numpy_array_equal(positions,
                                    np.array([2], dtype=np.intp))

    def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype):
        numbers = pd.array([1, 3, 90], dtype=any_real_dtype)
        positions = numbers.searchsorted([2, 30])
        tm.assert_numpy_array_equal(positions,
                                    np.array([1, 2], dtype=np.intp))

    @pytest.mark.parametrize('arr, val', [
        [pd.date_range('20120101', periods=10, freq='2D'),
         pd.Timestamp('20120102')],
        [pd.date_range('20120101', periods=10, freq='2D',
                       tz='Asia/Hong_Kong'),
         pd.Timestamp('20120102', tz='Asia/Hong_Kong')],
        [pd.timedelta_range(start='1 day', end='10 days', periods=10),
         pd.Timedelta('2 days')]])
    def test_search_sorted_datetime64_scalar(self, arr, val):
        # Datetime-like data searched with a matching scalar key.
        wrapped = pd.array(arr)
        position = wrapped.searchsorted(val)
        assert is_scalar(position)
        assert position == 1

    def test_searchsorted_sorter(self, any_real_dtype):
        # Unsorted data is searched through the permutation that sorts it.
        shuffled = pd.array([3, 1, 2], dtype=any_real_dtype)
        order = np.argsort(shuffled)
        positions = shuffled.searchsorted([0, 3], sorter=order)
        tm.assert_numpy_array_equal(positions,
                                    np.array([0, 2], dtype=np.intp))

0 comments on commit 47c430f

Please sign in to comment.