From eb02b7206054a0af7c4c0b4a7f9e6d8957d41645 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 26 Oct 2018 01:11:54 +0000 Subject: [PATCH] PERF: speed up CategoricalIndex.get_loc (#23235) --- asv_bench/benchmarks/indexing_engines.py | 28 ++- doc/source/whatsnew/v0.24.0.txt | 8 +- pandas/_libs/algos.pyx | 24 ++- pandas/_libs/algos_common_helper.pxi.in | 3 + pandas/_libs/index.pyx | 6 +- pandas/_libs/index_class_helper.pxi.in | 35 ++-- pandas/core/indexes/category.py | 15 +- pandas/tests/indexes/test_category.py | 24 ++- pandas/tests/indexing/conftest.py | 20 +++ .../tests/indexing/test_indexing_engines.py | 168 ++++++++++++++++++ 10 files changed, 299 insertions(+), 32 deletions(-) create mode 100644 pandas/tests/indexing/conftest.py create mode 100644 pandas/tests/indexing/test_indexing_engines.py diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 1e9283c7fb92b8..f3d063ee31bc8a 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,18 +1,30 @@ import numpy as np -from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine, - ObjectEngine) +from pandas._libs import index as libindex + + +def _get_numeric_engines(): + engine_names = [ + ('Int64Engine', np.int64), ('Int32Engine', np.int32), + ('Int16Engine', np.int16), ('Int8Engine', np.int8), + ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32), + ('UInt16Engine', np.uint16), ('UInt8Engine', np.uint8), + ('Float64Engine', np.float64), ('Float32Engine', np.float32), + ] + return [(getattr(libindex, engine_name), dtype) + for engine_name, dtype in engine_names + if hasattr(libindex, engine_name)] class NumericEngineIndexing(object): - params = [[Int64Engine, UInt64Engine, Float64Engine], - [np.int64, np.uint64, np.float64], + params = [_get_numeric_engines(), ['monotonic_incr', 'monotonic_decr', 'non_monotonic'], ] - param_names = ['engine', 'dtype', 'index_type'] + param_names = 
['engine_and_dtype', 'index_type'] - def setup(self, engine, dtype, index_type): + def setup(self, engine_and_dtype, index_type): + engine, dtype = engine_and_dtype N = 10**5 values = list([1] * N + [2] * N + [3] * N) arr = { @@ -26,7 +38,7 @@ def setup(self, engine, dtype, index_type): # code belows avoids populating the mapping etc. while timing. self.data.get_loc(2) - def time_get_loc(self, engine, dtype, index_type): + def time_get_loc(self, engine_and_dtype, index_type): self.data.get_loc(2) @@ -44,7 +56,7 @@ def setup(self, index_type): 'non_monotonic': np.array(list('abc') * N, dtype=object), }[index_type] - self.data = ObjectEngine(lambda: arr, len(arr)) + self.data = libindex.ObjectEngine(lambda: arr, len(arr)) # code belows avoids populating the mapping etc. while timing. self.data.get_loc('b') diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 638b3ac0ac5600..fd34fef886a16e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -949,9 +949,11 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`) +- Slicing Series and DataFrames with a monotonically increasing :class:`CategoricalIndex` + is now very fast and has speed comparable to slicing with an ``Int64Index``. + The speed increase is both when indexing by label (using .loc) and position(.iloc) (:issue:`20395`). + Slicing a monotonically increasing :class:`CategoricalIndex` itself (i.e. 
``ci[1000:2000]``) + shows similar speed improvements as above (:issue:`21659`) - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d2914dc8ac751e..3ba4c2375b4e8e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -10,7 +10,8 @@ from libc.math cimport fabs, sqrt import numpy as np cimport numpy as cnp from numpy cimport (ndarray, - NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8, + NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8, + NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8, NPY_FLOAT32, NPY_FLOAT64, NPY_OBJECT, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, @@ -359,9 +360,13 @@ ctypedef fused algos_t: float64_t float32_t object - int32_t int64_t + int32_t + int16_t + int8_t uint64_t + uint32_t + uint16_t uint8_t @@ -459,7 +464,12 @@ pad_float32 = pad["float32_t"] pad_object = pad["object"] pad_int64 = pad["int64_t"] pad_int32 = pad["int32_t"] +pad_int16 = pad["int16_t"] +pad_int8 = pad["int8_t"] pad_uint64 = pad["uint64_t"] +pad_uint32 = pad["uint32_t"] +pad_uint16 = pad["uint16_t"] +pad_uint8 = pad["uint8_t"] pad_bool = pad["uint8_t"] @@ -653,7 +663,12 @@ backfill_float32 = backfill["float32_t"] backfill_object = backfill["object"] backfill_int64 = backfill["int64_t"] backfill_int32 = backfill["int32_t"] +backfill_int16 = backfill["int16_t"] +backfill_int8 = backfill["int8_t"] backfill_uint64 = backfill["uint64_t"] +backfill_uint32 = backfill["uint32_t"] +backfill_uint16 = backfill["uint16_t"] +backfill_uint8 = backfill["uint8_t"] backfill_bool = backfill["uint8_t"] @@ -866,7 +881,12 @@ is_monotonic_float32 = is_monotonic["float32_t"] is_monotonic_object = is_monotonic["object"] 
is_monotonic_int64 = is_monotonic["int64_t"] is_monotonic_int32 = is_monotonic["int32_t"] +is_monotonic_int16 = is_monotonic["int16_t"] +is_monotonic_int8 = is_monotonic["int8_t"] is_monotonic_uint64 = is_monotonic["uint64_t"] +is_monotonic_uint32 = is_monotonic["uint32_t"] +is_monotonic_uint16 = is_monotonic["uint16_t"] +is_monotonic_uint8 = is_monotonic["uint8_t"] is_monotonic_bool = is_monotonic["uint8_t"] diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index b39b5eaced8fdd..518664d70cf067 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -133,6 +133,9 @@ dtypes = [('float64', 'FLOAT64', 'float64'), ('int16', 'INT16', 'int16'), ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), + ('uint8', 'UINT8', 'uint8'), + ('uint16', 'UINT16', 'uint16'), + ('uint32', 'UINT32', 'uint32'), ('uint64', 'UINT64', 'uint64'), # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 3f76915655f580..d418ac63a4ac85 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -5,8 +5,10 @@ import cython import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, float64_t, int32_t, - int64_t, uint8_t, uint64_t, intp_t, +from numpy cimport (ndarray, intp_t, + float64_t, float32_t, + int64_t, int32_t, int16_t, int8_t, + uint64_t, uint32_t, uint16_t, uint8_t, # Note: NPY_DATETIME, NPY_TIMEDELTA are only available # for cimport in cython>=0.27.3 NPY_DATETIME, NPY_TIMEDELTA) diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 4ea35da0626f37..c19812efaaa35a 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -10,14 +10,22 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype, ctype -dtypes = [('Float64', 'float64', 'float64_t'), - ('UInt64', 'uint64', 
'uint64_t'), - ('Int64', 'int64', 'int64_t'), - ('Object', 'object', 'object')] +# name, dtype, ctype, hashtable_name, hashtable_dtype +dtypes = [('Float64', 'float64', 'float64_t', 'Float64', 'float64'), + ('Float32', 'float32', 'float32_t', 'Float64', 'float64'), + ('Int64', 'int64', 'int64_t', 'Int64', 'int64'), + ('Int32', 'int32', 'int32_t', 'Int64', 'int64'), + ('Int16', 'int16', 'int16_t', 'Int64', 'int64'), + ('Int8', 'int8', 'int8_t', 'Int64', 'int64'), + ('UInt64', 'uint64', 'uint64_t', 'UInt64', 'uint64'), + ('UInt32', 'uint32', 'uint32_t', 'UInt64', 'uint64'), + ('UInt16', 'uint16', 'uint16_t', 'UInt64', 'uint64'), + ('UInt8', 'uint8', 'uint8_t', 'UInt64', 'uint64'), + ('Object', 'object', 'object', 'PyObject', 'object'), + ] }} -{{for name, dtype, ctype in dtypes}} +{{for name, dtype, ctype, hashtable_name, hashtable_dtype in dtypes}} cdef class {{name}}Engine(IndexEngine): @@ -34,13 +42,9 @@ cdef class {{name}}Engine(IndexEngine): other, limit=limit) cdef _make_hash_table(self, n): - {{if name == 'Object'}} - return _hash.PyObjectHashTable(n) - {{else}} - return _hash.{{name}}HashTable(n) - {{endif}} + return _hash.{{hashtable_name}}HashTable(n) - {{if name != 'Float64' and name != 'Object'}} + {{if name not in {'Float64', 'Float32', 'Object'} }} cdef _check_type(self, object val): hash(val) if util.is_bool_object(val): @@ -50,6 +54,11 @@ cdef class {{name}}Engine(IndexEngine): {{endif}} {{if name != 'Object'}} + cpdef _call_map_locations(self, values): + # self.mapping is of type {{hashtable_name}}HashTable, + # so convert dtype of values + self.mapping.map_locations(algos.ensure_{{hashtable_dtype}}(values)) + cdef _get_index_values(self): return algos.ensure_{{dtype}}(self.vgetter()) @@ -60,7 +69,7 @@ cdef class {{name}}Engine(IndexEngine): ndarray[{{ctype}}] values int count = 0 - {{if name != 'Float64'}} + {{if name not in {'Float64', 'Float32'} }} if not util.is_integer_object(val): raise KeyError(val) {{endif}} diff --git 
a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e4250ae790553f..278e395d650142 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -84,7 +84,17 @@ class CategoricalIndex(Index, accessor.PandasDelegate): """ _typ = 'categoricalindex' - _engine_type = libindex.Int64Engine + + @property + def _engine_type(self): + # self.codes can have dtype int8, int16, int32 or int64, so we need + # to return the corresponding engine type (libindex.Int8Engine, etc.). + return {np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine, + }[self.codes.dtype.type] + _attributes = ['name'] def __new__(cls, data=None, categories=None, ordered=None, dtype=None, @@ -382,7 +392,7 @@ def argsort(self, *args, **kwargs): def _engine(self): # we are going to look things up with the codes themselves - return self._engine_type(lambda: self.codes.astype('i8'), len(self)) + return self._engine_type(lambda: self.codes, len(self)) # introspection @cache_readonly @@ -450,6 +460,7 @@ def get_loc(self, key, method=None): array([False, True, False, True], dtype=bool) """ code = self.categories.get_loc(key) + code = self.codes.dtype.type(code) try: return self._engine.get_loc(code) except KeyError: diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 99058f883a3923..d89baa41d33fe9 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -1,16 +1,16 @@ # -*- coding: utf-8 -*- import pytest +import numpy as np import pandas.util.testing as tm from pandas.core.indexes.api import Index, CategoricalIndex from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas._libs import index as libindex from .common import Base from pandas.compat import range, PY3 -import numpy as np - from pandas import Categorical, IntervalIndex, compat from pandas.util.testing import assert_almost_equal import 
pandas.core.config as cf @@ -1117,3 +1117,23 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" tm.assert_raises_regex(ValueError, msg, idx.take, indices, mode='clip') + + @pytest.mark.parametrize('dtype, engine_type', [ + (np.int8, libindex.Int8Engine), + (np.int16, libindex.Int16Engine), + (np.int32, libindex.Int32Engine), + (np.int64, libindex.Int64Engine), + ]) + def test_engine_type(self, dtype, engine_type): + if dtype != np.int64: + # num. of uniques required to push CategoricalIndex.codes to a + # dtype (128 categories required for .codes dtype to be int16 etc.) + num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype] + ci = pd.CategoricalIndex(range(num_uniques)) + else: + # having 2**32 - 2**31 categories would be very memory-intensive, + # so we cheat a bit with the dtype + ci = pd.CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1) + ci.values._codes = ci.values._codes.astype('int64') + assert np.issubdtype(ci.codes.dtype, dtype) + assert isinstance(ci._engine, engine_type) diff --git a/pandas/tests/indexing/conftest.py b/pandas/tests/indexing/conftest.py new file mode 100644 index 00000000000000..be1cf4800a2ef3 --- /dev/null +++ b/pandas/tests/indexing/conftest.py @@ -0,0 +1,20 @@ +import numpy as np +import pytest + +from pandas._libs import index as libindex + + +@pytest.fixture(params=[ + (libindex.Int64Engine, np.int64), + (libindex.Int32Engine, np.int32), + (libindex.Int16Engine, np.int16), + (libindex.Int8Engine, np.int8), + (libindex.UInt64Engine, np.uint64), + (libindex.UInt32Engine, np.uint32), + (libindex.UInt16Engine, np.uint16), + (libindex.UInt8Engine, np.uint8), + (libindex.Float64Engine, np.float64), + (libindex.Float32Engine, np.float32), +], ids=lambda x: x[0].__name__) +def numeric_indexing_engine_type_and_dtype(request): + return request.param diff --git a/pandas/tests/indexing/test_indexing_engines.py b/pandas/tests/indexing/test_indexing_engines.py new file mode 100644 index 
00000000000000..410eba99948ce1 --- /dev/null +++ b/pandas/tests/indexing/test_indexing_engines.py @@ -0,0 +1,168 @@ +import numpy as np + +import pandas.util.testing as tm +from pandas import compat +from pandas._libs import algos as libalgos, index as libindex + + +class TestNumericEngine(object): + def test_is_monotonic(self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + num = 1000 + arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype) + + # monotonic increasing + engine = engine_type(lambda: arr, len(arr)) + assert engine.is_monotonic_increasing is True + assert engine.is_monotonic_decreasing is False + + # monotonic decreasing + engine = engine_type(lambda: arr[::-1], len(arr)) + assert engine.is_monotonic_increasing is False + assert engine.is_monotonic_decreasing is True + + # neither monotonic increasing or decreasing + arr = np.array([1] * num + [2] * num + [1] * num, dtype=dtype) + engine = engine_type(lambda: arr[::-1], len(arr)) + assert engine.is_monotonic_increasing is False + assert engine.is_monotonic_decreasing is False + + def test_is_unique(self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + + # unique + arr = np.array([1, 3, 2], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + assert engine.is_unique is True + + # not unique + arr = np.array([1, 2, 1], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + assert engine.is_unique is False + + def test_get_loc(self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + + # unique + arr = np.array([1, 2, 3], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + assert engine.get_loc(2) == 1 + + # monotonic + num = 1000 + arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + assert engine.get_loc(2) == slice(1000, 2000) + + # not monotonic + 
arr = np.array([1, 2, 3] * num, dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + expected = np.array([False, True, False] * num, dtype=bool) + result = engine.get_loc(2) + assert (result == expected).all() + + def test_get_backfill_indexer( + self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + + arr = np.array([1, 5, 10], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + + new = np.array(compat.range(12), dtype=dtype) + result = engine.get_backfill_indexer(new) + + expected = libalgos.backfill(arr, new) + tm.assert_numpy_array_equal(result, expected) + + def test_get_pad_indexer( + self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + + arr = np.array([1, 5, 10], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + + new = np.array(compat.range(12), dtype=dtype) + result = engine.get_pad_indexer(new) + + expected = libalgos.pad(arr, new) + tm.assert_numpy_array_equal(result, expected) + + +class TestObjectEngine(object): + engine_type = libindex.ObjectEngine + dtype = np.object_ + values = list('abc') + + def test_is_monotonic(self): + + num = 1000 + arr = np.array(['a'] * num + ['a'] * num + ['c'] * num, + dtype=self.dtype) + + # monotonic increasing + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.is_monotonic_increasing is True + assert engine.is_monotonic_decreasing is False + + # monotonic decreasing + engine = self.engine_type(lambda: arr[::-1], len(arr)) + assert engine.is_monotonic_increasing is False + assert engine.is_monotonic_decreasing is True + + # neither monotonic increasing or decreasing + arr = np.array(['a'] * num + ['b'] * num + ['a'] * num, + dtype=self.dtype) + engine = self.engine_type(lambda: arr[::-1], len(arr)) + assert engine.is_monotonic_increasing is False + assert engine.is_monotonic_decreasing is False + + def test_is_unique(self): + # unique + arr = np.array(self.values, 
dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.is_unique is True + + # not unique + arr = np.array(['a', 'b', 'a'], dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.is_unique is False + + def test_get_loc(self): + # unique + arr = np.array(self.values, dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.get_loc('b') == 1 + + # monotonic + num = 1000 + arr = np.array(['a'] * num + ['b'] * num + ['c'] * num, + dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.get_loc('b') == slice(1000, 2000) + + # not monotonic + arr = np.array(self.values * num, dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + expected = np.array([False, True, False] * num, dtype=bool) + result = engine.get_loc('b') + assert (result == expected).all() + + def test_get_backfill_indexer(self): + arr = np.array(['a', 'e', 'j'], dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + + new = np.array(list('abcdefghij'), dtype=self.dtype) + result = engine.get_backfill_indexer(new) + + expected = libalgos.backfill_object(arr, new) + tm.assert_numpy_array_equal(result, expected) + + def test_get_pad_indexer(self): + arr = np.array(['a', 'e', 'j'], dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + + new = np.array(list('abcdefghij'), dtype=self.dtype) + result = engine.get_pad_indexer(new) + + expected = libalgos.pad_object(arr, new) + tm.assert_numpy_array_equal(result, expected)