Skip to content

Commit

Permalink
PERF: speed up CategoricalIndex.get_loc (pandas-dev#23235)
Browse files Browse the repository at this point in the history
  • Loading branch information
topper-123 authored and tm9k1 committed Nov 19, 2018
1 parent 708e0b1 commit eb02b72
Show file tree
Hide file tree
Showing 10 changed files with 299 additions and 32 deletions.
28 changes: 20 additions & 8 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,30 @@
import numpy as np

from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine,
ObjectEngine)
from pandas._libs import index as libindex


def _get_numeric_engines():
    """Return ``(engine class, numpy dtype)`` pairs for the numeric engines.

    Each engine is looked up by name on ``libindex``; names that are not
    attributes of ``libindex`` are skipped rather than raising, so the
    returned list only contains engines that actually exist.
    """
    engine_names = [
        ('Int64Engine', np.int64), ('Int32Engine', np.int32),
        ('Int16Engine', np.int16), ('Int8Engine', np.int8),
        ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32),
        # The name must match the class name exactly: a misspelling is
        # silently filtered out by the ``hasattr`` check below.
        ('UInt16Engine', np.uint16), ('UInt8Engine', np.uint8),
        ('Float64Engine', np.float64), ('Float32Engine', np.float32),
    ]
    return [(getattr(libindex, engine_name), dtype)
            for engine_name, dtype in engine_names
            if hasattr(libindex, engine_name)]


class NumericEngineIndexing(object):

params = [[Int64Engine, UInt64Engine, Float64Engine],
[np.int64, np.uint64, np.float64],
params = [_get_numeric_engines(),
['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
]
param_names = ['engine', 'dtype', 'index_type']
param_names = ['engine_and_dtype', 'index_type']

def setup(self, engine, dtype, index_type):
def setup(self, engine_and_dtype, index_type):
engine, dtype = engine_and_dtype
N = 10**5
values = list([1] * N + [2] * N + [3] * N)
arr = {
Expand All @@ -26,7 +38,7 @@ def setup(self, engine, dtype, index_type):
# the call below populates the mapping etc. up front, so it is not timed.
self.data.get_loc(2)

def time_get_loc(self, engine, dtype, index_type):
def time_get_loc(self, engine_and_dtype, index_type):
self.data.get_loc(2)


Expand All @@ -44,7 +56,7 @@ def setup(self, index_type):
'non_monotonic': np.array(list('abc') * N, dtype=object),
}[index_type]

self.data = ObjectEngine(lambda: arr, len(arr))
self.data = libindex.ObjectEngine(lambda: arr, len(arr))
# the call below populates the mapping etc. up front, so it is not timed.
self.data.get_loc('b')

Expand Down
8 changes: 5 additions & 3 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -949,9 +949,11 @@ Removal of prior version deprecations/changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
both when indexing by label (using .loc) and position(.iloc).
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
- Slicing Series and DataFrames with a monotonically increasing :class:`CategoricalIndex`
is now very fast and has speed comparable to slicing with an ``Int64Index``.
The speed increase applies both when indexing by label (using ``.loc``) and by position (using ``.iloc``) (:issue:`20395`).
Slicing a monotonically increasing :class:`CategoricalIndex` itself (i.e. ``ci[1000:2000]``)
shows similar speed improvements (:issue:`21659`)
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
Expand Down
24 changes: 22 additions & 2 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ from libc.math cimport fabs, sqrt
import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray,
NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8,
NPY_FLOAT32, NPY_FLOAT64,
NPY_OBJECT,
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
Expand Down Expand Up @@ -359,9 +360,13 @@ ctypedef fused algos_t:
float64_t
float32_t
object
int32_t
int64_t
int32_t
int16_t
int8_t
uint64_t
uint32_t
uint16_t
uint8_t


Expand Down Expand Up @@ -459,7 +464,12 @@ pad_float32 = pad["float32_t"]
pad_object = pad["object"]
pad_int64 = pad["int64_t"]
pad_int32 = pad["int32_t"]
pad_int16 = pad["int16_t"]
pad_int8 = pad["int8_t"]
pad_uint64 = pad["uint64_t"]
pad_uint32 = pad["uint32_t"]
pad_uint16 = pad["uint16_t"]
pad_uint8 = pad["uint8_t"]
pad_bool = pad["uint8_t"]


Expand Down Expand Up @@ -653,7 +663,12 @@ backfill_float32 = backfill["float32_t"]
backfill_object = backfill["object"]
backfill_int64 = backfill["int64_t"]
backfill_int32 = backfill["int32_t"]
backfill_int16 = backfill["int16_t"]
backfill_int8 = backfill["int8_t"]
backfill_uint64 = backfill["uint64_t"]
backfill_uint32 = backfill["uint32_t"]
backfill_uint16 = backfill["uint16_t"]
backfill_uint8 = backfill["uint8_t"]
backfill_bool = backfill["uint8_t"]


Expand Down Expand Up @@ -866,7 +881,12 @@ is_monotonic_float32 = is_monotonic["float32_t"]
is_monotonic_object = is_monotonic["object"]
is_monotonic_int64 = is_monotonic["int64_t"]
is_monotonic_int32 = is_monotonic["int32_t"]
is_monotonic_int16 = is_monotonic["int16_t"]
is_monotonic_int8 = is_monotonic["int8_t"]
is_monotonic_uint64 = is_monotonic["uint64_t"]
is_monotonic_uint32 = is_monotonic["uint32_t"]
is_monotonic_uint16 = is_monotonic["uint16_t"]
is_monotonic_uint8 = is_monotonic["uint8_t"]
is_monotonic_bool = is_monotonic["uint8_t"]


Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
('uint8', 'UINT8', 'uint8'),
('uint16', 'UINT16', 'uint16'),
('uint32', 'UINT32', 'uint32'),
('uint64', 'UINT64', 'uint64'),
# ('platform_int', 'INT', 'int_'),
# ('object', 'OBJECT', 'object_'),
Expand Down
6 changes: 4 additions & 2 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import cython

import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray, float64_t, int32_t,
int64_t, uint8_t, uint64_t, intp_t,
from numpy cimport (ndarray, intp_t,
float64_t, float32_t,
int64_t, int32_t, int16_t, int8_t,
uint64_t, uint32_t, uint16_t, uint8_t,
# Note: NPY_DATETIME, NPY_TIMEDELTA are only available
# for cimport in cython>=0.27.3
NPY_DATETIME, NPY_TIMEDELTA)
Expand Down
35 changes: 22 additions & 13 deletions pandas/_libs/index_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,22 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

{{py:

# name, dtype, ctype
dtypes = [('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Int64', 'int64', 'int64_t'),
('Object', 'object', 'object')]
# name, dtype, ctype, hashtable_name, hashtable_dtype
dtypes = [('Float64', 'float64', 'float64_t', 'Float64', 'float64'),
('Float32', 'float32', 'float32_t', 'Float64', 'float64'),
('Int64', 'int64', 'int64_t', 'Int64', 'int64'),
('Int32', 'int32', 'int32_t', 'Int64', 'int64'),
('Int16', 'int16', 'int16_t', 'Int64', 'int64'),
('Int8', 'int8', 'int8_t', 'Int64', 'int64'),
('UInt64', 'uint64', 'uint64_t', 'UInt64', 'uint64'),
('UInt32', 'uint32', 'uint32_t', 'UInt64', 'uint64'),
('UInt16', 'uint16', 'uint16_t', 'UInt64', 'uint64'),
('UInt8', 'uint8', 'uint8_t', 'UInt64', 'uint64'),
('Object', 'object', 'object', 'PyObject', 'object'),
]
}}

{{for name, dtype, ctype in dtypes}}
{{for name, dtype, ctype, hashtable_name, hashtable_dtype in dtypes}}


cdef class {{name}}Engine(IndexEngine):
Expand All @@ -34,13 +42,9 @@ cdef class {{name}}Engine(IndexEngine):
other, limit=limit)

cdef _make_hash_table(self, n):
{{if name == 'Object'}}
return _hash.PyObjectHashTable(n)
{{else}}
return _hash.{{name}}HashTable(n)
{{endif}}
return _hash.{{hashtable_name}}HashTable(n)

{{if name != 'Float64' and name != 'Object'}}
{{if name not in {'Float64', 'Float32', 'Object'} }}
cdef _check_type(self, object val):
hash(val)
if util.is_bool_object(val):
Expand All @@ -50,6 +54,11 @@ cdef class {{name}}Engine(IndexEngine):
{{endif}}

{{if name != 'Object'}}
cpdef _call_map_locations(self, values):
    # Populate the hashtable (self.mapping) from ``values``.
    #
    # self.mapping is of type {{hashtable_name}}HashTable,
    # so convert dtype of values
    # (per the dtypes table in this template, the narrower engines reuse
    # the 64-bit hashtable of their kind, e.g. Int8/Int16/Int32 all use
    # Int64HashTable, hence the ensure_{{hashtable_dtype}} conversion).
    self.mapping.map_locations(algos.ensure_{{hashtable_dtype}}(values))

cdef _get_index_values(self):
return algos.ensure_{{dtype}}(self.vgetter())

Expand All @@ -60,7 +69,7 @@ cdef class {{name}}Engine(IndexEngine):
ndarray[{{ctype}}] values
int count = 0

{{if name != 'Float64'}}
{{if name not in {'Float64', 'Float32'} }}
if not util.is_integer_object(val):
raise KeyError(val)
{{endif}}
Expand Down
15 changes: 13 additions & 2 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,17 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
"""

_typ = 'categoricalindex'
_engine_type = libindex.Int64Engine

@property
def _engine_type(self):
    """Engine class whose integer width matches the dtype of ``self.codes``.

    ``self.codes`` can have dtype int8, int16, int32 or int64; the engine
    of the same width (``libindex.Int8Engine`` etc.) is returned. Any other
    codes dtype raises ``KeyError``.
    """
    codes_scalar_type = self.codes.dtype.type
    engine_by_codes_type = {
        np.int8: libindex.Int8Engine,
        np.int16: libindex.Int16Engine,
        np.int32: libindex.Int32Engine,
        np.int64: libindex.Int64Engine,
    }
    return engine_by_codes_type[codes_scalar_type]

_attributes = ['name']

def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
Expand Down Expand Up @@ -382,7 +392,7 @@ def argsort(self, *args, **kwargs):
def _engine(self):

# we are going to look things up with the codes themselves
return self._engine_type(lambda: self.codes.astype('i8'), len(self))
return self._engine_type(lambda: self.codes, len(self))

# introspection
@cache_readonly
Expand Down Expand Up @@ -450,6 +460,7 @@ def get_loc(self, key, method=None):
array([False, True, False, True], dtype=bool)
"""
code = self.categories.get_loc(key)
code = self.codes.dtype.type(code)
try:
return self._engine.get_loc(code)
except KeyError:
Expand Down
24 changes: 22 additions & 2 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*-

import pytest
import numpy as np

import pandas.util.testing as tm
from pandas.core.indexes.api import Index, CategoricalIndex
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas._libs import index as libindex
from .common import Base

from pandas.compat import range, PY3

import numpy as np

from pandas import Categorical, IntervalIndex, compat
from pandas.util.testing import assert_almost_equal
import pandas.core.config as cf
Expand Down Expand Up @@ -1117,3 +1117,23 @@ def test_take_invalid_kwargs(self):
msg = "the 'mode' parameter is not supported"
tm.assert_raises_regex(ValueError, msg, idx.take,
indices, mode='clip')

@pytest.mark.parametrize('dtype, engine_type', [
    (np.int8, libindex.Int8Engine),
    (np.int16, libindex.Int16Engine),
    (np.int32, libindex.Int32Engine),
    (np.int64, libindex.Int64Engine),
])
def test_engine_type(self, dtype, engine_type):
    """Check that the engine of a CategoricalIndex matches its codes dtype."""
    if dtype == np.int64:
        # Having 2**32 - 2**31 categories would be very memory-intensive,
        # so build a smaller index and cast its codes to int64 by hand.
        ci = pd.CategoricalIndex(range(32768))  # == 2**16 - 2**(16 - 1)
        widened_codes = ci.values._codes.astype('int64')
        ci.values._codes = widened_codes
    else:
        # Number of unique categories needed to push
        # CategoricalIndex.codes to the given dtype (e.g. 128 categories
        # are needed for .codes to become int16).
        uniques_needed = {np.int8: 1, np.int16: 128, np.int32: 32768}
        ci = pd.CategoricalIndex(range(uniques_needed[dtype]))
    assert np.issubdtype(ci.codes.dtype, dtype)
    assert isinstance(ci._engine, engine_type)
20 changes: 20 additions & 0 deletions pandas/tests/indexing/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import numpy as np
import pytest

from pandas._libs import index as libindex


@pytest.fixture(params=[
    (libindex.Int64Engine, np.int64),
    (libindex.Int32Engine, np.int32),
    (libindex.Int16Engine, np.int16),
    (libindex.Int8Engine, np.int8),
    (libindex.UInt64Engine, np.uint64),
    (libindex.UInt32Engine, np.uint32),
    (libindex.UInt16Engine, np.uint16),
    (libindex.UInt8Engine, np.uint8),
    (libindex.Float64Engine, np.float64),
    (libindex.Float32Engine, np.float32),
], ids=lambda x: x[0].__name__)
def numeric_indexing_engine_type_and_dtype(request):
    """Parametrized fixture returning an ``(engine class, numpy dtype)`` pair.

    One pair is supplied per numeric engine listed in ``params``; the test
    id of each parametrization is the engine class name.
    """
    return request.param
Loading

0 comments on commit eb02b72

Please sign in to comment.