Skip to content

Commit

Permalink
ENH: Cython Reducer, speed up DataFrame.apply significantly, GH #309
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Nov 13, 2011
1 parent a1e2798 commit 74f5d6d
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 45 deletions.
3 changes: 3 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,11 @@ pandas 0.5.1
for fast conversion to DataFrame (GH #357)
- Can pass multiple levels to groupby, e.g. `df.groupby(level=[0, 1])` (GH
#103)
- Can sort by multiple columns in `DataFrame.sort_index` (GH #92, PR #362)
- Add fast `get_value` and `set_value` methods to DataFrame and
micro-performance tweaks (GH #360)
- Add `cov` instance methods to Series and DataFrame (GH #194, PR #362)

**Improvements to existing features**

Expand Down
15 changes: 13 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1520,7 +1520,6 @@ def sort_index(self, axis=0, by=None, ascending=True):
else:
to_sort = self[by].values

# stable sort
indexer = to_sort.argsort()
else:
indexer = labels.argsort()
Expand Down Expand Up @@ -2187,7 +2186,10 @@ def apply(self, func, axis=0, broadcast=False, raw=False):
return self._apply_broadcast(func, axis)

def _apply_raw(self, func, axis):
    """Apply ``func`` to the raw ndarray of values along ``axis``.

    Tries the fast Cython reducer first and falls back to
    ``np.apply_along_axis`` for functions the reducer cannot handle.

    Parameters
    ----------
    func : callable
        Function applied to each 1-D slice of the values.
    axis : int
        0 to apply down columns, 1 to apply across rows.

    Returns
    -------
    Series if ``func`` reduces each slice to a scalar, otherwise a
    DataFrame when the result is still 2-D.
    """
    try:
        # Fast path: Cython Reducer avoids constructing a new ndarray
        # for every slice (GH #309)
        result = lib.reduce(self.values, func, axis=axis)
    except Exception:
        # Best-effort fallback; lib.reduce may reject the function or
        # dtype, in which case numpy does the (slower) work
        result = np.apply_along_axis(func, axis, self.values)

    # TODO: mixed type case
    if result.ndim == 2:
        # func did not reduce; wrap the 2-D result with original labels
        return DataFrame(result, index=self.index,
                         columns=self.columns)
    else:
        return Series(result, index=self._get_agg_axis(axis))

def _apply_standard(self, func, axis, ignore_failures=False):
try:
values = self.values
dummy = Series(np.nan, index=self._get_axis(axis),
dtype=values.dtype)
result = lib.reduce(values, func, axis=axis, dummy=dummy)
return Series(result, index=self._get_agg_axis(axis))
except Exception:
pass

if axis == 0:
series_gen = ((c, self[c]) for c in self.columns)
res_index = self.columns
Expand Down
83 changes: 83 additions & 0 deletions pandas/src/reduce.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from numpy cimport *
import numpy as np

cdef class Reducer:
    '''
    Performs generic reduction operation on a C or Fortran-contiguous ndarray
    while avoiding ndarray construction overhead
    '''
    cdef:
        Py_ssize_t increment, chunksize, nresults
        object arr, dummy, f

    def __init__(self, object arr, object f, axis=1, dummy=None):
        # Make the reduced dimension contiguous in memory so each chunk
        # is a fixed-size stride through the underlying buffer.
        n, k = arr.shape

        if axis == 0:
            if not arr.flags.f_contiguous:
                arr = arr.copy('F')

            self.nresults = k
            self.chunksize = n
            self.increment = n * arr.dtype.itemsize
        else:
            if not arr.flags.c_contiguous:
                arr = arr.copy('C')

            self.nresults = n
            self.chunksize = k
            self.increment = k * arr.dtype.itemsize

        self.f = f
        self.arr = arr
        self.dummy = self._check_dummy(dummy)

    def _check_dummy(self, dummy=None):
        # The dummy array's data pointer is repointed at the real buffer in
        # get_result, so its dtype and length must match exactly.
        if dummy is None:
            dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
        else:
            if dummy.dtype != self.arr.dtype:
                raise ValueError('Dummy array must be same dtype')
            if len(dummy) != self.chunksize:
                raise ValueError('Dummy array must be length %d' %
                                 self.chunksize)

        return dummy

    def get_result(self):
        cdef:
            char* dummy_buf
            ndarray arr, result, chunk
            Py_ssize_t i
            flatiter it

        arr = self.arr
        chunk = self.dummy

        result = np.empty(self.nresults, dtype=self.arr.dtype)
        it = <flatiter> PyArray_IterNew(result)

        # BUG FIX: original read ``self.chunk``, an attribute that does not
        # exist on this class (would raise AttributeError); the trial
        # reduction must be run against the dummy array.
        test = self.f(self.dummy)
        try:
            result[0] = test
        except Exception:
            raise ValueError('function does not reduce')

        # Repoint the dummy's data at successive chunks of the real buffer;
        # keep the original pointer so it can be restored afterwards.
        dummy_buf = chunk.data
        chunk.data = arr.data

        try:
            for i in range(self.nresults):
                PyArray_SETITEM(result, PyArray_ITER_DATA(it),
                                self.f(chunk))
                chunk.data = chunk.data + self.increment
                PyArray_ITER_NEXT(it)
        finally:
            # so we don't free the wrong memory
            chunk.data = dummy_buf

        return result

def reduce(arr, f, axis=0, dummy=None):
    '''
    Apply reduction function ``f`` to each slice of ``arr`` along ``axis``
    using the Reducer machinery, avoiding per-slice ndarray construction.
    '''
    return Reducer(arr, f, axis=axis, dummy=dummy).get_result()
46 changes: 5 additions & 41 deletions pandas/src/sandbox.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,10 @@ import numpy as np

import_array()

cdef class ArrayCruncher:
cdef class SeriesIterator:

cdef:
ndarray arr
object f
bint raw
Py_ssize_t N, K
def __init__(self, arr):
pass

def __init__(self, arr, f, axis=0, raw=True):
self.arr = arr
self.f = f
self.raw = raw
self.N, self.K = arr.shape

def reduce(self):
cdef:
char* dummy_buf
ndarray arr, result, chunk
Py_ssize_t i, increment
flatiter it

if not self.arr.flags.c_contiguous:
arr = self.arr.copy('C')
else:
arr = self.arr

increment = self.K * self.arr.dtype.itemsize
chunk = np.empty(self.K, dtype=arr.dtype)
result = np.empty(self.N, dtype=arr.dtype)
it = <flatiter> PyArray_IterNew(result)

dummy_buf = chunk.data
chunk.data = arr.data

for i in range(self.N):
PyArray_SETITEM(result, PyArray_ITER_DATA(it), self.f(chunk))
chunk.data = chunk.data + increment
PyArray_ITER_NEXT(it)

# so we don't free the wrong memory
chunk.data = dummy_buf

return result
def next(self):
pass
2 changes: 1 addition & 1 deletion pandas/src/tseries.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -473,4 +473,4 @@ include "moments.pyx"
include "reindex.pyx"
include "generated.pyx"
include "parsing.pyx"

include "reduce.pyx"
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def run(self):
cmdclass['sdist'] = CheckSDist

tseries_depends = ['reindex', 'groupby', 'skiplist', 'moments',
'generated', 'parsing']
'generated', 'parsing', 'reduce']
def srcpath(name=None, suffix='.pyx', subdir='src'):
    """Return the path 'pandas/<subdir>/<name><suffix>' for a source file."""
    filename = name + suffix
    return pjoin('pandas', subdir, filename)

Expand Down

0 comments on commit 74f5d6d

Please sign in to comment.