Skip to content

Commit

Permalink
PERF: perform reductions block-wise (pandas-dev#29847)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and hweecat committed Jan 1, 2020
1 parent f4d3806 commit d7ff4e6
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 2 deletions.
20 changes: 20 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7746,6 +7746,26 @@ def _get_data(axis_matters):
raise NotImplementedError(msg)
return data

if numeric_only is not None and axis in [0, 1]:
df = self
if numeric_only is True:
df = _get_data(axis_matters=True)
if axis == 1:
df = df.T
axis = 0

out_dtype = "bool" if filter_type == "bool" else None

# After possibly calling _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._data.reduce(op, axis=1, skipna=skipna, **kwds)
assert isinstance(res, dict)
if len(res):
assert len(res) == max(list(res.keys())) + 1, res.keys()
out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype)
out.index = df.columns
return out

if numeric_only is None:
values = self.values
try:
Expand Down
26 changes: 26 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,32 @@ def _verify_integrity(self):
f"tot_items: {tot_items}"
)

def reduce(self, func, *args, **kwargs):
    """
    Apply ``func`` to the values of each block and gather the results.

    Parameters
    ----------
    func : callable
        Called as ``func(blk.values, *args, **kwargs)`` for each block.
    *args, **kwargs
        Forwarded unchanged to ``func``.

    Returns
    -------
    scalar or dict
        When ``self.ndim == 1``, whatever ``func`` returns for the first
        block.  Otherwise a dict keyed by the entries of each block's
        ``mgr_locs.as_array``, holding the matching entry of that block's
        result.
    """
    if self.ndim == 1:
        # Only the first block is consulted; its result is returned as-is.
        return func(self.blocks[0].values, *args, **kwargs)

    # Otherwise (the 2D case) the operation is assumed to be column-wise.
    res = {}
    for blk in self.blocks:
        bres = func(blk.values, *args, **kwargs)

        if np.ndim(bres) != 0:
            # One result entry is expected per row of the block.
            assert bres.ndim == 1, bres.shape
            assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs)
            block_results = bres
        else:
            # EA: a scalar result is only valid for a single-row block.
            assert blk.shape[0] == 1
            block_results = [bres]

        nr = dict(zip(blk.mgr_locs.as_array, block_results))
        # No location may be reported by more than one block.
        assert res.keys().isdisjoint(nr)
        res.update(nr)

    return res

def apply(self, f, filter=None, **kwargs):
"""
Iterate over the blocks, collect and create a new BlockManager.
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ def reduction(values, axis=None, skipna=True, mask=None):
try:
result = getattr(values, meth)(axis, dtype=dtype_max)
result.fill(np.nan)
except (AttributeError, TypeError, ValueError, np.core._internal.AxisError):
except (AttributeError, TypeError, ValueError):
result = np.nan
else:
result = getattr(values, meth)(axis)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ def test_omit_nuisance(df):

# won't work with axis = 1
grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
msg = r"unsupported operand type\(s\) for \+: 'Timestamp'"
msg = "reduction operation 'sum' not allowed for this dtype"
with pytest.raises(TypeError, match=msg):
grouped.agg(lambda x: x.sum(0, numeric_only=False))

Expand Down

0 comments on commit d7ff4e6

Please sign in to comment.