From d7ff4e6c3254a5f197f90531b3f20f064eab1916 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 1 Jan 2020 09:18:20 -0800
Subject: [PATCH] PERF: perform reductions block-wise (#29847)

---
Reviewer notes (this section is ignored when the patch is applied):

* frame.py: when numeric_only is not None and axis is 0 or 1, the frame is
  optionally filtered via _get_data, transposed for axis=1, and reduced
  through df._data.reduce(op, axis=1, ...); the returned dict is turned
  into a Series-like result indexed by df.columns.
* managers.py: adds reduce(), which calls func on each block's values and
  merges the per-block results into one dict keyed by blk.mgr_locs; for a
  1-D manager it returns func's result on the single block directly.
* nanops.py: np.core._internal.AxisError is no longer caught in reduction().
* test_groupby.py: the expected TypeError message in test_omit_nuisance is
  updated to match the new code path.

 pandas/core/frame.py                 | 20 ++++++++++++++++++++
 pandas/core/internals/managers.py    | 26 ++++++++++++++++++++++++++
 pandas/core/nanops.py                |  2 +-
 pandas/tests/groupby/test_groupby.py |  2 +-
 4 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 0951f635b0e093..732b28d6a97fe1 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7746,6 +7746,26 @@ def _get_data(axis_matters):
                 raise NotImplementedError(msg)
             return data
 
+        if numeric_only is not None and axis in [0, 1]:
+            df = self
+            if numeric_only is True:
+                df = _get_data(axis_matters=True)
+            if axis == 1:
+                df = df.T
+                axis = 0
+
+            out_dtype = "bool" if filter_type == "bool" else None
+
+            # After possibly _get_data and transposing, we are now in the
+            # simple case where we can use BlockManager.reduce
+            res = df._data.reduce(op, axis=1, skipna=skipna, **kwds)
+            assert isinstance(res, dict)
+            if len(res):
+                assert len(res) == max(list(res.keys())) + 1, res.keys()
+            out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype)
+            out.index = df.columns
+            return out
+
         if numeric_only is None:
             values = self.values
             try:
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 32cd65b4dc7d68..995e6f0aaad8e7 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -340,6 +340,32 @@ def _verify_integrity(self):
                 f"tot_items: {tot_items}"
             )
 
+    def reduce(self, func, *args, **kwargs):
+        # If 2D, we assume that we're operating column-wise
+        if self.ndim == 1:
+            # we'll be returning a scalar
+            blk = self.blocks[0]
+            return func(blk.values, *args, **kwargs)
+
+        res = {}
+        for blk in self.blocks:
+            bres = func(blk.values, *args, **kwargs)
+
+            if np.ndim(bres) == 0:
+                # EA
+                assert blk.shape[0] == 1
+                new_res = zip(blk.mgr_locs.as_array, [bres])
+            else:
+                assert bres.ndim == 1, bres.shape
+                assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs)
+                new_res = zip(blk.mgr_locs.as_array, bres)
+
+            nr = dict(new_res)
+            assert not any(key in res for key in nr)
+            res.update(nr)
+
+        return res
+
     def apply(self, f, filter=None, **kwargs):
         """
         Iterate over the blocks, collect and create a new BlockManager.
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 1079f516a4e40e..584972f2b2dd5f 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -831,7 +831,7 @@ def reduction(values, axis=None, skipna=True, mask=None):
             try:
                 result = getattr(values, meth)(axis, dtype=dtype_max)
                 result.fill(np.nan)
-            except (AttributeError, TypeError, ValueError, np.core._internal.AxisError):
+            except (AttributeError, TypeError, ValueError):
                 result = np.nan
         else:
             result = getattr(values, meth)(axis)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 16c98f019b99d4..930d0a998e08ca 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -771,7 +771,7 @@ def test_omit_nuisance(df):
 
     # won't work with axis = 1
     grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
-    msg = r"unsupported operand type\(s\) for \+: 'Timestamp'"
+    msg = "reduction operation 'sum' not allowed for this dtype"
     with pytest.raises(TypeError, match=msg):
         grouped.agg(lambda x: x.sum(0, numeric_only=False))
 