pandas-dev · jbrockmendel · Mar 27, 2020 · Jan 13, 2020 · Feb 13, 2020 · Feb 13, 2020
diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
@@ -0,0 +1,47 @@
+"""
+masked_reductions.py is for reduction algorithms using a mask-based approach
+for missing values.
+"""
+
+import numpy as np
+
+from pandas._libs import missing as libmissing
+from pandas.compat.numpy import _np_version_under1p17
+
+from pandas.core.nanops import _below_min_count
+
+
+def sum(
+    values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0,
+):
+    """
+    Sum for 1D masked array.
+
+    Parameters
+    ----------
+    values : np.ndarray
+        Numpy array with the values (can be of any dtype that supports the
+        operation).
+    mask : np.ndarray
+        Boolean numpy array (True values indicate missing values).
+    skipna : bool, default True
+        Whether to skip NA. If False, any missing value makes the result NA.
+    min_count : int, default 0
+        The required number of valid values to perform the operation. If fewer than
+        ``min_count`` non-NA values are present the result will be NA.
+
+    Returns
+    -------
+    scalar or libmissing.NA
+        ``np.sum`` of the valid values, or NA under the conditions above.
+    """
+    if not skipna:
+        if mask.any() or _below_min_count(values.shape, None, min_count):
+            return libmissing.NA
+        return np.sum(values)
+    if _below_min_count(values.shape, mask, min_count):
+        return libmissing.NA
+    if _np_version_under1p17:
+        # np.sum only accepts ``where=`` from NumPy 1.17 on; index instead.
+        return np.sum(values[~mask])
+    return np.sum(values, where=~mask)
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.missing import isna, notna
 
 from pandas.core import nanops, ops
+from pandas.core.array_algos import masked_reductions
 from pandas.core.indexers import check_array_indexer
 
 from .masked import BaseMaskedArray
@@ -695,6 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        if name == "sum":
+            return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
+
         # coerce to a nan-aware float if needed
         if self._hasna:
             data = self.to_numpy("float64", na_value=np.nan)
@@ -706,7 +710,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
             return libmissing.NA
 
         # if we have numeric op that would result in an int, coerce to int if possible
-        if name in ["sum", "prod"] and notna(result):
+        if name == "prod" and notna(result):
             int_result = np.int64(result)
             if int_result == result:
                 result = int_result

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import nanops, ops
+from pandas.core.array_algos import masked_reductions
 import pandas.core.common as com
 from pandas.core.indexers import check_array_indexer
 from pandas.core.ops import invalid_comparison
@@ -560,6 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        if name == "sum":
+            return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
+
         # coerce to a nan-aware float if needed
         # (we explicitly use NaN within reductions)
         if self._hasna:
@@ -577,7 +581,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
 
         # if we have a preservable numeric op,
         # provide coercion back to an integer type if possible
-        elif name in ["sum", "min", "max", "prod"]:
+        elif name in ["min", "max", "prod"]:
             # GH#31409 more performant than casting-then-checking
             result = com.cast_scalar_indexer(result)
 

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -1238,7 +1238,7 @@ def _maybe_null_out(
     result: np.ndarray,
     axis: Optional[int],
     mask: Optional[np.ndarray],
-    shape: Tuple,
+    shape: Tuple[int],
     min_count: int = 1,
 ) -> float:
     """
@@ -1260,16 +1260,41 @@ def _maybe_null_out(
                 # GH12941, use None to auto cast null
                 result[null_mask] = None
     elif result is not NaT:
-        if mask is not None:
-            null_mask = mask.size - mask.sum()
-        else:
-            null_mask = np.prod(shape)
-        if null_mask < min_count:
+        if _below_min_count(shape, mask, min_count):
             result = np.nan
 
     return result
 
 
+def _below_min_count(
+    shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int
+) -> bool:
+    """
+    Check for the `min_count` keyword. Returns True if below `min_count` (when
+    missing value should be returned from the reduction).
+
+    Parameters
+    ----------
+    shape : tuple of int
+        The shape of the values (`values.shape`), of any dimensionality.
+    mask : ndarray or None
+        Boolean numpy array (typically of same shape as `shape`) or None.
+    min_count : int
+        Keyword passed through from sum/prod call.
+
+    Returns
+    -------
+    bool
+    """
+    if min_count <= 0:
+        # a non-positive min_count can never be violated
+        return False
+
+    # no mask means no missing values, so only the size matters
+    valid = np.prod(shape) if mask is None else mask.size - mask.sum()
+    return bool(valid < min_count)
+
+
 def _zero_out_fperr(arg):
     # #18044 reference this behavior to fix rolling skew/kurt issue
     if isinstance(arg, np.ndarray):

diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py
@@ -46,7 +46,9 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
     if dropna:
         s = s.dropna()
 
-    if op in ("sum", "prod"):
+    if op == "sum":
+        assert isinstance(getattr(s, op)(), np.int_)
+    elif op == "prod":
         assert isinstance(getattr(s, op)(), np.int64)
     elif op in ("min", "max"):
         assert isinstance(getattr(s, op)(), np.bool_)

diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
@@ -34,7 +34,10 @@ def test_preserve_dtypes(op):
 
     # op
     result = getattr(df.C, op)()
-    assert isinstance(result, int)
+    if op == "sum":
+        assert isinstance(result, np.int64)
+    else:
+        assert isinstance(result, int)
 
     # groupby
     result = getattr(df.groupby("A"), op)()

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
@@ -531,13 +531,14 @@ def test_sum_inf(self):
         res = nanops.nansum(arr, axis=1)
         assert np.isinf(res).all()
 
+    @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"])
     @pytest.mark.parametrize("use_bottleneck", [True, False])
     @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)])
-    def test_empty(self, method, unit, use_bottleneck):
+    def test_empty(self, method, unit, use_bottleneck, dtype):
         with pd.option_context("use_bottleneck", use_bottleneck):
             # GH#9422 / GH#18921
             # Entirely empty
-            s = Series([], dtype=object)
+            s = Series([], dtype=dtype)
             # NA by default
             result = getattr(s, method)()
             assert result == unit
@@ -560,8 +561,14 @@ def test_empty(self, method, unit, use_bottleneck):
             result = getattr(s, method)(skipna=True, min_count=1)
             assert pd.isna(result)
 
+            result = getattr(s, method)(skipna=False, min_count=0)
+            assert result == unit
+
+            result = getattr(s, method)(skipna=False, min_count=1)
+            assert pd.isna(result)
+
             # All-NA
-            s = Series([np.nan])
+            s = Series([np.nan], dtype=dtype)
             # NA by default
             result = getattr(s, method)()
             assert result == unit
@@ -585,7 +592,7 @@ def test_empty(self, method, unit, use_bottleneck):
             assert pd.isna(result)
 
             # Mix of valid, empty
-            s = Series([np.nan, 1])
+            s = Series([np.nan, 1], dtype=dtype)
             # Default
             result = getattr(s, method)()
             assert result == 1.0
@@ -604,22 +611,22 @@ def test_empty(self, method, unit, use_bottleneck):
             result = getattr(s, method)(skipna=True, min_count=0)
             assert result == 1.0
 
-            result = getattr(s, method)(skipna=True, min_count=1)
-            assert result == 1.0
-
             # GH#844 (changed in GH#9422)
-            df = DataFrame(np.empty((10, 0)))
+            df = DataFrame(np.empty((10, 0)), dtype=dtype)
             assert (getattr(df, method)(1) == unit).all()
 
-            s = pd.Series([1])
+            s = pd.Series([1], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
 
-            s = pd.Series([np.nan])
+            result = getattr(s, method)(skipna=False, min_count=2)
+            assert pd.isna(result)
+
+            s = pd.Series([np.nan], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
 
-            s = pd.Series([np.nan, 1])
+            s = pd.Series([np.nan, 1], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)