Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: use BlockManager.apply for Rolling.count #35883

Merged
merged 14 commits into from
Aug 31, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 17 additions & 42 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from pandas._libs.tslibs import BaseOffset, to_offset
import pandas._libs.window.aggregations as window_aggregations
from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion, Label
from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender, Substitution, cache_readonly, doc
Expand All @@ -44,6 +44,7 @@
ABCSeries,
ABCTimedeltaIndex,
)
from pandas.core.dtypes.missing import notna

from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin
import pandas.core.common as com
Expand Down Expand Up @@ -395,40 +396,6 @@ def _wrap_result(self, result, block=None, obj=None):
return type(obj)(result, index=index, columns=block.columns)
return result

def _wrap_results(self, results, obj, skipped: List[int]) -> FrameOrSeriesUnion:
    """
    Assemble per-block results into a single Series or DataFrame.

    Parameters
    ----------
    results : list of ndarrays
        One computed result per block that was not skipped.
    obj : conformed data (may be resampled)
        The object the results were computed from; supplies the index and
        name (1-D case) or the original blocks (2-D case).
    skipped : List[int]
        Indices of blocks that are skipped.

    Returns
    -------
    Series or DataFrame
        A Series when ``obj`` is one-dimensional; otherwise a DataFrame
        restricted to the selected columns that were not skipped, or
        ``obj`` cast to float64 when there are no results to combine.

    Raises
    ------
    DataError
        If there are no results and either ``obj`` is one-dimensional or
        every selected column belongs to a skipped block.
    """
    from pandas import Series, concat

    # One-dimensional input: exactly one result is expected.
    if obj.ndim == 1:
        if results:
            assert len(results) == 1
            return Series(results[0], index=obj.index, name=obj.name)
        raise DataError("No numeric types to aggregate")

    # Collect the column labels that belong to the skipped blocks.
    blocks = list(obj._to_dict_of_blocks(copy=False).values())
    dropped: List[Label] = [
        label for idx in skipped for label in blocks[idx].columns
    ]

    # Preserve the order of the selected columns, minus the dropped ones.
    kept = [label for label in self._selected_obj.columns if label not in dropped]

    if len(results) == 0:
        if dropped and not kept:
            # Every selected column was skipped, so nothing can be aggregated.
            raise DataError("No numeric types to aggregate")
        return obj.astype("float64")

    combined = concat(results, axis=1).reindex(columns=kept, copy=False)
    # Put the 'on' column (if any) back into the combined frame.
    self._insert_on_column(combined, obj)
    return combined

def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"):
# if we have an 'on' column we want to put it back into
# the results in the same location
Expand Down Expand Up @@ -1325,21 +1292,29 @@ def count(self):
# implementations shouldn't end up here
assert not isinstance(self.window, BaseIndexer)

blocks, obj = self._create_blocks(self._selected_obj)
results = []
for b in blocks:
result = b.notna().astype(int)
_, obj = self._create_blocks(self._selected_obj)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually hoping that count could be defined in terms of self._apply so we can have less custom logic

def count(self):
    window_func = self._get_cython_func_type("roll_count")
    return self._apply(window_func, center=self.center, floor=0, name="count", **kwargs)

It might not be as straightforward, but just a heads-up on where count should (hopefully) be heading.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this something you're planning on doing in the foreseeable future? I agree that'd be a nicer solution.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll experiment with this tonight and get a PR up if tests pass

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this version allows non-numeric data such as datetimes and strings (inconsistently with other rolling functions) because of the ahead-of-time coercion with notna().astype(int). It is taking longer than expected to tie out the tests when using the roll_count cython version.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if anything can be shared, but it looks like DataFrameGroupBy.count is doing things kludgily blockwise and should also use an apply pattern


def hfunc(values: np.ndarray) -> np.ndarray:
result = notna(values)
result = result.astype(int)
frame = type(obj)(result.T)
result = self._constructor(
result,
frame,
window=self._get_window(),
min_periods=self.min_periods or 0,
center=self.center,
axis=self.axis,
closed=self.closed,
).sum()
results.append(result)
return result.values.T

return self._wrap_results(results, obj, skipped=[])
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
new_mgr = obj._mgr.apply(hfunc)
out = obj._constructor(new_mgr)
if obj.ndim == 1:
out.name = obj.name
else:
self._insert_on_column(out, obj)
return out

_shared_docs["apply"] = dedent(
r"""
Expand Down