Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch agg defaults to numeric_only=None #270

Merged
merged 4 commits into from
Sep 22, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 45 additions & 11 deletions eland/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import warnings
from io import StringIO
import re
from typing import Optional, Sequence, Union, Tuple
from typing import Optional, Sequence, Union, Tuple, List

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -1328,7 +1328,14 @@ def keys(self) -> pd.Index:
"""
return self.columns

def aggregate(self, func, axis=0, *args, **kwargs):
def aggregate(
self,
func: Union[str, List[str]],
axis: int = 0,
numeric_only: Optional[bool] = None,
*args,
**kwargs,
) -> Union[pd.Series, pd.DataFrame]:
"""
Aggregate using one or more operations over the specified axis.

Expand All @@ -1347,8 +1354,13 @@ def aggregate(self, func, axis=0, *args, **kwargs):

Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
'rank', 'sem', 'skew', 'sum', 'std', 'var']``
axis
axis: int
Currently, we only support axis=0 (index)
numeric_only: {True, False, None} Default is None
Which datatype is returned:
- True: returns all values as float64; NaN/NaT are ignored.
- False: returns all values as float64.
- None: returns all values with their default datatype.
*args
Positional arguments to pass to `func`
**kwargs
Expand All @@ -1368,12 +1380,30 @@ def aggregate(self, func, axis=0, *args, **kwargs):

Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int)
DistanceKilometers AvgTicketPrice
sum 92616288 8204364
min 0 100
std 4578 266
>>> df = ed.DataFrame('localhost', 'flights').filter(['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry'])
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int)
AvgTicketPrice DistanceKilometers
sum 8204364 92616288
min 100 0
std 266 4578

>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True)
AvgTicketPrice DistanceKilometers
sum 8.204365e+06 9.261629e+07
min 1.000205e+02 0.000000e+00
std 2.664071e+02 4.578614e+03

>>> df.aggregate(['sum', 'min', 'std'], numeric_only=False)
AvgTicketPrice DistanceKilometers timestamp DestCountry
sum 8.204365e+06 9.261629e+07 NaT NaN
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
std 2.664071e+02 4.578614e+03 NaT NaN

>>> df.aggregate(['sum', 'min', 'std'], numeric_only=None)
AvgTicketPrice DistanceKilometers timestamp DestCountry
sum 8.204365e+06 9.261629e+07 NaT NaN
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
std 2.664071e+02 4.578614e+03 NaT NaN
"""
axis = pd.DataFrame._get_axis_number(axis)

Expand All @@ -1387,10 +1417,14 @@ def aggregate(self, func, axis=0, *args, **kwargs):
# 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
if isinstance(func, str):
# Wrap in list
return self._query_compiler.aggs([func]).squeeze().rename(None)
return (
self._query_compiler.aggs([func], numeric_only=numeric_only)
.squeeze()
.rename(None)
)
elif is_list_like(func):
# we have a list!
return self._query_compiler.aggs(func)
return self._query_compiler.aggs(func, numeric_only=numeric_only)

agg = aggregate

Expand Down
3 changes: 3 additions & 0 deletions eland/field_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ def is_es_agg_compatible(self, es_agg) -> bool:

# Cardinality works for all types
# Numerics and bools work for all aggs
# Except "median_absolute_deviation" which doesn't support bool
if es_agg == "median_absolute_deviation" and self.is_bool:
return False
if es_agg == "cardinality" or self.is_numeric or self.is_bool:
return True
# Timestamps also work for 'min', 'max' and 'avg'
Expand Down
Loading