From c8ebeb59f057e80b3f09f31d2a65f18dfb501b1b Mon Sep 17 00:00:00 2001 From: Palakodeti Sai Vinay Date: Thu, 3 Sep 2020 12:44:01 +0530 Subject: [PATCH 1/4] Switch agg defaults to numeric_only=None --- .gitattributes | 1 + eland/dataframe.py | 50 +- eland/field_mappings.py | 3 + eland/ndframe.py | 474 +++++++++++++-- eland/operations.py | 99 ++- eland/query_compiler.py | 20 +- eland/tests/dataframe/test_aggs_pytest.py | 34 +- eland/tests/dataframe/test_metrics_pytest.py | 606 +++++++++++++++++-- eland/tests/series/test_metrics_pytest.py | 6 +- 9 files changed, 1125 insertions(+), 168 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..dfdb8b77 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.sh text eol=lf diff --git a/eland/dataframe.py b/eland/dataframe.py index f7406876..afd67245 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -19,7 +19,7 @@ import warnings from io import StringIO import re -from typing import Optional, Sequence, Union, Tuple +from typing import Optional, Sequence, Union, Tuple, List import numpy as np import pandas as pd @@ -1328,7 +1328,14 @@ def keys(self) -> pd.Index: """ return self.columns - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate( + self, + func: List[str], + axis: int = 0, + numeric_only: Optional[bool] = None, + *args, + **kwargs, + ) -> Union[pd.Series, pd.DataFrame]: """ Aggregate using one or more operations over the specified axis. @@ -1347,8 +1354,13 @@ def aggregate(self, func, axis=0, *args, **kwargs): Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var']`` - axis + axis: int Currently, we only support axis=0 (index) + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: returns all values with float64, NaN/NaT are ignored. + - False: returns all values with float64. + - None: returns all values with default datatype. 
*args Positional arguments to pass to `func` **kwargs @@ -1369,11 +1381,33 @@ def aggregate(self, func, axis=0, *args, **kwargs): Examples -------- >>> df = ed.DataFrame('localhost', 'flights') - >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int) + >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'],numeric_only=True).astype(int) DistanceKilometers AvgTicketPrice sum 92616288 8204364 min 0 100 std 4578 266 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df[['AvgTicketPrice','timestamp', 'DestCountry']].aggregate(['sum', 'min', 'std'], numeric_only=True) + AvgTicketPrice + sum 8.204365e+06 + min 1.000205e+02 + std 2.664071e+02 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df[['AvgTicketPrice','timestamp', 'DestCountry']].aggregate(['sum', 'min', 'std'], numeric_only=False) + AvgTicketPrice timestamp DestCountry + sum 8.204365e+06 NaT NaN + min 1.000205e+02 2018-01-01 NaN + std 2.664071e+02 NaT NaN + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df[['AvgTicketPrice','timestamp', 'DestCountry']].aggregate(['sum', 'min', 'std'], numeric_only=None) + AvgTicketPrice timestamp DestCountry + sum 8.204365e+06 NaT NaN + min 1.000205e+02 2018-01-01 NaN + std 2.664071e+02 NaT NaN + """ axis = pd.DataFrame._get_axis_number(axis) @@ -1387,10 +1421,14 @@ def aggregate(self, func, axis=0, *args, **kwargs): # 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] if isinstance(func, str): # Wrap in list - return self._query_compiler.aggs([func]).squeeze().rename(None) + return ( + self._query_compiler.aggs([func], numeric_only=numeric_only) + .squeeze() + .rename(None) + ) elif is_list_like(func): # we have a list! - return self._query_compiler.aggs(func) + return self._query_compiler.aggs(func, numeric_only=numeric_only) agg = aggregate diff --git a/eland/field_mappings.py b/eland/field_mappings.py index dd7757b3..2382c7f4 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -100,6 +100,9 @@ def is_es_agg_compatible(self, es_agg) -> bool: # Cardinality works for all types # Numerics and bools work for all aggs + # Except "median_absolute_deviation" supports only bool + if es_agg == "median_absolute_deviation" and self.is_bool: + return False if es_agg == "cardinality" or self.is_numeric or self.is_bool: return True # Timestamps also work for 'min', 'max' and 'avg' diff --git a/eland/ndframe.py b/eland/ndframe.py index 16a01df9..edfda0fa 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -17,7 +17,7 @@ import sys from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Tuple, Optional import pandas as pd from eland.query_compiler import QueryCompiler @@ -162,12 +162,20 @@ def __len__(self) -> int: def _es_info(self, buf): self._query_compiler.es_info(buf) - def mean(self, numeric_only: bool = True) -> pd.Series: + def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return mean value for each numeric column TODO - implement remainder of pandas arguments, currently non-numerics are not supported + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: returns all values with float64, NaN/NaT are ignored. + - False: returns all values with float64. + - None: returns all values with default datatypes, NaN/NaT are ignored. 
+ Returns ------- pandas.Series @@ -181,6 +189,20 @@ def mean(self, numeric_only: bool = True) -> pd.Series: -------- >>> df = ed.DataFrame('localhost', 'flights') >>> df.mean() + AvgTicketPrice 628.254 + Cancelled True + DistanceKilometers 7092.14 + DistanceMiles 4406.85 + FlightDelay True + FlightDelayMin 47 + FlightTimeHour 8.5188 + FlightTimeMin 511.128 + dayOfWeek 2 + timestamp 2018-01-21 19:20:45.564438232 + dtype: object + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.mean(numeric_only=True) AvgTicketPrice 628.253689 Cancelled 0.128494 DistanceKilometers 7092.142457 @@ -191,15 +213,55 @@ def mean(self, numeric_only: bool = True) -> pd.Series: FlightTimeMin 511.127842 dayOfWeek 2.835975 dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.mean(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 628.254 + Cancelled 0.128494 + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 7092.14 + DistanceMiles 4406.85 + FlightDelay 0.251168 + FlightDelayMin 47.3352 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 8.5188 + FlightTimeMin 511.128 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 2.83598 + timestamp 2018-01-21 19:20:45.564438232 + dtype: object + """ return self._query_compiler.mean(numeric_only=numeric_only) - def sum(self, numeric_only: bool = True) -> pd.Series: + def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return sum for each numeric column TODO - implement remainder of pandas arguments, currently non-numerics are not supported + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: returns all values with float64, NaN/NaT are ignored. + - False: returns all values with float64. + - None: returns all values with default datatypes, NaN/NaT are ignored. 
+ Returns ------- pandas.Series @@ -213,6 +275,19 @@ def sum(self, numeric_only: bool = True) -> pd.Series: -------- >>> df = ed.DataFrame('localhost', 'flights') >>> df.sum() + AvgTicketPrice 8.20436e+06 + Cancelled True + DistanceKilometers 9.26163e+07 + DistanceMiles 5.75491e+07 + FlightDelay True + FlightDelayMin 618150 + FlightTimeHour 111247 + FlightTimeMin 6.67482e+06 + dayOfWeek 37035 + dtype: object + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.sum(numeric_only=True) AvgTicketPrice 8.204365e+06 Cancelled 1.678000e+03 DistanceKilometers 9.261629e+07 @@ -223,15 +298,55 @@ def sum(self, numeric_only: bool = True) -> pd.Series: FlightTimeMin 6.674818e+06 dayOfWeek 3.703500e+04 dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.sum(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 8.20436e+06 + Cancelled 1678 + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 9.26163e+07 + DistanceMiles 5.75491e+07 + FlightDelay 3280 + FlightDelayMin 618150 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 111247 + FlightTimeMin 6.67482e+06 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 37035 + timestamp NaT + dtype: object + """ return self._query_compiler.sum(numeric_only=numeric_only) - def min(self, numeric_only: bool = True) -> pd.Series: + def min(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return the minimum value for each numeric column TODO - implement remainder of pandas arguments, currently non-numerics are not supported + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: returns all values with float64, NaN/NaT are ignored. + - False: returns all values with float64. + - None: returns all values with default datatypes, NaN/NaT are ignored. 
+ Returns ------- pandas.Series @@ -245,23 +360,76 @@ def min(self, numeric_only: bool = True) -> pd.Series: -------- >>> df = ed.DataFrame('localhost', 'flights') >>> df.min() - AvgTicketPrice 100.021 - Cancelled False - DistanceKilometers 0 - DistanceMiles 0 - FlightDelay False - FlightDelayMin 0 - FlightTimeHour 0 - FlightTimeMin 0 - dayOfWeek 0 + AvgTicketPrice 100.021 + Cancelled False + DistanceKilometers 0 + DistanceMiles 0 + FlightDelay False + FlightDelayMin 0 + FlightTimeHour 0 + FlightTimeMin 0 + dayOfWeek 0 + timestamp 2018-01-01 00:00:00 + dtype: object + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.min(numeric_only=True) + AvgTicketPrice 100.020531 + Cancelled 0.000000 + DistanceKilometers 0.000000 + DistanceMiles 0.000000 + FlightDelay 0.000000 + FlightDelayMin 0.000000 + FlightTimeHour 0.000000 + FlightTimeMin 0.000000 + dayOfWeek 0.000000 + dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.min(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 100.021 + Cancelled False + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 0 + DistanceMiles 0 + FlightDelay False + FlightDelayMin 0 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 0 + FlightTimeMin 0 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 0 + timestamp 2018-01-01 00:00:00 dtype: object """ return self._query_compiler.min(numeric_only=numeric_only) - def var(self, numeric_only: bool = True) -> pd.Series: + def var(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return variance for each numeric column + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: returns all values with float64, NaN/NaT are ignored. + - False: returns all values with float64. + - None: returns all values with default datatypes, NaN/NaT are ignored. 
+ Returns ------- pandas.Series @@ -274,24 +442,76 @@ def var(self, numeric_only: bool = True) -> pd.Series: Examples -------- >>> df = ed.DataFrame('localhost', 'flights') - >>> df.var() # doctest: +SKIP - AvgTicketPrice 7.096185e+04 - Cancelled 1.119831e-01 - DistanceKilometers 2.096049e+07 - DistanceMiles 8.092892e+06 - FlightDelay 1.880825e-01 - FlightDelayMin 9.359209e+03 - FlightTimeHour 3.112545e+01 - FlightTimeMin 1.120516e+05 - dayOfWeek 3.761135e+00 + >>> df.var() + AvgTicketPrice 70964.6 + Cancelled True + DistanceKilometers 2.09613e+07 + DistanceMiles 8.0932e+06 + FlightDelay True + FlightDelayMin 9359 + FlightTimeHour 31.1266 + FlightTimeMin 112056 + dayOfWeek 3 + dtype: object + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.var(numeric_only=True) + AvgTicketPrice 7.096457e+04 + Cancelled 1.119874e-01 + DistanceKilometers 2.096130e+07 + DistanceMiles 8.093202e+06 + FlightDelay 1.880897e-01 + FlightDelayMin 9.359568e+03 + FlightTimeHour 3.112664e+01 + FlightTimeMin 1.120559e+05 + dayOfWeek 3.761279e+00 dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.var(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 70964.6 + Cancelled True + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 2.09613e+07 + DistanceMiles 8.0932e+06 + FlightDelay True + FlightDelayMin 9359 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 31.1266 + FlightTimeMin 112056 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 3 + timestamp NaT + dtype: object """ return self._query_compiler.var(numeric_only=numeric_only) - def std(self, numeric_only: bool = True) -> pd.Series: + def std(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return standard deviation for each numeric column + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: returns all values with float64, NaN/NaT are ignored. + - False: returns all values with float64. + - None: returns all values with default datatypes, NaN/NaT are ignored. 
+ Returns ------- pandas.Series @@ -304,24 +524,77 @@ def std(self, numeric_only: bool = True) -> pd.Series: Examples -------- >>> df = ed.DataFrame('localhost', 'flights') - >>> df.std() # doctest: +SKIP - AvgTicketPrice 266.386661 - Cancelled 0.334639 - DistanceKilometers 4578.263193 - DistanceMiles 2844.800855 - FlightDelay 0.433685 - FlightDelayMin 96.743006 - FlightTimeHour 5.579019 - FlightTimeMin 334.741135 - dayOfWeek 1.939365 + >>> df.std() + AvgTicketPrice 266.407 + Cancelled True + DistanceKilometers 4578.61 + DistanceMiles 2845.02 + FlightDelay True + FlightDelayMin 96 + FlightTimeHour 5.57945 + FlightTimeMin 334.767 + dayOfWeek 1 + dtype: object + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.std(numeric_only=True) + AvgTicketPrice 266.407061 + Cancelled 0.334664 + DistanceKilometers 4578.613803 + DistanceMiles 2845.018714 + FlightDelay 0.433718 + FlightDelayMin 96.750415 + FlightTimeHour 5.579446 + FlightTimeMin 334.766770 + dayOfWeek 1.939513 dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.std(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 266.407 + Cancelled 0.334664 + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 4578.61 + DistanceMiles 2845.02 + FlightDelay 0.433718 + FlightDelayMin 96.7504 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 5.57945 + FlightTimeMin 334.767 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 1.93951 + timestamp NaT + dtype: object + """ return self._query_compiler.std(numeric_only=numeric_only) - def median(self, numeric_only: bool = True) -> pd.Series: + def median(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return the median value for each numeric column + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: returns all values with float64, NaN/NaT are ignored. + - False: returns all values with float64. + - None: returns all values with default datatypes, NaN/NaT are ignored. 
+ Returns ------- pandas.Series @@ -335,25 +608,79 @@ def median(self, numeric_only: bool = True) -> pd.Series: -------- >>> df = ed.DataFrame('localhost', 'flights') >>> df.median() # doctest: +SKIP - AvgTicketPrice 640.387285 + AvgTicketPrice 640.387 + Cancelled False + DistanceKilometers 7612.07 + DistanceMiles 4729.92 + FlightDelay False + FlightDelayMin 0 + FlightTimeHour 8.38582 + FlightTimeMin 502.987 + dayOfWeek 3 + timestamp 2018-01-21 23:44:29.394281982 + dtype: object + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.median(numeric_only=True) # doctest: +SKIP + AvgTicketPrice 640.424738 Cancelled 0.000000 - DistanceKilometers 7612.072403 - DistanceMiles 4729.922470 + DistanceKilometers 7612.145848 + DistanceMiles 4728.752630 FlightDelay 0.000000 FlightDelayMin 0.000000 - FlightTimeHour 8.383113 - FlightTimeMin 503.148975 + FlightTimeHour 8.383711 + FlightTimeMin 502.999492 dayOfWeek 3.000000 dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.median(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 640.425 + Cancelled False + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 7612.15 + DistanceMiles 4728.75 + FlightDelay False + FlightDelayMin 0 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 8.38457 + FlightTimeMin 503.074 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 3 + timestamp 2018-01-21 23:33:14.554215332 + dtype: object + """ return self._query_compiler.median(numeric_only=numeric_only) - def max(self, numeric_only: bool = True) -> pd.Series: + def max(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return the maximum value for each numeric column TODO - implement remainder of pandas arguments, currently non-numerics are not supported + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: returns all values with float64, NaN/NaT are ignored. + - False: returns all values with float64. + - None: returns all values with default datatypes, NaN/NaT are ignored. 
+ Returns ------- pandas.Series @@ -367,15 +694,60 @@ def max(self, numeric_only: bool = True) -> pd.Series: -------- >>> df = ed.DataFrame('localhost', 'flights') >>> df.max() - AvgTicketPrice 1199.73 - Cancelled True - DistanceKilometers 19881.5 - DistanceMiles 12353.8 - FlightDelay True - FlightDelayMin 360 - FlightTimeHour 31.715 - FlightTimeMin 1902.9 - dayOfWeek 6 + AvgTicketPrice 1199.73 + Cancelled True + DistanceKilometers 19881.5 + DistanceMiles 12353.8 + FlightDelay True + FlightDelayMin 360 + FlightTimeHour 31.715 + FlightTimeMin 1902.9 + dayOfWeek 6 + timestamp 2018-02-11 23:50:12 + dtype: object + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.max(numeric_only=True) + AvgTicketPrice 1199.729004 + Cancelled 1.000000 + DistanceKilometers 19881.482422 + DistanceMiles 12353.780273 + FlightDelay 1.000000 + FlightDelayMin 360.000000 + FlightTimeHour 31.715034 + FlightTimeMin 1902.901978 + dayOfWeek 6.000000 + dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.max(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 1199.73 + Cancelled True + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 19881.5 + DistanceMiles 12353.8 + FlightDelay True + FlightDelayMin 360 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 31.715 + FlightTimeMin 1902.9 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 6 + timestamp 2018-02-11 23:50:12 dtype: object """ return self._query_compiler.max(numeric_only=numeric_only) diff --git a/eland/operations.py b/eland/operations.py index 9728be74..dce2cd1b 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -145,43 +145,59 @@ def count(self, query_compiler): return build_pd_series(data=counts, index=fields) - def mean(self, query_compiler, numeric_only=True): + def mean(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) + return build_pd_series( + results, index=results.keys(), dtype=(np.float64 if numeric_only else None) + ) - def var(self, query_compiler, numeric_only=True): + def var(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) + return build_pd_series( + results, index=results.keys(), dtype=(np.float64 if numeric_only else None) + ) - def std(self, query_compiler, numeric_only=True): + def std(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) + return build_pd_series( + results, index=results.keys(), dtype=(np.float64 if numeric_only else None) + ) - def median(self, query_compiler, numeric_only=True): + def median(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs( query_compiler, ["median"], numeric_only=numeric_only ) - return build_pd_series(results, index=results.keys()) + return build_pd_series( + results, index=results.keys(), dtype=(np.float64 if numeric_only else None) + ) - def sum(self, query_compiler, numeric_only=True): + def sum(self, query_compiler, numeric_only: Optional[bool] = None): results = 
self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) + return build_pd_series( + results, index=results.keys(), dtype=(np.float64 if numeric_only else None) + ) - def max(self, query_compiler, numeric_only=True): + def max(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) + return build_pd_series( + results, index=results.keys(), dtype=(np.float64 if numeric_only else None) + ) - def min(self, query_compiler, numeric_only=True): + def min(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) + return build_pd_series( + results, index=results.keys(), dtype=(np.float64 if numeric_only else None) + ) def nunique(self, query_compiler): results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False) return build_pd_series(results, index=results.keys()) - def mad(self, query_compiler, numeric_only=True): + def mad(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) + return build_pd_series( + results, index=results.keys(), dtype=(np.float64 if numeric_only else None) + ) def value_counts(self, query_compiler, es_size): return self._terms_aggs(query_compiler, "terms", es_size) @@ -189,7 +205,13 @@ def value_counts(self, query_compiler, es_size): def hist(self, query_compiler, bins): return self._hist_aggs(query_compiler, bins) - def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True): + def _metric_aggs( + self, + query_compiler: "QueryCompiler", + pd_aggs, + numeric_only: Optional[bool] = None, + is_aggregation: bool = False, + ): query_params, post_processing = self._resolve_tasks(query_compiler) size = self._size(query_params, post_processing) @@ -201,6 +223,7 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr results = {} fields = query_compiler._mappings.all_source_fields() if numeric_only: + # Consider if field is Int/Float/Bool fields = [field for field in fields if (field.is_numeric or field.is_bool)] body = Query(query_params.query) @@ -210,6 +233,7 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr for field in fields: for es_agg in es_aggs: + # NaN/NaT fields are ignored if not field.is_es_agg_compatible(es_agg): continue @@ -242,9 +266,17 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr values = [] for es_agg, pd_agg in zip(es_aggs, pd_aggs): - # If the field and agg aren't compatible we add a NaN/NaT + # If the field and agg aren't compatible we add a NaN/NaT for agg() + # If the field and agg aren't compatible we dont add NaN/NaT for non-agg() if not field.is_es_agg_compatible(es_agg): - values.append(field.nan_value) + if is_aggregation and not numeric_only: + values.append(field.nan_value) + elif not is_aggregation and numeric_only is False: + values.append(field.nan_value) + # Explicit condition for mad to add NaN because it doesnt support bool + elif is_aggregation and numeric_only: + if pd_agg == "mad": + values.append(field.nan_value) continue if isinstance(es_agg, tuple): @@ -269,7 +301,7 @@ def _metric_aggs(self, query_compiler: 
"QueryCompiler", pd_aggs, numeric_only=Tr # All of the below calculations result in NaN if count<=1 if count <= 1: - agg_value = np.float64(np.NaN) + agg_value = np.NaN elif es_agg[1] == "std_deviation": agg_value *= count / (count - 1.0) @@ -287,8 +319,11 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr ]["value"] # Null usually means there were no results. - if agg_value is None: - agg_value = field.nan_value + if agg_value is None or np.isnan(agg_value): + if is_aggregation and not numeric_only: + agg_value = np.NaN + elif not is_aggregation and numeric_only is False: + agg_value = np.NaN # Cardinality is always either NaN or integer. elif pd_agg == "nunique": @@ -299,14 +334,15 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr agg_value = elasticsearch_date_to_pandas_date( agg_value, field.es_date_format ) - - # These aggregations maintain the column datatype - elif pd_agg in {"max", "min", "median"}: + # If numeric_only is False | None then maintan column datatype + elif not numeric_only: agg_value = field.np_dtype.type(agg_value) values.append(agg_value) - results[field.index] = values if len(values) > 1 else values[0] + # If numeric_only is True and We only have a NaN type field then we check for empty. + if len(values) != 0: + results[field.index] = values if len(values) > 1 else values[0] return results @@ -540,9 +576,14 @@ def _map_pd_aggs_to_es_aggs(pd_aggs): return es_aggs - def aggs(self, query_compiler, pd_aggs): - results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False) - return pd.DataFrame(results, index=pd_aggs) + def aggs(self, query_compiler, pd_aggs, numeric_only=None): + results = self._metric_aggs( + query_compiler, pd_aggs, numeric_only=numeric_only, is_aggregation=True + ) + if numeric_only: + return pd.DataFrame(results, index=pd_aggs).astype("float64") + else: + return pd.DataFrame(results, index=pd_aggs) def filter(self, query_compiler, items=None, like=None, regex=None): # This function is only called for axis='index', diff --git a/eland/query_compiler.py b/eland/query_compiler.py index ddceff52..10bdc978 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -490,34 +490,34 @@ def filter(self, items=None, like=None, regex=None): result._operations.filter(self, items=items, like=like, regex=regex) return result - def aggs(self, func): - return self._operations.aggs(self, func) + def aggs(self, func, numeric_only: Optional[bool] = None): + return self._operations.aggs(self, func, numeric_only=numeric_only) def count(self): return self._operations.count(self) - def mean(self, numeric_only=None): + def mean(self, numeric_only: Optional[bool] = None): return self._operations.mean(self, numeric_only=numeric_only) - def var(self, numeric_only=None): + def var(self, numeric_only: Optional[bool] = None): return self._operations.var(self, numeric_only=numeric_only) - def std(self, numeric_only=None): + def std(self, numeric_only: Optional[bool] = None): return self._operations.std(self, numeric_only=numeric_only) - def mad(self, numeric_only=None): + def mad(self, numeric_only: Optional[bool] = None): return self._operations.mad(self, numeric_only=numeric_only) - def median(self, numeric_only=None): + def median(self, numeric_only: Optional[bool] = None): return self._operations.median(self, numeric_only=numeric_only) - def sum(self, numeric_only=None): + def sum(self, numeric_only: Optional[bool] = None): return self._operations.sum(self, numeric_only=numeric_only) - def 
min(self, numeric_only=None): + def min(self, numeric_only: Optional[bool] = None): return self._operations.min(self, numeric_only=numeric_only) - def max(self, numeric_only=None): + def max(self, numeric_only: Optional[bool] = None): return self._operations.max(self, numeric_only=numeric_only) def nunique(self): diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index 330fb61a..4d108b8d 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -29,7 +29,9 @@ def test_basic_aggs(self): ed_flights = self.ed_flights() pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) - ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) + ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg( + ["sum", "min"], numeric_only=True + ) # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more @@ -40,22 +42,22 @@ def test_basic_aggs(self): ["sum", "min", "std"] ) ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg( - ["sum", "min", "std"] + ["sum", "min", "std"], numeric_only=True ) print(pd_sum_min_std.dtypes) print(ed_sum_min_std.dtypes) - assert_frame_equal( - pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True - ) + assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True) def test_terms_aggs(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) - ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) + ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg( + ["sum", "min"], numeric_only=True + ) # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more @@ -66,15 +68,13 @@ def test_terms_aggs(self): ["sum", "min", "std"] ) ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg( - ["sum", "min", "std"] + ["sum", "min", "std"], numeric_only=True ) print(pd_sum_min_std.dtypes) print(ed_sum_min_std.dtypes) - assert_frame_equal( - pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True - ) + assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True) def test_aggs_median_var(self): pd_ecommerce = self.pd_ecommerce() @@ -85,10 +85,10 @@ def test_aggs_median_var(self): ].agg(["median", "var"]) ed_aggs = ed_ecommerce[ ["taxful_total_price", "taxless_total_price", "total_quantity"] - ].agg(["median", "var"]) + ].agg(["median", "var"], numeric_only=True) - print(pd_aggs, pd_aggs.dtypes) - print(ed_aggs, ed_aggs.dtypes) + # print(pd_aggs, pd_aggs.dtypes) + # print(ed_aggs, ed_aggs.dtypes) # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more @@ -102,7 +102,9 @@ def test_terms_aggs_series(self, agg): ed_flights = self.ed_flights() pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg) - ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(agg) + ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg( + agg, numeric_only=True + ) assert_series_equal(pd_sum_min_std, ed_sum_min_std) @@ -112,7 +114,9 @@ def test_terms_aggs_series_with_single_list_agg(self): ed_flights = self.ed_flights() pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"]) - ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["mean"]) + 
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg( + ["mean"], numeric_only=True + ) assert_frame_equal(pd_sum_min, ed_sum_min) diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py index 494e42d6..f46a9a53 100644 --- a/eland/tests/dataframe/test_metrics_pytest.py +++ b/eland/tests/dataframe/test_metrics_pytest.py @@ -16,12 +16,10 @@ # under the License. # File called _pytest for PyCharm compatibility - import pytest import numpy as np import pandas as pd from pandas.testing import assert_series_equal - from eland.tests.common import TestData @@ -29,28 +27,6 @@ class TestDataFrameMetrics(TestData): funcs = ["max", "min", "mean", "sum"] extended_funcs = ["median", "mad", "var", "std"] - @pytest.mark.parametrize("numeric_only", [False, None]) - def test_flights_metrics(self, numeric_only): - pd_flights = self.pd_flights() - ed_flights = self.ed_flights() - - for func in self.funcs: - # Pandas v1.0 doesn't support mean() on datetime - # Pandas and Eland don't support sum() on datetime - if not numeric_only: - dtype_include = ( - [np.number, np.datetime64] - if func not in ("mean", "sum") - else [np.number] - ) - pd_flights = pd_flights.select_dtypes(include=dtype_include) - ed_flights = ed_flights.select_dtypes(include=dtype_include) - - pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only) - ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only) - - assert_series_equal(pd_metric, ed_metric) - def test_flights_extended_metrics(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() @@ -86,11 +62,9 @@ def test_flights_extended_metrics_nan(self): for func in self.extended_funcs: pd_metric = getattr(pd_flights_1, func)() - ed_metric = getattr(ed_flights_1, func)() + ed_metric = getattr(ed_flights_1, func)(numeric_only=False) - assert_series_equal( - pd_metric, ed_metric, check_exact=False, check_less_precise=True - ) + assert_series_equal(pd_metric, ed_metric, check_exact=False) # Test on zero rows to test NaN behaviour of sample std/variance pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]] @@ -98,11 +72,9 @@ def test_flights_extended_metrics_nan(self): for func in self.extended_funcs: pd_metric = getattr(pd_flights_0, func)() - ed_metric = getattr(ed_flights_0, func)() + ed_metric = getattr(ed_flights_0, func)(numeric_only=False) - assert_series_equal( - pd_metric, ed_metric, check_exact=False, check_less_precise=True - ) + assert_series_equal(pd_metric, ed_metric, check_exact=False) def test_ecommerce_selected_non_numeric_source_fields(self): # None of these are numeric @@ -114,14 +86,14 @@ def test_ecommerce_selected_non_numeric_source_fields(self): "user", ] - pd_ecommerce = self.pd_ecommerce()[columns] - ed_ecommerce = self.ed_ecommerce()[columns] + pd_ecommerce = self.pd_ecommerce().filter(columns) + ed_ecommerce = self.ed_ecommerce().filter(columns) for func in self.funcs: assert_series_equal( getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), - check_less_precise=True, + check_exact=False, ) def test_ecommerce_selected_mixed_numeric_source_fields(self): @@ -136,48 +108,48 @@ def test_ecommerce_selected_mixed_numeric_source_fields(self): "user", ] - pd_ecommerce = self.pd_ecommerce()[columns] - ed_ecommerce = self.ed_ecommerce()[columns] + pd_ecommerce = self.pd_ecommerce().filter(columns) + ed_ecommerce = self.ed_ecommerce().filter(columns) for func in self.funcs: assert_series_equal( getattr(pd_ecommerce, 
func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), - check_less_precise=True, + check_exact=False, ) def test_ecommerce_selected_all_numeric_source_fields(self): # All of these are numeric columns = ["total_quantity", "taxful_total_price", "taxless_total_price"] - pd_ecommerce = self.pd_ecommerce()[columns] - ed_ecommerce = self.ed_ecommerce()[columns] + pd_ecommerce = self.pd_ecommerce().filter(columns) + ed_ecommerce = self.ed_ecommerce().filter(columns) for func in self.funcs: assert_series_equal( getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), - check_less_precise=True, + check_exact=False, ) def test_flights_datetime_metrics_agg(self): ed_timestamps = self.ed_flights()[["timestamp"]] expected_values = { - "timestamp": { - "min": pd.Timestamp("2018-01-01 00:00:00"), - "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), - "max": pd.Timestamp("2018-02-11 23:50:12"), - "nunique": 12236, - "mad": pd.NaT, - "std": pd.NaT, - "sum": pd.NaT, - "var": pd.NaT, - } + "mad": pd.NaT, + "max": pd.Timestamp("2018-02-11 23:50:12"), + "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), + "min": pd.Timestamp("2018-01-01 00:00:00"), + "nunique": 12236, + "std": pd.NaT, + "sum": pd.NaT, + "var": pd.NaT, } - ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"]) - ed_metrics_dict = ed_metrics.to_dict() - ed_metrics_dict["timestamp"].pop("median") # Median is tested below. + ed_metrics = ed_timestamps.agg( + self.funcs + self.extended_funcs + ["nunique"], numeric_only=False + ) + ed_metrics_dict = ed_metrics["timestamp"].to_dict() + ed_metrics_dict.pop("median") # Median is tested below. assert ed_metrics_dict == expected_values @pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"]) @@ -230,7 +202,7 @@ def test_flights_datetime_metrics_median(self): ) def test_metric_agg_keep_dtypes(self): - # max, min, and median maintain their dtypes + # max, min, and median maintain their dtypes for numeric_only=None df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]] assert df.min().tolist() == [131.81910705566406, False, 0] assert df.max().tolist() == [989.9527587890625, True, 0] @@ -250,3 +222,527 @@ def test_metric_agg_keep_dtypes(self): "Cancelled": {"max": True, "median": False, "min": False}, "dayOfWeek": {"max": 0, "median": 0, "min": 0}, } + + def test_flights_numeric_only(self): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + # All Aggregations Data Check + ed_flights = self.ed_flights().filter(filter_data) + pd_flights = self.pd_flights().filter(filter_data) + # agg => numeric_only True returns float64 values + # We compare it with individual non-agg functions of pandas with numeric_only=True + # not checking mad because it returns nan value for booleans. 
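+ # The transposed result of agg() below has one row per field and one column per
+ # aggregation, so each agg_data[agg] is a float64 Series of per-field values
+ # (e.g. the "sum" column holds roughly AvgTicketPrice 8.204365e+06, Cancelled 1678.0,
+ # dayOfWeek 37035.0), directly comparable to the corresponding pandas reduction.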
+ filtered_aggs = self.funcs + self.extended_funcs + filtered_aggs.remove("mad") + agg_data = ed_flights.agg(filtered_aggs, numeric_only=True).transpose() + for agg in filtered_aggs: + assert_series_equal( + agg_data[agg].rename(None), + getattr(pd_flights, agg)( + **({"numeric_only": True} if agg != "mad" else {}) + ), + check_exact=False, + rtol=True, + ) + + # Mean + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_mean_numeric_only(self, numeric_only): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + ed_flights = self.ed_flights().filter(filter_data) + if numeric_only is True: + expected_values = { + "AvgTicketPrice": 628.2536888148849, + "Cancelled": 0.1284937590933456, + "dayOfWeek": 2.835975189524466, + } + calculated_values = ed_flights.mean(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert expected_values == calculated_values.to_dict() + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + ] + elif numeric_only is False: + expected_values = { + "AvgTicketPrice": 628.2536888148849, + "Cancelled": True, + "dayOfWeek": 2, + } + calculated_values = ed_flights.mean(numeric_only=numeric_only) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert np.isnan(calculated_values["DestCountry"]) + calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + elif numeric_only is None: + expected_values = { + "AvgTicketPrice": 628.2536888148849, + "Cancelled": True, + "dayOfWeek": 2, + "timestamp": pd.Timestamp("2018-01-21 19:20:45.564438232"), + } + calculated_values = ed_flights.mean(numeric_only=numeric_only) + assert expected_values == calculated_values.to_dict() + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + calculated_values = calculated_values.drop("timestamp") + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + + # Min + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_min_numeric_only(self, numeric_only): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + ed_flights = self.ed_flights().filter(filter_data) + if numeric_only is True: + expected_values = { + "AvgTicketPrice": 100.0205307006836, + "Cancelled": 0.0, + "dayOfWeek": 0.0, + } + calculated_values = ed_flights.min(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert expected_values == calculated_values.to_dict() + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + ] + elif numeric_only is False: + expected_values = { + "AvgTicketPrice": 100.0205307006836, + "Cancelled": False, + "dayOfWeek": 0, + } + calculated_values = ed_flights.min(numeric_only=numeric_only) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert np.isnan(calculated_values["DestCountry"]) + calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in 
calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + elif numeric_only is None: + expected_values = { + "AvgTicketPrice": 100.0205307006836, + "Cancelled": False, + "dayOfWeek": 0, + "timestamp": pd.Timestamp("2018-01-01 00:00:00"), + } + calculated_values = ed_flights.min(numeric_only=numeric_only) + assert expected_values == calculated_values.to_dict() + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + calculated_values = calculated_values.drop("timestamp") + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + + # max + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_max_numeric_only(self, numeric_only): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + ed_flights = self.ed_flights().filter(filter_data) + if numeric_only is True: + expected_values = { + "AvgTicketPrice": 1199.72900390625, + "Cancelled": 1.0, + "dayOfWeek": 6.0, + } + calculated_values = ed_flights.max(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert expected_values == calculated_values.to_dict() + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + ] + elif numeric_only is False: + expected_values = { + "AvgTicketPrice": 1199.72900390625, + "Cancelled": True, + "dayOfWeek": 6, + } + calculated_values = ed_flights.max(numeric_only=numeric_only) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert np.isnan(calculated_values["DestCountry"]) + calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + elif numeric_only is None: + expected_values = { + "AvgTicketPrice": 1199.72900390625, + "Cancelled": True, + "dayOfWeek": 6, + "timestamp": pd.Timestamp("2018-02-11 23:50:12"), + } + calculated_values = ed_flights.max(numeric_only=numeric_only) + assert expected_values == calculated_values.to_dict() + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + calculated_values = calculated_values.drop("timestamp") + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + + # sum + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_sum_numeric_only(self, numeric_only): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + ed_flights = self.ed_flights().filter(filter_data) + if numeric_only is True: + expected_values = { + "AvgTicketPrice": 8204364.922233582, + "Cancelled": 1678.0, + "dayOfWeek": 37035.0, + } + calculated_values = ed_flights.sum(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert expected_values == calculated_values.to_dict() + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + ] + elif numeric_only is False: + expected_values = { + "AvgTicketPrice": 8204364.922233582, + "Cancelled": True, + "dayOfWeek": 37035, + } + calculated_values = ed_flights.sum(numeric_only=numeric_only) 
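+ # With numeric_only=False incompatible fields are preserved rather than dropped:
+ # sum of the datetime column comes back as NaT and the keyword column DestCountry
+ # as NaN, so both are asserted and then removed before comparing the numeric values.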
+ assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + elif numeric_only is None: + expected_values = { + "AvgTicketPrice": 8204364.922233582, + "Cancelled": True, + "dayOfWeek": 37035, + } + calculated_values = ed_flights.sum(numeric_only=numeric_only) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + + # std + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_std_numeric_only(self, numeric_only): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + ed_flights = self.ed_flights().filter(filter_data) + if numeric_only is True: + expected_values = { + "AvgTicketPrice": 266.4070611666801, + "Cancelled": 0.33466440694020916, + "dayOfWeek": 1.9395130445445228, + } + calculated_values = ed_flights.std(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert expected_values == calculated_values.to_dict() + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + ] + elif numeric_only is False: + expected_values = { + "AvgTicketPrice": 266.4070611666801, + "Cancelled": True, + "dayOfWeek": 1, + } + calculated_values = ed_flights.std(numeric_only=numeric_only) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + elif numeric_only is None: + expected_values = { + "AvgTicketPrice": 266.4070611666801, + "Cancelled": True, + "dayOfWeek": 1, + } + calculated_values = ed_flights.std(numeric_only=numeric_only) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + + # var + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_var_numeric_only(self, numeric_only): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + ed_flights = self.ed_flights().filter(filter_data) + if numeric_only is True: + expected_values = { + "AvgTicketPrice": 70964.57023354847, + "Cancelled": 0.111987400797438, + "dayOfWeek": 3.7612787756607213, + } + calculated_values = ed_flights.var(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert expected_values == calculated_values.to_dict() + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + ] + elif numeric_only is False: + expected_values = { + "AvgTicketPrice": 70964.57023354847, + "Cancelled": True, + "dayOfWeek": 3, + } + calculated_values = ed_flights.var(numeric_only=numeric_only) + assert 
pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + elif numeric_only is None: + expected_values = { + "AvgTicketPrice": 70964.57023354847, + "Cancelled": True, + "dayOfWeek": 3, + } + calculated_values = ed_flights.var(numeric_only=numeric_only) + assert expected_values == calculated_values.to_dict() + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + + # median + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_median_numeric_only(self, numeric_only): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + ed_flights = self.ed_flights().filter(filter_data) + if numeric_only is True: + expected_values = { + "AvgTicketPrice": 640.3872852064159, + "Cancelled": 0.0, + "dayOfWeek": 3.0, + } + calculated_values = ed_flights.median(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert ( + expected_values["Cancelled"] == calculated_values.to_dict()["Cancelled"] + ) + assert ( + expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] + ) + assert ( + (calculated_values.to_dict()["AvgTicketPrice"] * 0.9) + <= expected_values["AvgTicketPrice"] + <= (calculated_values.to_dict()["AvgTicketPrice"] * 1.1) + ) + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + ] + elif numeric_only is False: + expected_values = { + "AvgTicketPrice": 640.3222933002547, + "Cancelled": False, + "dayOfWeek": 3, + "timestamp": pd.Timestamp("2018-01-21 23:58:10.414120850"), + } + calculated_values = ed_flights.median(numeric_only=numeric_only) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert np.isnan(calculated_values["DestCountry"]) + assert ( + expected_values["Cancelled"] == calculated_values.to_dict()["Cancelled"] + ) + assert ( + (calculated_values.to_dict()["AvgTicketPrice"] * 0.9) + <= expected_values["AvgTicketPrice"] + <= (calculated_values.to_dict()["AvgTicketPrice"] * 1.1) + ) + assert ( + pd.to_datetime("2018-01-21 23:00:00.000") + <= expected_values["timestamp"] + <= pd.to_datetime("2018-01-21 23:59:59.000") + ) + assert ( + expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] + ) + assert isinstance(calculated_values["Cancelled"], np.bool_) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + elif numeric_only is None: + expected_values = { + "AvgTicketPrice": 640.3872852064159, + "Cancelled": False, + "dayOfWeek": 3, + "timestamp": pd.Timestamp("2018-01-21 23:58:10.414120850"), + } + calculated_values = ed_flights.median(numeric_only=numeric_only) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert ( + (calculated_values.to_dict()["AvgTicketPrice"] * 0.9) + <= expected_values["AvgTicketPrice"] + <= (calculated_values.to_dict()["AvgTicketPrice"] * 1.1) + ) + assert ( + pd.to_datetime("2018-01-21 23:00:00.000") + <= expected_values["timestamp"] + <= pd.to_datetime("2018-01-21 23:59:00.000") + ) + assert isinstance(calculated_values["Cancelled"], np.bool_) + assert 
isinstance(calculated_values["dayOfWeek"], np.int64) + + # mad + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_mad_numeric_only(self, numeric_only): + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] + ed_flights = self.ed_flights().filter(filter_data) + if numeric_only is True: + expected_values = {"AvgTicketPrice": 213.47889841845912, "dayOfWeek": 2.0} + calculated_values = ed_flights.mad(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert ( + expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] + ) + assert ( + (calculated_values["AvgTicketPrice"] * 0.9) + <= expected_values["AvgTicketPrice"] + <= (calculated_values["AvgTicketPrice"] * 1.1) + ) + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + ] + elif numeric_only is False: + expected_values = {"AvgTicketPrice": 213.36870923117985, "dayOfWeek": 2.0} + calculated_values = ed_flights.mad(numeric_only=numeric_only) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert np.isnan(calculated_values["Cancelled"]) + calculated_values = calculated_values.drop( + ["timestamp", "DestCountry", "Cancelled"] + ) + assert ( + expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] + ) + assert ( + (calculated_values["AvgTicketPrice"] * 0.9) + <= expected_values["AvgTicketPrice"] + <= (calculated_values["AvgTicketPrice"] * 1.1) + ) + isinstance(calculated_values["AvgTicketPrice"], float) + isinstance(calculated_values["dayOfWeek"], float) + + elif numeric_only is None: + expected_values = {"AvgTicketPrice": 213.4408885767035, "dayOfWeek": 2.0} + calculated_values = ed_flights.mad(numeric_only=numeric_only) + assert ( + (calculated_values["AvgTicketPrice"] * 0.9) + <= expected_values["AvgTicketPrice"] + <= (calculated_values["AvgTicketPrice"] * 1.1) + ) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("float64"), + ] diff --git a/eland/tests/series/test_metrics_pytest.py b/eland/tests/series/test_metrics_pytest.py index 86a9557c..d8e213d2 100644 --- a/eland/tests/series/test_metrics_pytest.py +++ b/eland/tests/series/test_metrics_pytest.py @@ -72,7 +72,7 @@ def test_ecommerce_selected_non_numeric_source_fields(self): if func == "nunique": # nunique never returns 'NaN' continue - ed_metric = getattr(ed_ecommerce, func)() + ed_metric = getattr(ed_ecommerce, func)(numeric_only=False) print(func, ed_metric) assert np.isnan(ed_metric) @@ -86,7 +86,9 @@ def test_ecommerce_selected_all_numeric_source_fields(self): for func in self.all_funcs: pd_metric = getattr(pd_ecommerce, func)() - ed_metric = getattr(ed_ecommerce, func)() + ed_metric = getattr(ed_ecommerce, func)( + **({"numeric_only": True} if (func != "nunique") else {}) + ) self.assert_almost_equal_for_agg(func, pd_metric, ed_metric) @pytest.mark.parametrize("agg", ["mean", "min", "max"]) From f4cf351d195216a473d94d717e1e894fca05c346 Mon Sep 17 00:00:00 2001 From: Palakodeti Sai Vinay Date: Sun, 6 Sep 2020 20:56:51 +0530 Subject: [PATCH 2/4] Addressed the changes, optimized tests --- eland/dataframe.py | 2 +- eland/field_mappings.py | 2 +- eland/ndframe.py | 279 ++++++++------ eland/operations.py | 104 +++-- eland/tests/dataframe/test_metrics_pytest.py | 379 ++++++------------- 5 files changed, 350 insertions(+), 416 deletions(-) diff --git a/eland/dataframe.py 
b/eland/dataframe.py index afd67245..bf5f886e 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1381,7 +1381,7 @@ def aggregate( Examples -------- >>> df = ed.DataFrame('localhost', 'flights') - >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'],numeric_only=True).astype(int) + >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int) DistanceKilometers AvgTicketPrice sum 92616288 8204364 min 0 100 diff --git a/eland/field_mappings.py b/eland/field_mappings.py index 2382c7f4..892d6a2c 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -100,7 +100,7 @@ def is_es_agg_compatible(self, es_agg) -> bool: # Cardinality works for all types # Numerics and bools work for all aggs - # Except "median_absolute_deviation" supports only bool + # except "median_absolute_deviation" which doesn't support bool if es_agg == "median_absolute_deviation" and self.is_bool: return False if es_agg == "cardinality" or self.is_numeric or self.is_bool: diff --git a/eland/ndframe.py b/eland/ndframe.py index edfda0fa..78e28e2a 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -172,10 +172,9 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: ---------- numeric_only: {True, False, None} Default is None Which datatype to be returned - - True: returns all values with float64, NaN/NaT are ignored. - - False: returns all values with float64. - - None: returns all values with default datatypes, NaN/NaT are ignored. - + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved Returns ------- pandas.Series @@ -190,14 +189,14 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: >>> df = ed.DataFrame('localhost', 'flights') >>> df.mean() AvgTicketPrice 628.254 - Cancelled True + Cancelled 0.128494 DistanceKilometers 7092.14 DistanceMiles 4406.85 - FlightDelay True - FlightDelayMin 47 + FlightDelay 0.251168 + FlightDelayMin 47.3352 FlightTimeHour 8.5188 FlightTimeMin 511.128 - dayOfWeek 2 + dayOfWeek 2.83598 timestamp 2018-01-21 19:20:45.564438232 dtype: object @@ -244,7 +243,6 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: dayOfWeek 2.83598 timestamp 2018-01-21 19:20:45.564438232 dtype: object - """ return self._query_compiler.mean(numeric_only=numeric_only) @@ -258,9 +256,9 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: ---------- numeric_only: {True, False, None} Default is None Which datatype to be returned - - True: returns all values with float64, NaN/NaT are ignored. - - False: returns all values with float64. - - None: returns all values with default datatypes, NaN/NaT are ignored. 
+ - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved Returns ------- @@ -276,10 +274,10 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: >>> df = ed.DataFrame('localhost', 'flights') >>> df.sum() AvgTicketPrice 8.20436e+06 - Cancelled True + Cancelled 1678 DistanceKilometers 9.26163e+07 DistanceMiles 5.75491e+07 - FlightDelay True + FlightDelay 3280 FlightDelayMin 618150 FlightTimeHour 111247 FlightTimeMin 6.67482e+06 @@ -329,7 +327,6 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: dayOfWeek 37035 timestamp NaT dtype: object - """ return self._query_compiler.sum(numeric_only=numeric_only) @@ -343,9 +340,9 @@ def min(self, numeric_only: Optional[bool] = None) -> pd.Series: ---------- numeric_only: {True, False, None} Default is None Which datatype to be returned - - True: returns all values with float64, NaN/NaT are ignored. - - False: returns all values with float64. - - None: returns all values with default datatypes, NaN/NaT are ignored. + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved Returns ------- @@ -387,33 +384,33 @@ def min(self, numeric_only: Optional[bool] = None) -> pd.Series: >>> df = ed.DataFrame('localhost', 'flights') >>> df.min(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 100.021 - Cancelled False - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 0 - DistanceMiles 0 - FlightDelay False - FlightDelayMin 0 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 0 - FlightTimeMin 0 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 0 - timestamp 2018-01-01 00:00:00 + AvgTicketPrice 100.021 + Cancelled False + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 0 + DistanceMiles 0 + FlightDelay False + FlightDelayMin 0 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 0 + FlightTimeMin 0 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 0 + timestamp 2018-01-01 00:00:00 dtype: object """ return self._query_compiler.min(numeric_only=numeric_only) @@ -426,9 +423,9 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: ---------- numeric_only: {True, False, None} Default is None Which datatype to be returned - - True: returns all values with float64, NaN/NaT are ignored. - - False: returns all values with float64. - - None: returns all values with default datatypes, NaN/NaT are ignored. 
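The Cancelled line in the sum() example above changes from True to 1678 because summing a boolean field yields a count of True values, which a bool cannot represent; a tiny NumPy sketch with synthetic values (not the flights data):

import numpy as np

cancelled = np.array([True, False, True, True])  # synthetic stand-in for a boolean field
assert int(cancelled.sum()) == 3                 # sum of bools is a count, not a bool
assert bool(cancelled.min()) is False            # min/max are still representable in the original bool dtype
assert bool(cancelled.max()) is True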
+ - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved Returns ------- @@ -444,14 +441,14 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: >>> df = ed.DataFrame('localhost', 'flights') >>> df.var() AvgTicketPrice 70964.6 - Cancelled True + Cancelled 0.111987 DistanceKilometers 2.09613e+07 DistanceMiles 8.0932e+06 - FlightDelay True - FlightDelayMin 9359 + FlightDelay 0.18809 + FlightDelayMin 9359.57 FlightTimeHour 31.1266 FlightTimeMin 112056 - dayOfWeek 3 + dayOfWeek 3.76128 dtype: object >>> df = ed.DataFrame('localhost', 'flights') @@ -470,7 +467,7 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: >>> df = ed.DataFrame('localhost', 'flights') >>> df.var(numeric_only=False) # doctest: +SKIP AvgTicketPrice 70964.6 - Cancelled True + Cancelled 0.111987 Carrier NaN Dest NaN DestAirportID NaN @@ -481,8 +478,8 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: DestWeather NaN DistanceKilometers 2.09613e+07 DistanceMiles 8.0932e+06 - FlightDelay True - FlightDelayMin 9359 + FlightDelay 0.18809 + FlightDelayMin 9359.57 FlightDelayType NaN FlightNum NaN FlightTimeHour 31.1266 @@ -494,7 +491,7 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: OriginLocation NaN OriginRegion NaN OriginWeather NaN - dayOfWeek 3 + dayOfWeek 3.76128 timestamp NaT dtype: object """ @@ -508,9 +505,9 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series: ---------- numeric_only: {True, False, None} Default is None Which datatype to be returned - - True: returns all values with float64, NaN/NaT are ignored. - - False: returns all values with float64. - - None: returns all values with default datatypes, NaN/NaT are ignored. + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved Returns ------- @@ -525,15 +522,15 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series: -------- >>> df = ed.DataFrame('localhost', 'flights') >>> df.std() - AvgTicketPrice 266.407 - Cancelled True - DistanceKilometers 4578.61 - DistanceMiles 2845.02 - FlightDelay True - FlightDelayMin 96 - FlightTimeHour 5.57945 - FlightTimeMin 334.767 - dayOfWeek 1 + AvgTicketPrice 266.407 + Cancelled 0.334664 + DistanceKilometers 4578.61 + DistanceMiles 2845.02 + FlightDelay 0.433718 + FlightDelayMin 96.7504 + FlightTimeHour 5.57945 + FlightTimeMin 334.767 + dayOfWeek 1.93951 dtype: object >>> df = ed.DataFrame('localhost', 'flights') @@ -550,7 +547,7 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series: dtype: float64 >>> df = ed.DataFrame('localhost', 'flights') - >>> df.std(numeric_only=False) # doctest: +SKIP + >>> df.std(numeric_only=False) # doctest: +SKIP AvgTicketPrice 266.407 Cancelled 0.334664 Carrier NaN @@ -579,7 +576,6 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series: dayOfWeek 1.93951 timestamp NaT dtype: object - """ return self._query_compiler.std(numeric_only=numeric_only) @@ -591,9 +587,9 @@ def median(self, numeric_only: Optional[bool] = None) -> pd.Series: ---------- numeric_only: {True, False, None} Default is None Which datatype to be returned - - True: returns all values with float64, NaN/NaT are ignored. - - False: returns all values with float64. 
- - None: returns all values with default datatypes, NaN/NaT are ignored. + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved Returns ------- @@ -615,27 +611,27 @@ def median(self, numeric_only: Optional[bool] = None) -> pd.Series: FlightDelay False FlightDelayMin 0 FlightTimeHour 8.38582 - FlightTimeMin 502.987 + FlightTimeMin 503.149 dayOfWeek 3 - timestamp 2018-01-21 23:44:29.394281982 + timestamp 2018-01-21 23:25:13.113169922 dtype: object >>> df = ed.DataFrame('localhost', 'flights') >>> df.median(numeric_only=True) # doctest: +SKIP - AvgTicketPrice 640.424738 + AvgTicketPrice 640.387285 Cancelled 0.000000 - DistanceKilometers 7612.145848 - DistanceMiles 4728.752630 + DistanceKilometers 7612.072403 + DistanceMiles 4729.922470 FlightDelay 0.000000 FlightDelayMin 0.000000 - FlightTimeHour 8.383711 - FlightTimeMin 502.999492 + FlightTimeHour 8.385816 + FlightTimeMin 503.148975 dayOfWeek 3.000000 dtype: float64 >>> df = ed.DataFrame('localhost', 'flights') >>> df.median(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 640.425 + AvgTicketPrice 640.387 Cancelled False Carrier NaN Dest NaN @@ -645,14 +641,14 @@ def median(self, numeric_only: Optional[bool] = None) -> pd.Series: DestLocation NaN DestRegion NaN DestWeather NaN - DistanceKilometers 7612.15 - DistanceMiles 4728.75 + DistanceKilometers 7612.07 + DistanceMiles 4729.92 FlightDelay False FlightDelayMin 0 FlightDelayType NaN FlightNum NaN - FlightTimeHour 8.38457 - FlightTimeMin 503.074 + FlightTimeHour 8.38582 + FlightTimeMin 503.149 Origin NaN OriginAirportID NaN OriginCityName NaN @@ -661,9 +657,8 @@ def median(self, numeric_only: Optional[bool] = None) -> pd.Series: OriginRegion NaN OriginWeather NaN dayOfWeek 3 - timestamp 2018-01-21 23:33:14.554215332 + timestamp 2018-01-22 00:43:09.223130126 dtype: object - """ return self._query_compiler.median(numeric_only=numeric_only) @@ -677,9 +672,9 @@ def max(self, numeric_only: Optional[bool] = None) -> pd.Series: ---------- numeric_only: {True, False, None} Default is None Which datatype to be returned - - True: returns all values with float64, NaN/NaT are ignored. - - False: returns all values with float64. - - None: returns all values with default datatypes, NaN/NaT are ignored. 
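Lossless aggregations such as min and max return an existing field value, so the column dtype can be kept when numeric_only is False or None, whereas fractional metrics like mean always come back as floats. A small sketch with synthetic values (not the flights data):

import numpy as np
import pandas as pd

day_of_week = pd.Series([0, 3, 6], dtype=np.int64)       # synthetic integer field
assert isinstance(day_of_week.max(), (int, np.integer))  # max is an existing value: int64 can be preserved
assert isinstance(day_of_week.mean(), float)             # mean is fractional: returned as a float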
+ - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved Returns ------- @@ -721,33 +716,33 @@ def max(self, numeric_only: Optional[bool] = None) -> pd.Series: >>> df = ed.DataFrame('localhost', 'flights') >>> df.max(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 1199.73 - Cancelled True - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 19881.5 - DistanceMiles 12353.8 - FlightDelay True - FlightDelayMin 360 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 31.715 - FlightTimeMin 1902.9 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 6 - timestamp 2018-02-11 23:50:12 + AvgTicketPrice 1199.73 + Cancelled True + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 19881.5 + DistanceMiles 12353.8 + FlightDelay True + FlightDelayMin 360 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 31.715 + FlightTimeMin 1902.9 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 6 + timestamp 2018-02-11 23:50:12 dtype: object """ return self._query_compiler.max(numeric_only=numeric_only) @@ -815,16 +810,56 @@ def mad(self, numeric_only: bool = True) -> pd.Series: -------- >>> df = ed.DataFrame('localhost', 'flights') >>> df.mad() # doctest: +SKIP + AvgTicketPrice 213.443470 + DistanceKilometers 2948.631194 + DistanceMiles 1830.663947 + FlightDelayMin 0.000000 + FlightTimeHour 3.819254 + FlightTimeMin 229.158256 + dayOfWeek 2.000000 + dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.mad(numeric_only=True) # doctest: +SKIP AvgTicketPrice 213.368709 - Cancelled 0.000000 DistanceKilometers 2946.168236 - DistanceMiles 1830.987236 - FlightDelay 0.000000 + DistanceMiles 1829.899362 FlightDelayMin 0.000000 - FlightTimeHour 3.819435 - FlightTimeMin 229.142297 + FlightTimeHour 3.819654 + FlightTimeMin 229.176708 dayOfWeek 2.000000 dtype: float64 + + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.mad(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 213.451 + Cancelled NaN + Carrier NaN + Dest NaN + DestAirportID NaN + DestCityName NaN + DestCountry NaN + DestLocation NaN + DestRegion NaN + DestWeather NaN + DistanceKilometers 2946.98 + DistanceMiles 1830.66 + FlightDelay NaN + FlightDelayMin 0 + FlightDelayType NaN + FlightNum NaN + FlightTimeHour 3.81919 + FlightTimeMin 229.177 + Origin NaN + OriginAirportID NaN + OriginCityName NaN + OriginCountry NaN + OriginLocation NaN + OriginRegion NaN + OriginWeather NaN + dayOfWeek 2 + timestamp NaT + dtype: object """ return self._query_compiler.mad(numeric_only=numeric_only) diff --git a/eland/operations.py b/eland/operations.py index dce2cd1b..238a36c2 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -147,47 +147,82 @@ def count(self, query_compiler): def mean(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only) - return build_pd_series( - results, index=results.keys(), dtype=(np.float64 if numeric_only else None) - ) + if numeric_only: + return 
build_pd_series(results, index=results.keys(), dtype=np.float64) + else: + return build_pd_series( + results, + index=results.keys(), + dtype=(None if len(results) <= 1 else "object"), + ) def var(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only) - return build_pd_series( - results, index=results.keys(), dtype=(np.float64 if numeric_only else None) - ) + if numeric_only: + return build_pd_series(results, index=results.keys(), dtype=np.float64) + else: + return build_pd_series( + results, + index=results.keys(), + dtype=(None if len(results) <= 1 else "object"), + ) def std(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only) - return build_pd_series( - results, index=results.keys(), dtype=(np.float64 if numeric_only else None) - ) + if numeric_only: + return build_pd_series(results, index=results.keys(), dtype=np.float64) + else: + return build_pd_series( + results, + index=results.keys(), + dtype=(None if len(results) <= 1 else "object"), + ) def median(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs( query_compiler, ["median"], numeric_only=numeric_only ) - return build_pd_series( - results, index=results.keys(), dtype=(np.float64 if numeric_only else None) - ) + if numeric_only: + return build_pd_series(results, index=results.keys(), dtype=np.float64) + else: + return build_pd_series( + results, + index=results.keys(), + dtype=(None if len(results) <= 1 else "object"), + ) def sum(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only) - return build_pd_series( - results, index=results.keys(), dtype=(np.float64 if numeric_only else None) - ) + if numeric_only: + return build_pd_series(results, index=results.keys(), dtype=np.float64) + else: + return build_pd_series( + results, + index=results.keys(), + dtype=(None if len(results) <= 1 else "object"), + ) def max(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only) - return build_pd_series( - results, index=results.keys(), dtype=(np.float64 if numeric_only else None) - ) + if numeric_only: + return build_pd_series(results, index=results.keys(), dtype=np.float64) + else: + return build_pd_series( + results, + index=results.keys(), + dtype=(None if len(results) <= 1 else "object"), + ) def min(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only) - return build_pd_series( - results, index=results.keys(), dtype=(np.float64 if numeric_only else None) - ) + if numeric_only: + return build_pd_series(results, index=results.keys(), dtype=np.float64) + else: + return build_pd_series( + results, + index=results.keys(), + dtype=(None if len(results) <= 1 else "object"), + ) def nunique(self, query_compiler): results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False) @@ -195,9 +230,14 @@ def nunique(self, query_compiler): def mad(self, query_compiler, numeric_only: Optional[bool] = None): results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only) - return build_pd_series( - results, index=results.keys(), dtype=(np.float64 if numeric_only else None) - ) + if numeric_only: + return build_pd_series(results, index=results.keys(), dtype=np.float64) + 
else: + return build_pd_series( + results, + index=results.keys(), + dtype=(None if len(results) <= 1 else "object"), + ) def value_counts(self, query_compiler, es_size): return self._terms_aggs(query_compiler, "terms", es_size) @@ -265,15 +305,15 @@ def _metric_aggs( for field in fields: values = [] for es_agg, pd_agg in zip(es_aggs, pd_aggs): - + # is_aggregation is used to differentiate agg() and non-agg() # If the field and agg aren't compatible we add a NaN/NaT for agg() - # If the field and agg aren't compatible we dont add NaN/NaT for non-agg() + # If the field and agg aren't compatible we don't add NaN/NaT for non-agg() if not field.is_es_agg_compatible(es_agg): if is_aggregation and not numeric_only: values.append(field.nan_value) elif not is_aggregation and numeric_only is False: values.append(field.nan_value) - # Explicit condition for mad to add NaN because it doesnt support bool + # Explicit condition for mad to add NaN because it doesn't support bool elif is_aggregation and numeric_only: if pd_agg == "mad": values.append(field.nan_value) @@ -336,7 +376,13 @@ def _metric_aggs( ) # If numeric_only is False | None then maintan column datatype elif not numeric_only: - agg_value = field.np_dtype.type(agg_value) + # we're only converting to bool for lossless aggs like min, max, and median. + if pd_agg in {"max", "min", "median", "sum"}: + # 'sum' isn't representable with bool, use int64 + if pd_agg == "sum" and field.is_bool: + agg_value = np.int64(agg_value) + else: + agg_value = field.np_dtype.type(agg_value) values.append(agg_value) @@ -581,7 +627,7 @@ def aggs(self, query_compiler, pd_aggs, numeric_only=None): query_compiler, pd_aggs, numeric_only=numeric_only, is_aggregation=True ) if numeric_only: - return pd.DataFrame(results, index=pd_aggs).astype("float64") + return pd.DataFrame(results, index=pd_aggs, dtype=np.float64) else: return pd.DataFrame(results, index=pd_aggs) diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py index f46a9a53..b4869825 100644 --- a/eland/tests/dataframe/test_metrics_pytest.py +++ b/eland/tests/dataframe/test_metrics_pytest.py @@ -26,6 +26,13 @@ class TestDataFrameMetrics(TestData): funcs = ["max", "min", "mean", "sum"] extended_funcs = ["median", "mad", "var", "std"] + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] def test_flights_extended_metrics(self): pd_flights = self.pd_flights() @@ -135,14 +142,14 @@ def test_ecommerce_selected_all_numeric_source_fields(self): def test_flights_datetime_metrics_agg(self): ed_timestamps = self.ed_flights()[["timestamp"]] expected_values = { - "mad": pd.NaT, "max": pd.Timestamp("2018-02-11 23:50:12"), - "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), "min": pd.Timestamp("2018-01-01 00:00:00"), - "nunique": 12236, - "std": pd.NaT, + "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), "sum": pd.NaT, + "mad": pd.NaT, "var": pd.NaT, + "std": pd.NaT, + "nunique": 12236, } ed_metrics = ed_timestamps.agg( @@ -253,384 +260,244 @@ def test_flights_numeric_only(self): # Mean @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_mean_numeric_only(self, numeric_only): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] - ed_flights = self.ed_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) if numeric_only is True: - expected_values = { - "AvgTicketPrice": 628.2536888148849, - "Cancelled": 
0.1284937590933456, - "dayOfWeek": 2.835975189524466, - } calculated_values = ed_flights.mean(numeric_only=numeric_only) + assert calculated_values.to_list() == [ + 628.2536888148849, + 0.1284937590933456, + 2.835975189524466, + ] dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert expected_values == calculated_values.to_dict() assert dtype_list == [ np.dtype("float64"), np.dtype("float64"), np.dtype("float64"), ] elif numeric_only is False: - expected_values = { - "AvgTicketPrice": 628.2536888148849, - "Cancelled": True, - "dayOfWeek": 2, - } calculated_values = ed_flights.mean(numeric_only=numeric_only) assert isinstance(calculated_values["timestamp"], pd.Timestamp) assert np.isnan(calculated_values["DestCountry"]) calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert expected_values == calculated_values.to_dict() - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), + assert calculated_values.to_list() == [ + 628.2536888148849, + 0.1284937590933456, + 2.835975189524466, ] + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert isinstance(calculated_values["Cancelled"], float) elif numeric_only is None: - expected_values = { - "AvgTicketPrice": 628.2536888148849, - "Cancelled": True, - "dayOfWeek": 2, - "timestamp": pd.Timestamp("2018-01-21 19:20:45.564438232"), - } calculated_values = ed_flights.mean(numeric_only=numeric_only) - assert expected_values == calculated_values.to_dict() + assert calculated_values.to_list() == [ + 628.2536888148849, + 0.1284937590933456, + 2.835975189524466, + pd.Timestamp("2018-01-21 19:20:45.564438232"), + ] assert isinstance(calculated_values["timestamp"], pd.Timestamp) calculated_values = calculated_values.drop("timestamp") - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), - ] + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert isinstance(calculated_values["Cancelled"], float) # Min @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_min_numeric_only(self, numeric_only): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] - ed_flights = self.ed_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) if numeric_only is True: - expected_values = { - "AvgTicketPrice": 100.0205307006836, - "Cancelled": 0.0, - "dayOfWeek": 0.0, - } calculated_values = ed_flights.min(numeric_only=numeric_only) + assert calculated_values.to_list() == [100.0205307006836, 0.0, 0.0] dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert expected_values == calculated_values.to_dict() assert dtype_list == [ np.dtype("float64"), np.dtype("float64"), np.dtype("float64"), ] elif numeric_only is False: - expected_values = { - "AvgTicketPrice": 100.0205307006836, - "Cancelled": False, - "dayOfWeek": 0, - } calculated_values = ed_flights.min(numeric_only=numeric_only) assert isinstance(calculated_values["timestamp"], pd.Timestamp) assert np.isnan(calculated_values["DestCountry"]) calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert expected_values == calculated_values.to_dict() - dtype_list = 
[calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), - ] + assert calculated_values.to_list() == [100.0205307006836, 0, False] + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["Cancelled"], np.bool_) elif numeric_only is None: - expected_values = { - "AvgTicketPrice": 100.0205307006836, - "Cancelled": False, - "dayOfWeek": 0, - "timestamp": pd.Timestamp("2018-01-01 00:00:00"), - } calculated_values = ed_flights.min(numeric_only=numeric_only) - assert expected_values == calculated_values.to_dict() - assert isinstance(calculated_values["timestamp"], pd.Timestamp) - calculated_values = calculated_values.drop("timestamp") - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), + assert calculated_values.to_list() == [ + 100.0205307006836, + 0, + False, + pd.Timestamp("2018-01-01 00:00:00"), ] + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["Cancelled"], np.bool_) # max @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_max_numeric_only(self, numeric_only): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] - ed_flights = self.ed_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) if numeric_only is True: - expected_values = { - "AvgTicketPrice": 1199.72900390625, - "Cancelled": 1.0, - "dayOfWeek": 6.0, - } calculated_values = ed_flights.max(numeric_only=numeric_only) + assert calculated_values.to_list() == [1199.72900390625, 1.0, 6.0] dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert expected_values == calculated_values.to_dict() assert dtype_list == [ np.dtype("float64"), np.dtype("float64"), np.dtype("float64"), ] elif numeric_only is False: - expected_values = { - "AvgTicketPrice": 1199.72900390625, - "Cancelled": True, - "dayOfWeek": 6, - } calculated_values = ed_flights.max(numeric_only=numeric_only) assert isinstance(calculated_values["timestamp"], pd.Timestamp) assert np.isnan(calculated_values["DestCountry"]) calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert expected_values == calculated_values.to_dict() - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), - ] + assert calculated_values.to_list() == [1199.72900390625, True, 6] + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["Cancelled"], np.bool_) elif numeric_only is None: - expected_values = { - "AvgTicketPrice": 1199.72900390625, - "Cancelled": True, - "dayOfWeek": 6, - "timestamp": pd.Timestamp("2018-02-11 23:50:12"), - } calculated_values = ed_flights.max(numeric_only=numeric_only) - assert expected_values == calculated_values.to_dict() assert isinstance(calculated_values["timestamp"], pd.Timestamp) calculated_values = calculated_values.drop("timestamp") - dtype_list = [calculated_values[i].dtype for i in 
calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), - ] + assert calculated_values.to_list() == [1199.72900390625, True, 6] + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["Cancelled"], np.bool_) # sum @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_sum_numeric_only(self, numeric_only): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] - ed_flights = self.ed_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) if numeric_only is True: - expected_values = { - "AvgTicketPrice": 8204364.922233582, - "Cancelled": 1678.0, - "dayOfWeek": 37035.0, - } calculated_values = ed_flights.sum(numeric_only=numeric_only) dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert expected_values == calculated_values.to_dict() + assert calculated_values.to_list() == [8204364.922233582, 1678.0, 37035.0] assert dtype_list == [ np.dtype("float64"), np.dtype("float64"), np.dtype("float64"), ] elif numeric_only is False: - expected_values = { - "AvgTicketPrice": 8204364.922233582, - "Cancelled": True, - "dayOfWeek": 37035, - } calculated_values = ed_flights.sum(numeric_only=numeric_only) assert pd.isnull(calculated_values["timestamp"]) assert np.isnan(calculated_values["DestCountry"]) calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert expected_values == calculated_values.to_dict() - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), - ] + assert calculated_values.to_list() == [8204364.922233582, 1678, 37035] + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["Cancelled"], np.int64) elif numeric_only is None: - expected_values = { - "AvgTicketPrice": 8204364.922233582, - "Cancelled": True, - "dayOfWeek": 37035, - } calculated_values = ed_flights.sum(numeric_only=numeric_only) - assert expected_values == calculated_values.to_dict() + assert calculated_values.to_list() == [8204364.922233582, 1678, 37035] dtype_list = [calculated_values[i].dtype for i in calculated_values.index] assert dtype_list == [ np.dtype("float64"), - np.dtype("bool"), + np.dtype("int64"), np.dtype("int64"), ] # std @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_std_numeric_only(self, numeric_only): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] - ed_flights = self.ed_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) if numeric_only is True: - expected_values = { - "AvgTicketPrice": 266.4070611666801, - "Cancelled": 0.33466440694020916, - "dayOfWeek": 1.9395130445445228, - } calculated_values = ed_flights.std(numeric_only=numeric_only) dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert expected_values == calculated_values.to_dict() + assert calculated_values.to_list() == [ + 266.4070611666801, + 0.33466440694020916, + 1.9395130445445228, + ] assert dtype_list == [ np.dtype("float64"), np.dtype("float64"), np.dtype("float64"), ] elif numeric_only is False: - expected_values = { - "AvgTicketPrice": 266.4070611666801, - 
"Cancelled": True, - "dayOfWeek": 1, - } calculated_values = ed_flights.std(numeric_only=numeric_only) assert pd.isnull(calculated_values["timestamp"]) assert np.isnan(calculated_values["DestCountry"]) calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert expected_values == calculated_values.to_dict() - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), + assert calculated_values.to_list() == [ + 266.4070611666801, + 0.33466440694020916, + 1.9395130445445228, ] + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert isinstance(calculated_values["Cancelled"], float) elif numeric_only is None: - expected_values = { - "AvgTicketPrice": 266.4070611666801, - "Cancelled": True, - "dayOfWeek": 1, - } calculated_values = ed_flights.std(numeric_only=numeric_only) - assert expected_values == calculated_values.to_dict() - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), + assert calculated_values.to_list() == [ + 266.4070611666801, + 0.33466440694020916, + 1.9395130445445228, ] + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert isinstance(calculated_values["Cancelled"], float) # var @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_var_numeric_only(self, numeric_only): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] - ed_flights = self.ed_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) if numeric_only is True: - expected_values = { - "AvgTicketPrice": 70964.57023354847, - "Cancelled": 0.111987400797438, - "dayOfWeek": 3.7612787756607213, - } calculated_values = ed_flights.var(numeric_only=numeric_only) dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert expected_values == calculated_values.to_dict() + assert calculated_values.to_list() == [ + 70964.57023354847, + 0.111987400797438, + 3.7612787756607213, + ] assert dtype_list == [ np.dtype("float64"), np.dtype("float64"), np.dtype("float64"), ] elif numeric_only is False: - expected_values = { - "AvgTicketPrice": 70964.57023354847, - "Cancelled": True, - "dayOfWeek": 3, - } calculated_values = ed_flights.var(numeric_only=numeric_only) assert pd.isnull(calculated_values["timestamp"]) assert np.isnan(calculated_values["DestCountry"]) calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert expected_values == calculated_values.to_dict() - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), + assert calculated_values.to_list() == [ + 70964.57023354847, + 0.111987400797438, + 3.7612787756607213, ] + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], np.float64) + assert isinstance(calculated_values["Cancelled"], np.float64) elif numeric_only is None: - expected_values = { - "AvgTicketPrice": 70964.57023354847, - "Cancelled": True, - "dayOfWeek": 3, - } calculated_values = ed_flights.var(numeric_only=numeric_only) - assert expected_values == calculated_values.to_dict() + assert calculated_values.to_list() 
== [ + 70964.57023354847, + 0.111987400797438, + 3.7612787756607213, + ] dtype_list = [calculated_values[i].dtype for i in calculated_values.index] assert dtype_list == [ np.dtype("float64"), - np.dtype("bool"), - np.dtype("int64"), + np.dtype("float64"), + np.dtype("float64"), ] # median @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_median_numeric_only(self, numeric_only): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] - ed_flights = self.ed_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) if numeric_only is True: - expected_values = { - "AvgTicketPrice": 640.3872852064159, - "Cancelled": 0.0, - "dayOfWeek": 3.0, - } calculated_values = ed_flights.median(numeric_only=numeric_only) dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert ( - expected_values["Cancelled"] == calculated_values.to_dict()["Cancelled"] - ) - assert ( - expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] - ) assert ( (calculated_values.to_dict()["AvgTicketPrice"] * 0.9) - <= expected_values["AvgTicketPrice"] + <= 640.3872852064159 <= (calculated_values.to_dict()["AvgTicketPrice"] * 1.1) ) + assert calculated_values["Cancelled"] == 0.0 + assert calculated_values["dayOfWeek"] == 3.0 assert dtype_list == [ np.dtype("float64"), np.dtype("float64"), @@ -689,18 +556,10 @@ def test_median_numeric_only(self, numeric_only): # mad @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_mad_numeric_only(self, numeric_only): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] - ed_flights = self.ed_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) if numeric_only is True: expected_values = {"AvgTicketPrice": 213.47889841845912, "dayOfWeek": 2.0} calculated_values = ed_flights.mad(numeric_only=numeric_only) - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] assert ( expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] ) @@ -709,10 +568,7 @@ def test_mad_numeric_only(self, numeric_only): <= expected_values["AvgTicketPrice"] <= (calculated_values["AvgTicketPrice"] * 1.1) ) - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - ] + assert calculated_values["AvgTicketPrice"].dtype == np.dtype("float64") elif numeric_only is False: expected_values = {"AvgTicketPrice": 213.36870923117985, "dayOfWeek": 2.0} calculated_values = ed_flights.mad(numeric_only=numeric_only) @@ -730,8 +586,8 @@ def test_mad_numeric_only(self, numeric_only): <= expected_values["AvgTicketPrice"] <= (calculated_values["AvgTicketPrice"] * 1.1) ) - isinstance(calculated_values["AvgTicketPrice"], float) - isinstance(calculated_values["dayOfWeek"], float) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) elif numeric_only is None: expected_values = {"AvgTicketPrice": 213.4408885767035, "dayOfWeek": 2.0} @@ -741,8 +597,5 @@ def test_mad_numeric_only(self, numeric_only): <= expected_values["AvgTicketPrice"] <= (calculated_values["AvgTicketPrice"] * 1.1) ) - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - ] + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) From 
19585a04129981325139a1b3542a14c0d723b1ae Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Sun, 13 Sep 2020 21:00:10 +0530 Subject: [PATCH 3/4] Fixed requested changes --- .gitattributes | 1 - eland/dataframe.py | 54 +- eland/field_mappings.py | 2 +- eland/ndframe.py | 498 ++++--------------- eland/operations.py | 150 ++---- eland/query_compiler.py | 40 +- eland/tests/dataframe/test_aggs_pytest.py | 4 +- eland/tests/dataframe/test_metrics_pytest.py | 433 +++++----------- 8 files changed, 325 insertions(+), 857 deletions(-) delete mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index dfdb8b77..00000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.sh text eol=lf diff --git a/eland/dataframe.py b/eland/dataframe.py index bf5f886e..b0e1555c 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1330,7 +1330,7 @@ def keys(self) -> pd.Index: def aggregate( self, - func: List[str], + func: Union[str, List[str]], axis: int = 0, numeric_only: Optional[bool] = None, *args, @@ -1380,34 +1380,30 @@ def aggregate( Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') - >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int) - DistanceKilometers AvgTicketPrice - sum 92616288 8204364 - min 0 100 - std 4578 266 - - >>> df = ed.DataFrame('localhost', 'flights') - >>> df[['AvgTicketPrice','timestamp', 'DestCountry']].aggregate(['sum', 'min', 'std'], numeric_only=True) - AvgTicketPrice - sum 8.204365e+06 - min 1.000205e+02 - std 2.664071e+02 - - >>> df = ed.DataFrame('localhost', 'flights') - >>> df[['AvgTicketPrice','timestamp', 'DestCountry']].aggregate(['sum', 'min', 'std'], numeric_only=False) - AvgTicketPrice timestamp DestCountry - sum 8.204365e+06 NaT NaN - min 1.000205e+02 2018-01-01 NaN - std 2.664071e+02 NaT NaN - - >>> df = ed.DataFrame('localhost', 'flights') - >>> df[['AvgTicketPrice','timestamp', 'DestCountry']].aggregate(['sum', 'min', 'std'], numeric_only=None) - AvgTicketPrice timestamp DestCountry - sum 8.204365e+06 NaT NaN - min 1.000205e+02 2018-01-01 NaN - std 2.664071e+02 NaT NaN - + >>> df = ed.DataFrame('localhost', 'flights').filter(['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry']) + >>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int) + AvgTicketPrice DistanceKilometers + sum 8204364 92616288 + min 100 0 + std 266 4578 + + >>> df.aggregate(['sum', 'min', 'std'], numeric_only=True) + AvgTicketPrice DistanceKilometers + sum 8.204365e+06 9.261629e+07 + min 1.000205e+02 0.000000e+00 + std 2.664071e+02 4.578614e+03 + + >>> df.aggregate(['sum', 'min', 'std'], numeric_only=False) + AvgTicketPrice DistanceKilometers timestamp DestCountry + sum 8.204365e+06 9.261629e+07 NaT NaN + min 1.000205e+02 0.000000e+00 2018-01-01 NaN + std 2.664071e+02 4.578614e+03 NaT NaN + + >>> df.aggregate(['sum', 'min', 'std'], numeric_only=None) + AvgTicketPrice DistanceKilometers timestamp DestCountry + sum 8.204365e+06 9.261629e+07 NaT NaN + min 1.000205e+02 0.000000e+00 2018-01-01 NaN + std 2.664071e+02 4.578614e+03 NaT NaN """ axis = pd.DataFrame._get_axis_number(axis) diff --git a/eland/field_mappings.py b/eland/field_mappings.py index 892d6a2c..32b71c41 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -100,7 +100,7 @@ def is_es_agg_compatible(self, es_agg) -> bool: # Cardinality works for all types # Numerics and bools work for all aggs - # except "median_absolute_deviation" which doesn't support bool + # Except 
"median_absolute_deviation" which doesn't support bool if es_agg == "median_absolute_deviation" and self.is_bool: return False if es_agg == "cardinality" or self.is_numeric or self.is_bool: diff --git a/eland/ndframe.py b/eland/ndframe.py index 78e28e2a..b0257188 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -186,62 +186,26 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.mean() - AvgTicketPrice 628.254 - Cancelled 0.128494 - DistanceKilometers 7092.14 - DistanceMiles 4406.85 - FlightDelay 0.251168 - FlightDelayMin 47.3352 - FlightTimeHour 8.5188 - FlightTimeMin 511.128 - dayOfWeek 2.83598 - timestamp 2018-01-21 19:20:45.564438232 + AvgTicketPrice 628.254 + Cancelled 0.128494 + dayOfWeek 2.83598 + timestamp 2018-01-21 19:20:45.564438232 dtype: object - >>> df = ed.DataFrame('localhost', 'flights') >>> df.mean(numeric_only=True) - AvgTicketPrice 628.253689 - Cancelled 0.128494 - DistanceKilometers 7092.142457 - DistanceMiles 4406.853010 - FlightDelay 0.251168 - FlightDelayMin 47.335171 - FlightTimeHour 8.518797 - FlightTimeMin 511.127842 - dayOfWeek 2.835975 + AvgTicketPrice 628.253689 + Cancelled 0.128494 + dayOfWeek 2.835975 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') - >>> df.mean(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 628.254 - Cancelled 0.128494 - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 7092.14 - DistanceMiles 4406.85 - FlightDelay 0.251168 - FlightDelayMin 47.3352 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 8.5188 - FlightTimeMin 511.128 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 2.83598 - timestamp 2018-01-21 19:20:45.564438232 + >>> df.mean(numeric_only=False) + AvgTicketPrice 628.254 + Cancelled 0.128494 + dayOfWeek 2.83598 + timestamp 2018-01-21 19:20:45.564438232 + DestCountry NaN dtype: object """ return self._query_compiler.mean(numeric_only=numeric_only) @@ -271,61 +235,25 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.sum() - AvgTicketPrice 8.20436e+06 - Cancelled 1678 - DistanceKilometers 9.26163e+07 - DistanceMiles 5.75491e+07 - FlightDelay 3280 - FlightDelayMin 618150 - FlightTimeHour 111247 - FlightTimeMin 6.67482e+06 - dayOfWeek 37035 + AvgTicketPrice 8.20436e+06 + Cancelled 1678 + dayOfWeek 37035 dtype: object - >>> df = ed.DataFrame('localhost', 'flights') >>> df.sum(numeric_only=True) - AvgTicketPrice 8.204365e+06 - Cancelled 1.678000e+03 - DistanceKilometers 9.261629e+07 - DistanceMiles 5.754909e+07 - FlightDelay 3.280000e+03 - FlightDelayMin 6.181500e+05 - FlightTimeHour 1.112470e+05 - FlightTimeMin 6.674818e+06 - dayOfWeek 3.703500e+04 + AvgTicketPrice 8.204365e+06 + Cancelled 1.678000e+03 + dayOfWeek 3.703500e+04 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') - >>> df.sum(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 8.20436e+06 - Cancelled 1678 - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - 
DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 9.26163e+07 - DistanceMiles 5.75491e+07 - FlightDelay 3280 - FlightDelayMin 618150 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 111247 - FlightTimeMin 6.67482e+06 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 37035 - timestamp NaT + >>> df.sum(numeric_only=False) + AvgTicketPrice 8.20436e+06 + Cancelled 1678 + dayOfWeek 37035 + timestamp NaT + DestCountry NaN dtype: object """ return self._query_compiler.sum(numeric_only=numeric_only) @@ -355,62 +283,26 @@ def min(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.min() - AvgTicketPrice 100.021 - Cancelled False - DistanceKilometers 0 - DistanceMiles 0 - FlightDelay False - FlightDelayMin 0 - FlightTimeHour 0 - FlightTimeMin 0 - dayOfWeek 0 - timestamp 2018-01-01 00:00:00 + AvgTicketPrice 100.021 + Cancelled False + dayOfWeek 0 + timestamp 2018-01-01 00:00:00 dtype: object - >>> df = ed.DataFrame('localhost', 'flights') >>> df.min(numeric_only=True) - AvgTicketPrice 100.020531 - Cancelled 0.000000 - DistanceKilometers 0.000000 - DistanceMiles 0.000000 - FlightDelay 0.000000 - FlightDelayMin 0.000000 - FlightTimeHour 0.000000 - FlightTimeMin 0.000000 - dayOfWeek 0.000000 + AvgTicketPrice 100.020531 + Cancelled 0.000000 + dayOfWeek 0.000000 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') - >>> df.min(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 100.021 - Cancelled False - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 0 - DistanceMiles 0 - FlightDelay False - FlightDelayMin 0 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 0 - FlightTimeMin 0 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 0 - timestamp 2018-01-01 00:00:00 + >>> df.min(numeric_only=False) + AvgTicketPrice 100.021 + Cancelled False + dayOfWeek 0 + timestamp 2018-01-01 00:00:00 + DestCountry NaN dtype: object """ return self._query_compiler.min(numeric_only=numeric_only) @@ -438,61 +330,25 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.var() - AvgTicketPrice 70964.6 - Cancelled 0.111987 - DistanceKilometers 2.09613e+07 - DistanceMiles 8.0932e+06 - FlightDelay 0.18809 - FlightDelayMin 9359.57 - FlightTimeHour 31.1266 - FlightTimeMin 112056 - dayOfWeek 3.76128 - dtype: object + AvgTicketPrice 70964.570234 + Cancelled 0.111987 + dayOfWeek 3.761279 + dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') >>> df.var(numeric_only=True) - AvgTicketPrice 7.096457e+04 - Cancelled 1.119874e-01 - DistanceKilometers 2.096130e+07 - DistanceMiles 8.093202e+06 - FlightDelay 1.880897e-01 - FlightDelayMin 9.359568e+03 - FlightTimeHour 3.112664e+01 - FlightTimeMin 1.120559e+05 - dayOfWeek 3.761279e+00 + AvgTicketPrice 70964.570234 + Cancelled 0.111987 + dayOfWeek 3.761279 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') - >>> 
df.var(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 70964.6 - Cancelled 0.111987 - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 2.09613e+07 - DistanceMiles 8.0932e+06 - FlightDelay 0.18809 - FlightDelayMin 9359.57 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 31.1266 - FlightTimeMin 112056 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 3.76128 - timestamp NaT + >>> df.var(numeric_only=False) + AvgTicketPrice 70964.6 + Cancelled 0.111987 + dayOfWeek 3.76128 + timestamp NaT + DestCountry NaN dtype: object """ return self._query_compiler.var(numeric_only=numeric_only) @@ -520,61 +376,25 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.std() - AvgTicketPrice 266.407 - Cancelled 0.334664 - DistanceKilometers 4578.61 - DistanceMiles 2845.02 - FlightDelay 0.433718 - FlightDelayMin 96.7504 - FlightTimeHour 5.57945 - FlightTimeMin 334.767 - dayOfWeek 1.93951 - dtype: object + AvgTicketPrice 266.407061 + Cancelled 0.334664 + dayOfWeek 1.939513 + dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') >>> df.std(numeric_only=True) - AvgTicketPrice 266.407061 - Cancelled 0.334664 - DistanceKilometers 4578.613803 - DistanceMiles 2845.018714 - FlightDelay 0.433718 - FlightDelayMin 96.750415 - FlightTimeHour 5.579446 - FlightTimeMin 334.766770 - dayOfWeek 1.939513 + AvgTicketPrice 266.407061 + Cancelled 0.334664 + dayOfWeek 1.939513 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') - >>> df.std(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 266.407 - Cancelled 0.334664 - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 4578.61 - DistanceMiles 2845.02 - FlightDelay 0.433718 - FlightDelayMin 96.7504 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 5.57945 - FlightTimeMin 334.767 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 1.93951 - timestamp NaT + >>> df.std(numeric_only=False) + AvgTicketPrice 266.407 + Cancelled 0.334664 + dayOfWeek 1.93951 + timestamp NaT + DestCountry NaN dtype: object """ return self._query_compiler.std(numeric_only=numeric_only) @@ -602,62 +422,26 @@ def median(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.median() # doctest: +SKIP - AvgTicketPrice 640.387 - Cancelled False - DistanceKilometers 7612.07 - DistanceMiles 4729.92 - FlightDelay False - FlightDelayMin 0 - FlightTimeHour 8.38582 - FlightTimeMin 503.149 - dayOfWeek 3 - timestamp 2018-01-21 23:25:13.113169922 + AvgTicketPrice 640.363 + Cancelled False + dayOfWeek 3 + timestamp 2018-01-21 23:54:06.624776611 dtype: object - >>> df = ed.DataFrame('localhost', 'flights') >>> df.median(numeric_only=True) # doctest: +SKIP - AvgTicketPrice 640.387285 - Cancelled 0.000000 - DistanceKilometers 7612.072403 - DistanceMiles 4729.922470 - FlightDelay 
0.000000 - FlightDelayMin 0.000000 - FlightTimeHour 8.385816 - FlightTimeMin 503.148975 - dayOfWeek 3.000000 + AvgTicketPrice 640.362667 + Cancelled 0.000000 + dayOfWeek 3.000000 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') >>> df.median(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 640.387 - Cancelled False - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 7612.07 - DistanceMiles 4729.92 - FlightDelay False - FlightDelayMin 0 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 8.38582 - FlightTimeMin 503.149 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 3 - timestamp 2018-01-22 00:43:09.223130126 + AvgTicketPrice 640.387 + Cancelled False + dayOfWeek 3 + timestamp 2018-01-21 23:54:06.624776611 + DestCountry NaN dtype: object """ return self._query_compiler.median(numeric_only=numeric_only) @@ -687,62 +471,26 @@ def max(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp","DestCountry"]) >>> df.max() - AvgTicketPrice 1199.73 - Cancelled True - DistanceKilometers 19881.5 - DistanceMiles 12353.8 - FlightDelay True - FlightDelayMin 360 - FlightTimeHour 31.715 - FlightTimeMin 1902.9 - dayOfWeek 6 - timestamp 2018-02-11 23:50:12 + AvgTicketPrice 1199.73 + Cancelled True + dayOfWeek 6 + timestamp 2018-02-11 23:50:12 dtype: object - >>> df = ed.DataFrame('localhost', 'flights') >>> df.max(numeric_only=True) - AvgTicketPrice 1199.729004 - Cancelled 1.000000 - DistanceKilometers 19881.482422 - DistanceMiles 12353.780273 - FlightDelay 1.000000 - FlightDelayMin 360.000000 - FlightTimeHour 31.715034 - FlightTimeMin 1902.901978 - dayOfWeek 6.000000 + AvgTicketPrice 1199.729004 + Cancelled 1.000000 + dayOfWeek 6.000000 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') - >>> df.max(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 1199.73 - Cancelled True - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 19881.5 - DistanceMiles 12353.8 - FlightDelay True - FlightDelayMin 360 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 31.715 - FlightTimeMin 1902.9 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 6 - timestamp 2018-02-11 23:50:12 + >>> df.max(numeric_only=False) + AvgTicketPrice 1199.73 + Cancelled True + dayOfWeek 6 + timestamp 2018-02-11 23:50:12 + DestCountry NaN dtype: object """ return self._query_compiler.max(numeric_only=numeric_only) @@ -808,57 +556,23 @@ def mad(self, numeric_only: bool = True) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.mad() # doctest: +SKIP - AvgTicketPrice 213.443470 - DistanceKilometers 2948.631194 - DistanceMiles 1830.663947 - FlightDelayMin 0.000000 - FlightTimeHour 3.819254 - FlightTimeMin 229.158256 - dayOfWeek 2.000000 + AvgTicketPrice 213.35497 + dayOfWeek 2.00000 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') >>> df.mad(numeric_only=True) # 
doctest: +SKIP - AvgTicketPrice 213.368709 - DistanceKilometers 2946.168236 - DistanceMiles 1829.899362 - FlightDelayMin 0.000000 - FlightTimeHour 3.819654 - FlightTimeMin 229.176708 - dayOfWeek 2.000000 + AvgTicketPrice 213.473011 + dayOfWeek 2.000000 dtype: float64 - >>> df = ed.DataFrame('localhost', 'flights') >>> df.mad(numeric_only=False) # doctest: +SKIP - AvgTicketPrice 213.451 - Cancelled NaN - Carrier NaN - Dest NaN - DestAirportID NaN - DestCityName NaN - DestCountry NaN - DestLocation NaN - DestRegion NaN - DestWeather NaN - DistanceKilometers 2946.98 - DistanceMiles 1830.66 - FlightDelay NaN - FlightDelayMin 0 - FlightDelayType NaN - FlightNum NaN - FlightTimeHour 3.81919 - FlightTimeMin 229.177 - Origin NaN - OriginAirportID NaN - OriginCityName NaN - OriginCountry NaN - OriginLocation NaN - OriginRegion NaN - OriginWeather NaN - dayOfWeek 2 - timestamp NaT + AvgTicketPrice 213.484 + Cancelled NaN + dayOfWeek 2 + timestamp NaT + DestCountry NaN dtype: object """ return self._query_compiler.mad(numeric_only=numeric_only) diff --git a/eland/operations.py b/eland/operations.py index 238a36c2..cd48ef5c 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -145,99 +145,28 @@ def count(self, query_compiler): return build_pd_series(data=counts, index=fields) - def mean(self, query_compiler, numeric_only: Optional[bool] = None): - results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only) - if numeric_only: - return build_pd_series(results, index=results.keys(), dtype=np.float64) - else: - return build_pd_series( - results, - index=results.keys(), - dtype=(None if len(results) <= 1 else "object"), - ) - - def var(self, query_compiler, numeric_only: Optional[bool] = None): - results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only) - if numeric_only: - return build_pd_series(results, index=results.keys(), dtype=np.float64) - else: - return build_pd_series( - results, - index=results.keys(), - dtype=(None if len(results) <= 1 else "object"), - ) - - def std(self, query_compiler, numeric_only: Optional[bool] = None): - results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only) - if numeric_only: - return build_pd_series(results, index=results.keys(), dtype=np.float64) - else: - return build_pd_series( - results, - index=results.keys(), - dtype=(None if len(results) <= 1 else "object"), - ) - - def median(self, query_compiler, numeric_only: Optional[bool] = None): - results = self._metric_aggs( - query_compiler, ["median"], numeric_only=numeric_only - ) - if numeric_only: - return build_pd_series(results, index=results.keys(), dtype=np.float64) - else: - return build_pd_series( - results, - index=results.keys(), - dtype=(None if len(results) <= 1 else "object"), - ) - - def sum(self, query_compiler, numeric_only: Optional[bool] = None): - results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only) - if numeric_only: - return build_pd_series(results, index=results.keys(), dtype=np.float64) - else: - return build_pd_series( - results, - index=results.keys(), - dtype=(None if len(results) <= 1 else "object"), - ) - - def max(self, query_compiler, numeric_only: Optional[bool] = None): - results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only) - if numeric_only: - return build_pd_series(results, index=results.keys(), dtype=np.float64) - else: - return build_pd_series( - results, - index=results.keys(), - dtype=(None if len(results) <= 1 else "object"), - ) - - def min(self, 
query_compiler, numeric_only: Optional[bool] = None): - results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only) - if numeric_only: - return build_pd_series(results, index=results.keys(), dtype=np.float64) - else: - return build_pd_series( - results, - index=results.keys(), - dtype=(None if len(results) <= 1 else "object"), - ) - - def nunique(self, query_compiler): - results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False) - return build_pd_series(results, index=results.keys()) - - def mad(self, query_compiler, numeric_only: Optional[bool] = None): - results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only) + def _metric_agg_series( + self, + query_compiler: "QueryCompiler", + agg: List, + numeric_only: Optional[bool] = None, + ) -> pd.Series: + results = self._metric_aggs(query_compiler, agg, numeric_only=numeric_only) if numeric_only: return build_pd_series(results, index=results.keys(), dtype=np.float64) else: - return build_pd_series( - results, - index=results.keys(), - dtype=(None if len(results) <= 1 else "object"), - ) + # If all results are float convert into float64 + if all(isinstance(i, float) for i in results.values()): + dtype = np.float64 + # If all results are int convert into int64 + elif all(isinstance(i, int) for i in results.values()): + dtype = np.int64 + # If single result is present consider that datatype instead of object + elif len(results) <= 1: + dtype = None + else: + dtype = "object" + return build_pd_series(results, index=results.keys(), dtype=dtype) def value_counts(self, query_compiler, es_size): return self._terms_aggs(query_compiler, "terms", es_size) @@ -245,13 +174,21 @@ def value_counts(self, query_compiler, es_size): def hist(self, query_compiler, bins): return self._hist_aggs(query_compiler, bins) + def aggs(self, query_compiler, pd_aggs, numeric_only=None) -> pd.DataFrame: + results = self._metric_aggs( + query_compiler, pd_aggs, numeric_only=numeric_only, is_dataframe_agg=True + ) + return pd.DataFrame( + results, index=pd_aggs, dtype=(np.float64 if numeric_only else None) + ) + def _metric_aggs( self, query_compiler: "QueryCompiler", pd_aggs, numeric_only: Optional[bool] = None, - is_aggregation: bool = False, - ): + is_dataframe_agg: bool = False, + ) -> Dict: query_params, post_processing = self._resolve_tasks(query_compiler) size = self._size(query_params, post_processing) @@ -305,16 +242,16 @@ def _metric_aggs( for field in fields: values = [] for es_agg, pd_agg in zip(es_aggs, pd_aggs): - # is_aggregation is used to differentiate agg() and non-agg() - # If the field and agg aren't compatible we add a NaN/NaT for agg() - # If the field and agg aren't compatible we don't add NaN/NaT for non-agg() + # is_dataframe_agg is used to differentiate agg() and an aggregation called through .mean() + # If the field and agg aren't compatible we add a NaN/NaT for agg + # If the field and agg aren't compatible we don't add NaN/NaT for an aggregation called through .mean() if not field.is_es_agg_compatible(es_agg): - if is_aggregation and not numeric_only: + if is_dataframe_agg and not numeric_only: values.append(field.nan_value) - elif not is_aggregation and numeric_only is False: + elif not is_dataframe_agg and numeric_only is False: values.append(field.nan_value) # Explicit condition for mad to add NaN because it doesn't support bool - elif is_aggregation and numeric_only: + elif is_dataframe_agg and numeric_only: if pd_agg == "mad": values.append(field.nan_value) continue @@ -360,9 +297,9 
@@ def _metric_aggs( # Null usually means there were no results. if agg_value is None or np.isnan(agg_value): - if is_aggregation and not numeric_only: + if is_dataframe_agg and not numeric_only: agg_value = np.NaN - elif not is_aggregation and numeric_only is False: + elif not is_dataframe_agg and numeric_only is False: agg_value = np.NaN # Cardinality is always either NaN or integer. @@ -374,7 +311,7 @@ def _metric_aggs( agg_value = elasticsearch_date_to_pandas_date( agg_value, field.es_date_format ) - # If numeric_only is False | None then maintan column datatype + # If numeric_only is False | None then maintain column datatype elif not numeric_only: # we're only converting to bool for lossless aggs like min, max, and median. if pd_agg in {"max", "min", "median", "sum"}: @@ -387,7 +324,7 @@ def _metric_aggs( values.append(agg_value) # If numeric_only is True and We only have a NaN type field then we check for empty. - if len(values) != 0: + if values: results[field.index] = values if len(values) > 1 else values[0] return results @@ -622,15 +559,6 @@ def _map_pd_aggs_to_es_aggs(pd_aggs): return es_aggs - def aggs(self, query_compiler, pd_aggs, numeric_only=None): - results = self._metric_aggs( - query_compiler, pd_aggs, numeric_only=numeric_only, is_aggregation=True - ) - if numeric_only: - return pd.DataFrame(results, index=pd_aggs, dtype=np.float64) - else: - return pd.DataFrame(results, index=pd_aggs) - def filter(self, query_compiler, items=None, like=None, regex=None): # This function is only called for axis='index', # DataFrame.filter(..., axis="columns") calls .drop() diff --git a/eland/query_compiler.py b/eland/query_compiler.py index 10bdc978..60309f7f 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -17,7 +17,7 @@ import copy from datetime import datetime -from typing import Optional, TYPE_CHECKING +from typing import Optional, TYPE_CHECKING, List import numpy as np import pandas as pd @@ -490,38 +490,56 @@ def filter(self, items=None, like=None, regex=None): result._operations.filter(self, items=items, like=like, regex=regex) return result - def aggs(self, func, numeric_only: Optional[bool] = None): + def aggs(self, func: List[str], numeric_only: Optional[bool] = None): return self._operations.aggs(self, func, numeric_only=numeric_only) def count(self): return self._operations.count(self) def mean(self, numeric_only: Optional[bool] = None): - return self._operations.mean(self, numeric_only=numeric_only) + return self._operations._metric_agg_series( + self, ["mean"], numeric_only=numeric_only + ) def var(self, numeric_only: Optional[bool] = None): - return self._operations.var(self, numeric_only=numeric_only) + return self._operations._metric_agg_series( + self, ["var"], numeric_only=numeric_only + ) def std(self, numeric_only: Optional[bool] = None): - return self._operations.std(self, numeric_only=numeric_only) + return self._operations._metric_agg_series( + self, ["std"], numeric_only=numeric_only + ) def mad(self, numeric_only: Optional[bool] = None): - return self._operations.mad(self, numeric_only=numeric_only) + return self._operations._metric_agg_series( + self, ["mad"], numeric_only=numeric_only + ) def median(self, numeric_only: Optional[bool] = None): - return self._operations.median(self, numeric_only=numeric_only) + return self._operations._metric_agg_series( + self, ["median"], numeric_only=numeric_only + ) def sum(self, numeric_only: Optional[bool] = None): - return self._operations.sum(self, numeric_only=numeric_only) + return 
self._operations._metric_agg_series( + self, ["sum"], numeric_only=numeric_only + ) def min(self, numeric_only: Optional[bool] = None): - return self._operations.min(self, numeric_only=numeric_only) + return self._operations._metric_agg_series( + self, ["min"], numeric_only=numeric_only + ) def max(self, numeric_only: Optional[bool] = None): - return self._operations.max(self, numeric_only=numeric_only) + return self._operations._metric_agg_series( + self, ["max"], numeric_only=numeric_only + ) def nunique(self): - return self._operations.nunique(self) + return self._operations._metric_agg_series( + self, ["nunique"], numeric_only=False + ) def value_counts(self, es_size): return self._operations.value_counts(self, es_size) diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index 4d108b8d..e483f471 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -87,8 +87,8 @@ def test_aggs_median_var(self): ["taxful_total_price", "taxless_total_price", "total_quantity"] ].agg(["median", "var"], numeric_only=True) - # print(pd_aggs, pd_aggs.dtypes) - # print(ed_aggs, ed_aggs.dtypes) + print(pd_aggs, pd_aggs.dtypes) + print(ed_aggs, ed_aggs.dtypes) # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py index b4869825..dbef894e 100644 --- a/eland/tests/dataframe/test_metrics_pytest.py +++ b/eland/tests/dataframe/test_metrics_pytest.py @@ -34,6 +34,28 @@ class TestDataFrameMetrics(TestData): "DestCountry", ] + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_flights_metrics(self, numeric_only): + pd_flights = self.pd_flights() + ed_flights = self.ed_flights() + + for func in self.funcs: + # Pandas v1.0 doesn't support mean() on datetime + # Pandas and Eland don't support sum() on datetime + if not numeric_only: + dtype_include = ( + [np.number, np.datetime64] + if func not in ("mean", "sum") + else [np.number] + ) + pd_flights = pd_flights.select_dtypes(include=dtype_include) + ed_flights = ed_flights.select_dtypes(include=dtype_include) + + pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only) + ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only) + + assert_series_equal(pd_metric, ed_metric, check_dtype=False) + def test_flights_extended_metrics(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() @@ -93,8 +115,8 @@ def test_ecommerce_selected_non_numeric_source_fields(self): "user", ] - pd_ecommerce = self.pd_ecommerce().filter(columns) - ed_ecommerce = self.ed_ecommerce().filter(columns) + pd_ecommerce = self.pd_ecommerce()[columns] + ed_ecommerce = self.ed_ecommerce()[columns] for func in self.funcs: assert_series_equal( @@ -115,8 +137,8 @@ def test_ecommerce_selected_mixed_numeric_source_fields(self): "user", ] - pd_ecommerce = self.pd_ecommerce().filter(columns) - ed_ecommerce = self.ed_ecommerce().filter(columns) + pd_ecommerce = self.pd_ecommerce()[columns] + ed_ecommerce = self.ed_ecommerce()[columns] for func in self.funcs: assert_series_equal( @@ -129,8 +151,8 @@ def test_ecommerce_selected_all_numeric_source_fields(self): # All of these are numeric columns = ["total_quantity", "taxful_total_price", "taxless_total_price"] - pd_ecommerce = self.pd_ecommerce().filter(columns) - ed_ecommerce = self.ed_ecommerce().filter(columns) + pd_ecommerce = self.pd_ecommerce()[columns] + 
ed_ecommerce = self.ed_ecommerce()[columns] for func in self.funcs: assert_series_equal( @@ -171,8 +193,10 @@ def test_flights_datetime_metrics_single_agg(self, agg): ed_metric = ed_timestamps.agg([agg]) if agg == "nunique": + # df with timestamp column should return int64 assert ed_metric.dtypes["timestamp"] == np.int64 else: + # df with timestamp column should return datetime64[ns] assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]") assert ed_metric["timestamp"][0] == expected_values[agg] @@ -209,7 +233,7 @@ def test_flights_datetime_metrics_median(self): ) def test_metric_agg_keep_dtypes(self): - # max, min, and median maintain their dtypes for numeric_only=None + # max, min and median maintain their dtypes df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]] assert df.min().tolist() == [131.81910705566406, False, 0] assert df.max().tolist() == [989.9527587890625, True, 0] @@ -229,373 +253,162 @@ def test_metric_agg_keep_dtypes(self): "Cancelled": {"max": True, "median": False, "min": False}, "dayOfWeek": {"max": 0, "median": 0, "min": 0}, } + # sum should always be the same dtype as the input, except for bool where the sum of bools should be an int64. + sum_agg = df.agg(["sum"]) + assert sum_agg.dtypes.to_list() == [ + np.dtype("float64"), + np.dtype("int64"), + np.dtype("int64"), + ] + assert sum_agg.to_dict() == { + "AvgTicketPrice": {"sum": 26521.624084472656}, + "Cancelled": {"sum": 6}, + "dayOfWeek": {"sum": 0}, + } def test_flights_numeric_only(self): - filter_data = [ - "AvgTicketPrice", - "Cancelled", - "dayOfWeek", - "timestamp", - "DestCountry", - ] # All Aggregations Data Check - ed_flights = self.ed_flights().filter(filter_data) - pd_flights = self.pd_flights().filter(filter_data) + ed_flights = self.ed_flights().filter(self.filter_data) + pd_flights = self.pd_flights().filter(self.filter_data) # agg => numeric_only True returns float64 values - # We compare it with individual non-agg functions of pandas with numeric_only=True - # not checking mad because it returns nan value for booleans. 
+ # We compare it with individual single agg functions of pandas with numeric_only=True filtered_aggs = self.funcs + self.extended_funcs - filtered_aggs.remove("mad") agg_data = ed_flights.agg(filtered_aggs, numeric_only=True).transpose() for agg in filtered_aggs: - assert_series_equal( - agg_data[agg].rename(None), - getattr(pd_flights, agg)( - **({"numeric_only": True} if agg != "mad" else {}) - ), - check_exact=False, - rtol=True, - ) - - # Mean - @pytest.mark.parametrize("numeric_only", [True, False, None]) - def test_mean_numeric_only(self, numeric_only): + # Explicitly check for mad because it returns nan for bools + if agg == "mad": + assert np.isnan(agg_data[agg]["Cancelled"]) + else: + assert_series_equal( + agg_data[agg].rename(None), + getattr(pd_flights, agg)(numeric_only=True), + check_exact=False, + rtol=True, + ) + + # all single aggs return float64 for numeric_only=True + def test_numeric_only_true_single_aggs(self): ed_flights = self.ed_flights().filter(self.filter_data) - if numeric_only is True: - calculated_values = ed_flights.mean(numeric_only=numeric_only) - assert calculated_values.to_list() == [ - 628.2536888148849, - 0.1284937590933456, - 2.835975189524466, - ] - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - np.dtype("float64"), - ] - elif numeric_only is False: - calculated_values = ed_flights.mean(numeric_only=numeric_only) - assert isinstance(calculated_values["timestamp"], pd.Timestamp) - assert np.isnan(calculated_values["DestCountry"]) - calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert calculated_values.to_list() == [ - 628.2536888148849, - 0.1284937590933456, - 2.835975189524466, - ] - assert isinstance(calculated_values["AvgTicketPrice"], float) - assert isinstance(calculated_values["dayOfWeek"], float) - assert isinstance(calculated_values["Cancelled"], float) - elif numeric_only is None: - calculated_values = ed_flights.mean(numeric_only=numeric_only) - assert calculated_values.to_list() == [ - 628.2536888148849, - 0.1284937590933456, - 2.835975189524466, - pd.Timestamp("2018-01-21 19:20:45.564438232"), - ] - assert isinstance(calculated_values["timestamp"], pd.Timestamp) - calculated_values = calculated_values.drop("timestamp") - assert isinstance(calculated_values["AvgTicketPrice"], float) - assert isinstance(calculated_values["dayOfWeek"], float) - assert isinstance(calculated_values["Cancelled"], float) - - # Min - @pytest.mark.parametrize("numeric_only", [True, False, None]) - def test_min_numeric_only(self, numeric_only): + for agg in self.funcs + self.extended_funcs: + result = getattr(ed_flights, agg)(numeric_only=True) + assert result.dtype == np.dtype("float64") + assert result.shape == ((3,) if agg != "mad" else (2,)) + + # check dtypes and shape of min, max and median for numeric_only=False | None + @pytest.mark.parametrize("agg", ["min", "max", "median"]) + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_min_max_median_numeric_only(self, agg, numeric_only): ed_flights = self.ed_flights().filter(self.filter_data) - if numeric_only is True: - calculated_values = ed_flights.min(numeric_only=numeric_only) - assert calculated_values.to_list() == [100.0205307006836, 0.0, 0.0] - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - np.dtype("float64"), - ] - elif numeric_only is False: - calculated_values 
= ed_flights.min(numeric_only=numeric_only) - assert isinstance(calculated_values["timestamp"], pd.Timestamp) - assert np.isnan(calculated_values["DestCountry"]) - calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert calculated_values.to_list() == [100.0205307006836, 0, False] + if numeric_only is False: + calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only) assert isinstance(calculated_values["AvgTicketPrice"], np.float64) - assert isinstance(calculated_values["dayOfWeek"], np.int64) assert isinstance(calculated_values["Cancelled"], np.bool_) - elif numeric_only is None: - calculated_values = ed_flights.min(numeric_only=numeric_only) - assert calculated_values.to_list() == [ - 100.0205307006836, - 0, - False, - pd.Timestamp("2018-01-01 00:00:00"), - ] - assert isinstance(calculated_values["timestamp"], pd.Timestamp) - assert isinstance(calculated_values["AvgTicketPrice"], np.float64) assert isinstance(calculated_values["dayOfWeek"], np.int64) - assert isinstance(calculated_values["Cancelled"], np.bool_) - - # max - @pytest.mark.parametrize("numeric_only", [True, False, None]) - def test_max_numeric_only(self, numeric_only): - ed_flights = self.ed_flights().filter(self.filter_data) - if numeric_only is True: - calculated_values = ed_flights.max(numeric_only=numeric_only) - assert calculated_values.to_list() == [1199.72900390625, 1.0, 6.0] - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - np.dtype("float64"), - ] - elif numeric_only is False: - calculated_values = ed_flights.max(numeric_only=numeric_only) assert isinstance(calculated_values["timestamp"], pd.Timestamp) assert np.isnan(calculated_values["DestCountry"]) - calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert calculated_values.to_list() == [1199.72900390625, True, 6] - assert isinstance(calculated_values["AvgTicketPrice"], np.float64) - assert isinstance(calculated_values["dayOfWeek"], np.int64) - assert isinstance(calculated_values["Cancelled"], np.bool_) + assert calculated_values.shape == (5,) elif numeric_only is None: - calculated_values = ed_flights.max(numeric_only=numeric_only) - assert isinstance(calculated_values["timestamp"], pd.Timestamp) - calculated_values = calculated_values.drop("timestamp") - assert calculated_values.to_list() == [1199.72900390625, True, 6] + calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only) assert isinstance(calculated_values["AvgTicketPrice"], np.float64) - assert isinstance(calculated_values["dayOfWeek"], np.int64) assert isinstance(calculated_values["Cancelled"], np.bool_) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert calculated_values.shape == (4,) - # sum - @pytest.mark.parametrize("numeric_only", [True, False, None]) + # check dtypes and shape for sum + @pytest.mark.parametrize("numeric_only", [False, None]) def test_sum_numeric_only(self, numeric_only): ed_flights = self.ed_flights().filter(self.filter_data) - if numeric_only is True: - calculated_values = ed_flights.sum(numeric_only=numeric_only) - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert calculated_values.to_list() == [8204364.922233582, 1678.0, 37035.0] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - np.dtype("float64"), - ] - elif numeric_only is False: + if numeric_only is False: 
calculated_values = ed_flights.sum(numeric_only=numeric_only) - assert pd.isnull(calculated_values["timestamp"]) - assert np.isnan(calculated_values["DestCountry"]) - calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert calculated_values.to_list() == [8204364.922233582, 1678, 37035] assert isinstance(calculated_values["AvgTicketPrice"], np.float64) assert isinstance(calculated_values["dayOfWeek"], np.int64) assert isinstance(calculated_values["Cancelled"], np.int64) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) elif numeric_only is None: calculated_values = ed_flights.sum(numeric_only=numeric_only) - assert calculated_values.to_list() == [8204364.922233582, 1678, 37035] dtype_list = [calculated_values[i].dtype for i in calculated_values.index] assert dtype_list == [ np.dtype("float64"), np.dtype("int64"), np.dtype("int64"), ] + assert calculated_values.shape == (3,) - # std - @pytest.mark.parametrize("numeric_only", [True, False, None]) + # check dtypes and shape for std + @pytest.mark.parametrize("numeric_only", [False, None]) def test_std_numeric_only(self, numeric_only): ed_flights = self.ed_flights().filter(self.filter_data) - if numeric_only is True: + if numeric_only is False: calculated_values = ed_flights.std(numeric_only=numeric_only) - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert calculated_values.to_list() == [ - 266.4070611666801, - 0.33466440694020916, - 1.9395130445445228, - ] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - np.dtype("float64"), - ] - elif numeric_only is False: - calculated_values = ed_flights.std(numeric_only=numeric_only) - assert pd.isnull(calculated_values["timestamp"]) - assert np.isnan(calculated_values["DestCountry"]) - calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert calculated_values.to_list() == [ - 266.4070611666801, - 0.33466440694020916, - 1.9395130445445228, - ] assert isinstance(calculated_values["AvgTicketPrice"], float) - assert isinstance(calculated_values["dayOfWeek"], float) assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) elif numeric_only is None: calculated_values = ed_flights.std(numeric_only=numeric_only) - assert calculated_values.to_list() == [ - 266.4070611666801, - 0.33466440694020916, - 1.9395130445445228, - ] assert isinstance(calculated_values["AvgTicketPrice"], float) - assert isinstance(calculated_values["dayOfWeek"], float) assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert calculated_values.shape == (3,) - # var - @pytest.mark.parametrize("numeric_only", [True, False, None]) + # check dtypes and shape for var + @pytest.mark.parametrize("numeric_only", [False, None]) def test_var_numeric_only(self, numeric_only): ed_flights = self.ed_flights().filter(self.filter_data) - if numeric_only is True: - calculated_values = ed_flights.var(numeric_only=numeric_only) - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert calculated_values.to_list() == [ - 70964.57023354847, - 0.111987400797438, - 3.7612787756607213, - ] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - 
np.dtype("float64"), - ] - elif numeric_only is False: + if numeric_only is False: calculated_values = ed_flights.var(numeric_only=numeric_only) - assert pd.isnull(calculated_values["timestamp"]) - assert np.isnan(calculated_values["DestCountry"]) - calculated_values = calculated_values.drop(["timestamp", "DestCountry"]) - assert calculated_values.to_list() == [ - 70964.57023354847, - 0.111987400797438, - 3.7612787756607213, - ] assert isinstance(calculated_values["AvgTicketPrice"], np.float64) assert isinstance(calculated_values["dayOfWeek"], np.float64) assert isinstance(calculated_values["Cancelled"], np.float64) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) elif numeric_only is None: calculated_values = ed_flights.var(numeric_only=numeric_only) - assert calculated_values.to_list() == [ - 70964.57023354847, - 0.111987400797438, - 3.7612787756607213, - ] - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - np.dtype("float64"), - ] + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert calculated_values.shape == (3,) - # median - @pytest.mark.parametrize("numeric_only", [True, False, None]) - def test_median_numeric_only(self, numeric_only): + # check dtypes and shape for mean + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_mean_numeric_only(self, numeric_only): ed_flights = self.ed_flights().filter(self.filter_data) - if numeric_only is True: - calculated_values = ed_flights.median(numeric_only=numeric_only) - dtype_list = [calculated_values[i].dtype for i in calculated_values.index] - assert ( - (calculated_values.to_dict()["AvgTicketPrice"] * 0.9) - <= 640.3872852064159 - <= (calculated_values.to_dict()["AvgTicketPrice"] * 1.1) - ) - assert calculated_values["Cancelled"] == 0.0 - assert calculated_values["dayOfWeek"] == 3.0 - assert dtype_list == [ - np.dtype("float64"), - np.dtype("float64"), - np.dtype("float64"), - ] - elif numeric_only is False: - expected_values = { - "AvgTicketPrice": 640.3222933002547, - "Cancelled": False, - "dayOfWeek": 3, - "timestamp": pd.Timestamp("2018-01-21 23:58:10.414120850"), - } - calculated_values = ed_flights.median(numeric_only=numeric_only) + if numeric_only is False: + calculated_values = ed_flights.mean(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert isinstance(calculated_values["Cancelled"], float) assert isinstance(calculated_values["timestamp"], pd.Timestamp) assert np.isnan(calculated_values["DestCountry"]) - assert ( - expected_values["Cancelled"] == calculated_values.to_dict()["Cancelled"] - ) - assert ( - (calculated_values.to_dict()["AvgTicketPrice"] * 0.9) - <= expected_values["AvgTicketPrice"] - <= (calculated_values.to_dict()["AvgTicketPrice"] * 1.1) - ) - assert ( - pd.to_datetime("2018-01-21 23:00:00.000") - <= expected_values["timestamp"] - <= pd.to_datetime("2018-01-21 23:59:59.000") - ) - assert ( - expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] - ) - assert isinstance(calculated_values["Cancelled"], np.bool_) - assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert calculated_values.shape == (5,) elif numeric_only is None: - 
expected_values = { - "AvgTicketPrice": 640.3872852064159, - "Cancelled": False, - "dayOfWeek": 3, - "timestamp": pd.Timestamp("2018-01-21 23:58:10.414120850"), - } - calculated_values = ed_flights.median(numeric_only=numeric_only) + calculated_values = ed_flights.mean(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["dayOfWeek"], float) assert isinstance(calculated_values["timestamp"], pd.Timestamp) - assert ( - (calculated_values.to_dict()["AvgTicketPrice"] * 0.9) - <= expected_values["AvgTicketPrice"] - <= (calculated_values.to_dict()["AvgTicketPrice"] * 1.1) - ) - assert ( - pd.to_datetime("2018-01-21 23:00:00.000") - <= expected_values["timestamp"] - <= pd.to_datetime("2018-01-21 23:59:00.000") - ) - assert isinstance(calculated_values["Cancelled"], np.bool_) - assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert calculated_values.shape == (4,) - # mad - @pytest.mark.parametrize("numeric_only", [True, False, None]) + # check dtypes and shape for mad + @pytest.mark.parametrize("numeric_only", [False, None]) def test_mad_numeric_only(self, numeric_only): ed_flights = self.ed_flights().filter(self.filter_data) - if numeric_only is True: - expected_values = {"AvgTicketPrice": 213.47889841845912, "dayOfWeek": 2.0} - calculated_values = ed_flights.mad(numeric_only=numeric_only) - assert ( - expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] - ) - assert ( - (calculated_values["AvgTicketPrice"] * 0.9) - <= expected_values["AvgTicketPrice"] - <= (calculated_values["AvgTicketPrice"] * 1.1) - ) - assert calculated_values["AvgTicketPrice"].dtype == np.dtype("float64") - elif numeric_only is False: - expected_values = {"AvgTicketPrice": 213.36870923117985, "dayOfWeek": 2.0} + if numeric_only is False: calculated_values = ed_flights.mad(numeric_only=numeric_only) - assert pd.isnull(calculated_values["timestamp"]) - assert np.isnan(calculated_values["DestCountry"]) - assert np.isnan(calculated_values["Cancelled"]) - calculated_values = calculated_values.drop( - ["timestamp", "DestCountry", "Cancelled"] - ) - assert ( - expected_values["dayOfWeek"] == calculated_values.to_dict()["dayOfWeek"] - ) - assert ( - (calculated_values["AvgTicketPrice"] * 0.9) - <= expected_values["AvgTicketPrice"] - <= (calculated_values["AvgTicketPrice"] * 1.1) - ) assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["Cancelled"], np.float64) assert isinstance(calculated_values["dayOfWeek"], float) - + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) elif numeric_only is None: - expected_values = {"AvgTicketPrice": 213.4408885767035, "dayOfWeek": 2.0} calculated_values = ed_flights.mad(numeric_only=numeric_only) - assert ( - (calculated_values["AvgTicketPrice"] * 0.9) - <= expected_values["AvgTicketPrice"] - <= (calculated_values["AvgTicketPrice"] * 1.1) - ) assert isinstance(calculated_values["AvgTicketPrice"], float) assert isinstance(calculated_values["dayOfWeek"], float) + assert calculated_values.shape == (2,) From 0d8cec9b4a2cef026f0fbf4428d523f03e91a053 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Tue, 22 Sep 2020 19:56:16 +0530 Subject: [PATCH 4/4] Update .filter to column[...] 
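
This commit only touches the doctest setup lines: the example DataFrames are now
constructed with an explicit column list instead of chaining `.filter(...)` after
construction. A minimal sketch of the two setups (both forms appear in the diffs
below and should yield the same projected frame; column names are the ones the
docstrings already use):

    # before: project columns by filtering after construction
    df = ed.DataFrame('localhost', 'flights').filter(
        ["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
    )

    # after: pass the projection directly to the constructor
    df = ed.DataFrame(
        'localhost', 'flights',
        columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"],
    )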
--- eland/dataframe.py | 2 +- eland/ndframe.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index b0e1555c..bd25a75c 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1380,7 +1380,7 @@ def aggregate( Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry']) + >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry']) >>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int) AvgTicketPrice DistanceKilometers sum 8204364 92616288 diff --git a/eland/ndframe.py b/eland/ndframe.py index b0257188..6659ee28 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -186,7 +186,7 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.mean() AvgTicketPrice 628.254 Cancelled 0.128494 @@ -235,7 +235,7 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.sum() AvgTicketPrice 8.20436e+06 Cancelled 1678 @@ -283,7 +283,7 @@ def min(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.min() AvgTicketPrice 100.021 Cancelled False @@ -330,7 +330,7 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.var() AvgTicketPrice 70964.570234 Cancelled 0.111987 @@ -376,7 +376,7 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.std() AvgTicketPrice 266.407061 Cancelled 0.334664 @@ -422,7 +422,7 @@ def median(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.median() # doctest: +SKIP AvgTicketPrice 640.363 Cancelled False @@ -471,7 +471,7 @@ def max(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", 
"timestamp","DestCountry"]) + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.max() AvgTicketPrice 1199.73 Cancelled True @@ -556,7 +556,7 @@ def mad(self, numeric_only: bool = True) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights').filter(["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.mad() # doctest: +SKIP AvgTicketPrice 213.35497 dayOfWeek 2.00000