Skip to content

Commit

Permalink
Fix Series.describe(), median agg dtype
Browse files Browse the repository at this point in the history
  • Loading branch information
sethmlarson committed Aug 17, 2020
1 parent f5b37e6 commit 68c2937
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 5 deletions.
2 changes: 1 addition & 1 deletion eland/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr
)

# These aggregations maintain the column datatype
elif pd_agg in ("max", "min"):
elif pd_agg in {"max", "min", "median"}:
agg_value = field.np_dtype.type(agg_value)

values.append(agg_value)
Expand Down
43 changes: 39 additions & 4 deletions eland/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ def to_pandas(self, show_progress: bool = False) -> pd.Series:
return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]

@property
def _dtype(self) -> np.dtype:
def dtype(self) -> np.dtype:
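# NOTE(review): this property was previously private (`_dtype`) with the warning "DO NOT MAKE PUBLIC (i.e. def dtype) as this breaks query eval implementation"; it is now exposed as `dtype` — confirm the query eval implementation still works with the public name.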
return self._query_compiler.dtypes[0]

Expand Down Expand Up @@ -1192,7 +1192,7 @@ def _numeric_op(self, right: Any, method_name: str) -> "Series":
self._query_compiler.check_arithmetics(right._query_compiler)

right_object = ArithmeticSeries(
right._query_compiler, right.name, right._dtype
right._query_compiler, right.name, right.dtype
)
display_name = None
elif np.issubdtype(np.dtype(type(right)), np.number):
Expand All @@ -1204,11 +1204,11 @@ def _numeric_op(self, right: Any, method_name: str) -> "Series":
else:
raise TypeError(
f"unsupported operation type(s) [{method_name!r}] "
f"for operands ['{type(self)}' with dtype '{self._dtype}', "
f"for operands ['{type(self)}' with dtype '{self.dtype}', "
f"'{type(right).__name__}']"
)

left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype)
left_object = ArithmeticSeries(self._query_compiler, self.name, self.dtype)
left_object.arithmetic_operation(method_name, right_object)

series = Series(
Expand Down Expand Up @@ -1430,6 +1430,41 @@ def mad(self, numeric_only=None):
results = super().mad(numeric_only=numeric_only)
return results.squeeze()

def describe(self) -> pd.Series:
    """
    Generate descriptive statistics for this series, excluding NaN values.

    The work is delegated to the parent class's ``describe()``; its result is
    then squeezed so that the caller receives a ``pandas.Series``.

    TODO - add additional arguments (currently only numeric values are supported)

    Returns
    -------
    pandas.Series:
        Summary information

    See Also
    --------
    :pandas_api_docs:`pandas.Series.describe`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
    count 13059.000000
    mean 628.253689
    std 266.386661
    min 100.020531
    ...
    ...
    ...
    max 1199.729004
    Name: AvgTicketPrice, dtype: float64
    """
    # The parent implementation produces the summary; squeeze() reduces it
    # to the Series shape promised by the return annotation.
    summary = super().describe()
    return summary.squeeze()

# def values TODO - not implemented as causes current implementation of query to fail

def to_numpy(self):
Expand Down
22 changes: 22 additions & 0 deletions eland/tests/dataframe/test_metrics_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,3 +228,25 @@ def test_flights_datetime_metrics_median(self):
<= median
<= pd.to_datetime("2018-01-01 12:00:00.000")
)

def test_metric_agg_keep_dtypes(self):
    """Check that max, min, and median keep the dtype of each column."""
    columns = ["AvgTicketPrice", "Cancelled", "dayOfWeek"]
    flights = self.ed_flights_small()[columns]

    # Single aggregations: one value per selected column.
    assert flights.min().tolist() == [131.81910705566406, False, 0]
    assert flights.max().tolist() == [989.9527587890625, True, 0]
    assert flights.median().tolist() == [550.276123046875, False, 0]

    # Several aggregations at once: every column must keep its own dtype.
    combined = flights.agg(["min", "max", "median"])
    expected_dtypes = [
        np.dtype("float64"),
        np.dtype("bool"),
        np.dtype("int64"),
    ]
    assert combined.dtypes.tolist() == expected_dtypes

    expected_values = {
        "AvgTicketPrice": {
            "max": 989.9527587890625,
            "median": 550.276123046875,
            "min": 131.81910705566406,
        },
        "Cancelled": {"max": True, "median": False, "min": False},
        "dayOfWeek": {"max": 0, "median": 0, "min": 0},
    }
    assert combined.to_dict() == expected_values
40 changes: 40 additions & 0 deletions eland/tests/series/test_describe_pytest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pandas as pd
from eland.tests.common import TestData, assert_series_equal


class TestSeriesDescribe(TestData):
    """Compare eland's ``Series.describe()`` with pandas on the small flights data."""

    def test_series_describe(self):
        """The eland summary must match the pandas summary in type, shape, dtype and index."""
        actual = self.ed_flights_small().AvgTicketPrice.describe()
        expected = self.pd_flights_small().AvgTicketPrice.describe()

        assert isinstance(actual, pd.Series)
        assert actual.shape == expected.shape
        assert actual.dtype == expected.dtype
        assert actual.index.equals(expected.index)

        # Percentiles calculations vary for Elasticsearch, so only the
        # remaining statistics are compared (with a relative tolerance).
        compared = ["count", "mean", "std", "min", "max"]
        assert_series_equal(actual[compared], expected[compared], rtol=0.2)

0 comments on commit 68c2937

Please sign in to comment.