Fix Series.describe(), median agg dtype

elastic · Aug 17, 2020 · 68c2937 · 68c2937
1 parent f5b37e6
commit 68c2937
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 5 deletions.
diff --git a/eland/operations.py b/eland/operations.py
@@ -301,7 +301,7 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr
                     )
 
                 # These aggregations maintain the column datatype
-                elif pd_agg in ("max", "min"):
+                elif pd_agg in {"max", "min", "median"}:
                     agg_value = field.np_dtype.type(agg_value)
 
                 values.append(agg_value)

diff --git a/eland/series.py b/eland/series.py
@@ -425,7 +425,7 @@ def to_pandas(self, show_progress: bool = False) -> pd.Series:
         return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]
 
     @property
-    def _dtype(self) -> np.dtype:
+    def dtype(self) -> np.dtype:
         # DO NOT MAKE PUBLIC (i.e. def dtype) as this breaks query eval implementation
         return self._query_compiler.dtypes[0]
 
@@ -1192,7 +1192,7 @@ def _numeric_op(self, right: Any, method_name: str) -> "Series":
             self._query_compiler.check_arithmetics(right._query_compiler)
 
             right_object = ArithmeticSeries(
-                right._query_compiler, right.name, right._dtype
+                right._query_compiler, right.name, right.dtype
             )
             display_name = None
         elif np.issubdtype(np.dtype(type(right)), np.number):
@@ -1204,11 +1204,11 @@ def _numeric_op(self, right: Any, method_name: str) -> "Series":
         else:
             raise TypeError(
                 f"unsupported operation type(s) [{method_name!r}] "
-                f"for operands ['{type(self)}' with dtype '{self._dtype}', "
+                f"for operands ['{type(self)}' with dtype '{self.dtype}', "
                 f"'{type(right).__name__}']"
             )
 
-        left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype)
+        left_object = ArithmeticSeries(self._query_compiler, self.name, self.dtype)
         left_object.arithmetic_operation(method_name, right_object)
 
         series = Series(
@@ -1430,6 +1430,41 @@ def mad(self, numeric_only=None):
         results = super().mad(numeric_only=numeric_only)
         return results.squeeze()
 
+    def describe(self) -> pd.Series:
+        """
+        Generate descriptive statistics that summarize the central tendency, dispersion and shape of a
+        dataset’s distribution, excluding NaN values.
+
+        Only numeric series are currently supported: object series are not analyzed and, unlike
+        pandas.Series.describe, no arguments are accepted yet (see the TODO below).
+
+        TODO - add additional arguments (currently only numeric values are supported)
+
+        Returns
+        -------
+        pandas.Series:
+            Summary information
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.Series.describe`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
+        count    13059.000000
+        mean       628.253689
+        std        266.386661
+        min        100.020531
+        ...
+        ...
+        ...
+        max       1199.729004
+        Name: AvgTicketPrice, dtype: float64
+        """
+        return super().describe().squeeze()
+
     # def values TODO - not implemented as causes current implementation of query to fail
 
     def to_numpy(self):

diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py
@@ -228,3 +228,25 @@ def test_flights_datetime_metrics_median(self):
             <= median
             <= pd.to_datetime("2018-01-01 12:00:00.000")
         )
+
+    def test_metric_agg_keep_dtypes(self):
+        # max, min, and median maintain their dtypes
+        df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]
+        assert df.min().tolist() == [131.81910705566406, False, 0]
+        assert df.max().tolist() == [989.9527587890625, True, 0]
+        assert df.median().tolist() == [550.276123046875, False, 0]
+        all_agg = df.agg(["min", "max", "median"])
+        assert all_agg.dtypes.tolist() == [
+            np.dtype("float64"),
+            np.dtype("bool"),
+            np.dtype("int64"),
+        ]
+        assert all_agg.to_dict() == {
+            "AvgTicketPrice": {
+                "max": 989.9527587890625,
+                "median": 550.276123046875,
+                "min": 131.81910705566406,
+            },
+            "Cancelled": {"max": True, "median": False, "min": False},
+            "dayOfWeek": {"max": 0, "median": 0, "min": 0},
+        }
diff --git a/eland/tests/series/test_describe_pytest.py b/eland/tests/series/test_describe_pytest.py
@@ -0,0 +1,40 @@
+#  Licensed to Elasticsearch B.V. under one or more contributor
+#  license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright
+#  ownership. Elasticsearch B.V. licenses this file to you under
+#  the Apache License, Version 2.0 (the "License"); you may
+#  not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+# 	http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+import pandas as pd
+from eland.tests.common import TestData, assert_series_equal
+
+
+class TestSeriesDescribe(TestData):
+    def test_series_describe(self):
+        ed_df = self.ed_flights_small()
+        pd_df = self.pd_flights_small()
+
+        ed_desc = ed_df.AvgTicketPrice.describe()
+        pd_desc = pd_df.AvgTicketPrice.describe()
+
+        assert isinstance(ed_desc, pd.Series)
+        assert ed_desc.shape == pd_desc.shape
+        assert ed_desc.dtype == pd_desc.dtype
+        assert ed_desc.index.equals(pd_desc.index)
+
+        # Percentile calculations vary in Elasticsearch, so only the other statistics are compared
+        assert_series_equal(
+            ed_desc[["count", "mean", "std", "min", "max"]],
+            pd_desc[["count", "mean", "std", "min", "max"]],
+            rtol=0.2,
+        )