From 1f2ae026b6afd21579be14e89f9194f5c48d6599 Mon Sep 17 00:00:00 2001 From: Jabin Kong <1059978534@qq.com> Date: Wed, 18 Aug 2021 15:50:38 +0800 Subject: [PATCH 01/11] Add iterrows() and itertuples() DataFrame API --- .../api/eland.DataFrame.iterrows.rst | 6 ++ .../api/eland.DataFrame.itertuples.rst | 6 ++ docs/sphinx/reference/dataframe.rst | 2 + eland/dataframe.py | 100 +++++++++++++++++- eland/operations.py | 59 +++++++++++ eland/query_compiler.py | 33 ++++++ .../test_iterrows_itertuples_pytest.py | 45 ++++++++ 7 files changed, 250 insertions(+), 1 deletion(-) create mode 100644 docs/sphinx/reference/api/eland.DataFrame.iterrows.rst create mode 100644 docs/sphinx/reference/api/eland.DataFrame.itertuples.rst create mode 100644 tests/dataframe/test_iterrows_itertuples_pytest.py diff --git a/docs/sphinx/reference/api/eland.DataFrame.iterrows.rst b/docs/sphinx/reference/api/eland.DataFrame.iterrows.rst new file mode 100644 index 00000000..2e3812ce --- /dev/null +++ b/docs/sphinx/reference/api/eland.DataFrame.iterrows.rst @@ -0,0 +1,6 @@ +eland.DataFrame.iterrows +======================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.iterrows diff --git a/docs/sphinx/reference/api/eland.DataFrame.itertuples.rst b/docs/sphinx/reference/api/eland.DataFrame.itertuples.rst new file mode 100644 index 00000000..3c3959df --- /dev/null +++ b/docs/sphinx/reference/api/eland.DataFrame.itertuples.rst @@ -0,0 +1,6 @@ +eland.DataFrame.itertuples +========================== + +.. currentmodule:: eland + +.. 
automethod:: DataFrame.itertuples diff --git a/docs/sphinx/reference/dataframe.rst b/docs/sphinx/reference/dataframe.rst index 3a5c24be..2faf9dd5 100644 --- a/docs/sphinx/reference/dataframe.rst +++ b/docs/sphinx/reference/dataframe.rst @@ -38,6 +38,8 @@ Indexing, Iteration DataFrame.get DataFrame.query DataFrame.sample + DataFrame.iterrows + DataFrame.itertuples Function Application, GroupBy & Window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/eland/dataframe.py b/eland/dataframe.py index 80352a6f..ce367102 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -19,7 +19,7 @@ import sys import warnings from io import StringIO -from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd # type: ignore @@ -1446,6 +1446,104 @@ def keys(self) -> pd.Index: """ return self.columns + def iterrows(self) -> Iterable[Tuple[Union[str, Tuple[str, ...]], pd.Series]]: + """ + Iterate over eland.DataFrame rows as (index, pandas.Series) pairs. + + Yields: + index: index + The index of the row. + data: pandas Series + The data of the row as a pandas Series. + + See Also + -------- + eland.DataFrame.itertuples: Iterate over eland.DataFrame rows as namedtuples. + + Examples + -------- + >>> df = ed.DataFrame('localhost:9200', 'flights') + >>> df.head() + AvgTicketPrice Cancelled ... dayOfWeek timestamp + 0 841.265642 False ... 0 2018-01-01 00:00:00 + 1 882.982662 False ... 0 2018-01-01 18:27:00 + 2 190.636904 False ... 0 2018-01-01 17:11:14 + 3 181.694216 True ... 0 2018-01-01 10:33:28 + 4 730.041778 False ... 0 2018-01-01 05:13:00 + + [5 rows x 27 columns] + + >>> for index, row in df.iterrows() + ... 
print(row) + + AvgTicketPrice 841.265642 + Cancelled False + dayOfWeek 0 + timestamp 2018-01-01 00:00:00 + Name: 0, dtype: object + + AvgTicketPrice 882.982662 + Cancelled False + dayOfWeek 0 + timestamp 2018-01-01 18:27:00 + Name: 1, dtype: object + """ + return self._query_compiler.iterrows() + + def itertuples( + self, index: bool = True, name: Union[str, None] = "Eland" + ) -> Iterable[Tuple[Any, ...]]: + """ + Iterate over eland.DataFrame rows as namedtuples. + + Args: + index: bool, default True + If True, return the index as the first element of the tuple. + name: str or None, default "Eland" + The name of the returned namedtuples or None to return regular tuples. + + Returns: + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + See Also + -------- + eland.DataFrame.iterrows: Iterate over eland.DataFrame rows as (index, pandas.Series) pairs. + + Examples + -------- + >>> df = ed.DataFrame('localhost:9200', 'flights') + >>> df.head() + AvgTicketPrice Cancelled ... dayOfWeek timestamp + 0 841.265642 False ... 0 2018-01-01 00:00:00 + 1 882.982662 False ... 0 2018-01-01 18:27:00 + 2 190.636904 False ... 0 2018-01-01 17:11:14 + 3 181.694216 True ... 0 2018-01-01 10:33:28 + 4 730.041778 False ... 0 2018-01-01 05:13:00 + + [5 rows x 27 columns] + + >>> for row in df.itertuples(): + ... print(row) + Eland(Index='0', AvgTicketPrice=841.265642, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 00:00:00') + Eland(Index='1', AvgTicketPrice=882.982662, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 18:27:00') + + By setting the `index` parameter to False we can remove the index as the first element of the tuple: + >>> for row in df.itertuples(index=False): + ... 
print(row) + Eland(AvgTicketPrice=841.265642, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 00:00:00') + Eland(AvgTicketPrice=882.982662, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 18:27:00') + + With the `name` parameter set we set a custom name for the yielded namedtuples: + >>> for row in df.itertuples(name='Flight'): + ... print(row) + Flight(Index='0', AvgTicketPrice=841.265642, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 00:00:00') + Flight(Index='1', AvgTicketPrice=882.982662, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 18:27:00') + """ + return self._query_compiler.itertuples(index=index, name=name) + def aggregate( self, func: Union[str, List[str]], diff --git a/eland/operations.py b/eland/operations.py index 48c33448..7fc494ca 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -23,6 +23,7 @@ Any, Dict, Generator, + Iterable, List, Optional, Sequence, @@ -1195,6 +1196,64 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] ) + def iterrows( + self, query_compiler: "QueryCompiler" + ) -> Iterable[Tuple[Union[str, Tuple[str, ...]], pd.Series]]: + query_params, post_processing = self._resolve_tasks(query_compiler) + result_size, sort_params = Operations._query_params_to_size_and_sort( + query_params + ) + + script_fields = query_params.script_fields + query = Query(query_params.query) + + body = query.to_search_body() + if script_fields is not None: + body["script_fields"] = script_fields + + # Only return requested field_names and add them to body + _source = query_compiler.get_field_names(include_scripted_fields=False) + body["_source"] = _source if _source else False + + if sort_params: + body["sort"] = [sort_params] + + for hits in _search_yield_hits( + query_compiler=query_compiler, body=body, max_number_of_hits=result_size + ): + df = query_compiler._es_results_to_pandas(hits) + df = 
self._apply_df_post_processing(df, post_processing) + yield from df.iterrows() + + def itertuples( + self, query_compiler: "QueryCompiler", index: bool, name: Union[str, None] + ) -> Iterable[Tuple[Any, ...]]: + query_params, post_processing = self._resolve_tasks(query_compiler) + result_size, sort_params = Operations._query_params_to_size_and_sort( + query_params + ) + + script_fields = query_params.script_fields + query = Query(query_params.query) + + body = query.to_search_body() + if script_fields is not None: + body["script_fields"] = script_fields + + # Only return requested field_names and add them to body + _source = query_compiler.get_field_names(include_scripted_fields=False) + body["_source"] = _source if _source else False + + if sort_params: + body["sort"] = [sort_params] + + for hits in _search_yield_hits( + query_compiler=query_compiler, body=body, max_number_of_hits=result_size + ): + df = query_compiler._es_results_to_pandas(hits) + df = self._apply_df_post_processing(df, post_processing) + yield from df.itertuples(index=index, name=name) + def to_pandas( self, query_compiler: "QueryCompiler", show_progress: bool = False ) -> pd.DataFrame: diff --git a/eland/query_compiler.py b/eland/query_compiler.py index 207d1c15..e06db4e6 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -21,6 +21,7 @@ TYPE_CHECKING, Any, Dict, + Iterable, List, Optional, Sequence, @@ -527,6 +528,38 @@ def to_csv(self, **kwargs) -> Optional[str]: """ return self._operations.to_csv(self, **kwargs) + def iterrows(self) -> Iterable[Tuple[Union[str, Tuple[str, ...]], pd.Series]]: + """ + Iterate over ed.DataFrame rows as (index, pd.Series) pairs. + + Yields: + index: index + The index of the row. + data: pandas Series + The data of the row as a pandas Series. + """ + return self._operations.iterrows(self) + + def itertuples( + self, index: bool, name: Union[str, None] + ) -> Iterable[Tuple[Any, ...]]: + """ + Iterate over eland.DataFrame rows as namedtuples. 
+ + Args: + index : bool, default True + If True, return the index as the first element of the tuple. + name : str or None, default "Eland" + The name of the returned namedtuples or None to return regular tuples. + + Returns: + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + """ + return self._operations.itertuples(self, index, name) + # __getitem__ methods def getitem_column_array(self, key, numeric=False): """Get column data for target labels. diff --git a/tests/dataframe/test_iterrows_itertuples_pytest.py b/tests/dataframe/test_iterrows_itertuples_pytest.py new file mode 100644 index 00000000..5a7ea689 --- /dev/null +++ b/tests/dataframe/test_iterrows_itertuples_pytest.py @@ -0,0 +1,45 @@ +# Licensed to Elasticsearch B.V. under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch B.V. licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# File called _pytest for PyCharm compatability + +from pandas.testing import assert_index_equal, assert_series_equal + +from tests.common import TestData + + +class TestDataFrameIterrowsItertuples(TestData): + def test_iterrows(self): + ed_flights_iterrows = self.ed_flights().iterrows() + pd_flights_iterrows = self.pd_flights().iterrows() + assert len(ed_flights_iterrows) == len(pd_flights_iterrows) + + for i in len(ed_flights_iterrows): + ed_index, ed_row = next(ed_flights_iterrows) + pd_index, pd_row = next(pd_flights_iterrows) + assert_index_equal(ed_index, pd_index) + assert_series_equal(ed_row, pd_row) + + def test_itertuples(self): + ed_flights_itertuples = self.ed_flights().itertuples(name=None) + pd_flights_itertuples = self.pd_flights().itertuples(name=None) + assert len(ed_flights_itertuples) == len(pd_flights_itertuples) + + for i in len(ed_flights_itertuples): + ed_row = next(ed_flights_itertuples) + pd_row = next(pd_flights_itertuples) + assert ed_row == pd_row From 4d3bde29abad659fa62da3820070bbc9092a7401 Mon Sep 17 00:00:00 2001 From: Jabin Kong <1059978534@qq.com> Date: Thu, 19 Aug 2021 14:29:24 +0800 Subject: [PATCH 02/11] Yielding from the `iterrows()` and `itertuples()` in dataframe.py --- eland/dataframe.py | 15 ++++--- eland/operations.py | 30 +++++++++++++ eland/query_compiler.py | 35 ++------------- .../test_iterrows_itertuples_pytest.py | 43 ++++++++++++++----- 4 files changed, 76 insertions(+), 47 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index ce367102..5b4875c8 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1450,7 +1450,8 @@ def iterrows(self) -> Iterable[Tuple[Union[str, Tuple[str, ...]], pd.Series]]: """ Iterate over eland.DataFrame rows as (index, pandas.Series) pairs. - Yields: + Yields + ------ index: index The index of the row. 
data: pandas Series @@ -1488,7 +1489,8 @@ def iterrows(self) -> Iterable[Tuple[Union[str, Tuple[str, ...]], pd.Series]]: timestamp 2018-01-01 18:27:00 Name: 1, dtype: object """ - return self._query_compiler.iterrows() + for df in self._query_compiler.yield_pandas_dataframe(): + yield from df.iterrows() def itertuples( self, index: bool = True, name: Union[str, None] = "Eland" @@ -1496,13 +1498,15 @@ def itertuples( """ Iterate over eland.DataFrame rows as namedtuples. - Args: + Args + ---- index: bool, default True If True, return the index as the first element of the tuple. name: str or None, default "Eland" The name of the returned namedtuples or None to return regular tuples. - Returns: + Returns + ------- iterator An object to iterate over namedtuples for each row in the DataFrame with the first field possibly being the index and @@ -1542,7 +1546,8 @@ def itertuples( Flight(Index='0', AvgTicketPrice=841.265642, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 00:00:00') Flight(Index='1', AvgTicketPrice=882.982662, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 18:27:00') """ - return self._query_compiler.itertuples(index=index, name=name) + for df in self._query_compiler.yield_pandas_dataframe(): + yield from df.itertuples(index=index, name=name) def aggregate( self, diff --git a/eland/operations.py b/eland/operations.py index 7fc494ca..784d31a4 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -1270,6 +1270,36 @@ def to_csv( df = self._es_results(query_compiler, show_progress) return df.to_csv(**kwargs) # type: ignore[no-any-return] + def search_yield_pandas_dataframe( + self, query_compiler: "QueryCompiler" + ) -> Generator["pd.DataFrame", None, None]: + query_params, post_processing = self._resolve_tasks(query_compiler) + + result_size, sort_params = Operations._query_params_to_size_and_sort( + query_params + ) + + script_fields = query_params.script_fields + query = Query(query_params.query) + + body = query.to_search_body() + 
if script_fields is not None: + body["script_fields"] = script_fields + + # Only return requested field_names and add them to body + _source = query_compiler.get_field_names(include_scripted_fields=False) + body["_source"] = _source if _source else False + + if sort_params: + body["sort"] = [sort_params] + + for hits in _search_yield_hits( + query_compiler=query_compiler, body=body, max_number_of_hits=result_size + ): + df = query_compiler._es_results_to_pandas(hits) + df = self._apply_df_post_processing(df, post_processing) + yield df + def _es_results( self, query_compiler: "QueryCompiler", show_progress: bool = False ) -> pd.DataFrame: diff --git a/eland/query_compiler.py b/eland/query_compiler.py index e06db4e6..79f2abae 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -21,7 +21,7 @@ TYPE_CHECKING, Any, Dict, - Iterable, + Generator, List, Optional, Sequence, @@ -528,37 +528,8 @@ def to_csv(self, **kwargs) -> Optional[str]: """ return self._operations.to_csv(self, **kwargs) - def iterrows(self) -> Iterable[Tuple[Union[str, Tuple[str, ...]], pd.Series]]: - """ - Iterate over ed.DataFrame rows as (index, pd.Series) pairs. - - Yields: - index: index - The index of the row. - data: pandas Series - The data of the row as a pandas Series. - """ - return self._operations.iterrows(self) - - def itertuples( - self, index: bool, name: Union[str, None] - ) -> Iterable[Tuple[Any, ...]]: - """ - Iterate over eland.DataFrame rows as namedtuples. - - Args: - index : bool, default True - If True, return the index as the first element of the tuple. - name : str or None, default "Eland" - The name of the returned namedtuples or None to return regular tuples. - - Returns: - iterator - An object to iterate over namedtuples for each row in the - DataFrame with the first field possibly being the index and - following fields being the column values. 
- """ - return self._operations.itertuples(self, index, name) + def yield_pandas_dataframe(self) -> Generator["pd.DataFrame", None, None]: + return self._operations.search_yield_pandas_dataframe(self) # __getitem__ methods def getitem_column_array(self, key, numeric=False): diff --git a/tests/dataframe/test_iterrows_itertuples_pytest.py b/tests/dataframe/test_iterrows_itertuples_pytest.py index 5a7ea689..2400417f 100644 --- a/tests/dataframe/test_iterrows_itertuples_pytest.py +++ b/tests/dataframe/test_iterrows_itertuples_pytest.py @@ -24,22 +24,45 @@ class TestDataFrameIterrowsItertuples(TestData): def test_iterrows(self): - ed_flights_iterrows = self.ed_flights().iterrows() - pd_flights_iterrows = self.pd_flights().iterrows() - assert len(ed_flights_iterrows) == len(pd_flights_iterrows) + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + ed_flights_iterrows = ed_flights.iterrows() + pd_flights_iterrows = pd_flights.iterrows() + + assert len(list(ed_flights_iterrows)) == len(list(pd_flights_iterrows)) + + for ed_index, ed_row in ed_flights_iterrows: - for i in len(ed_flights_iterrows): - ed_index, ed_row = next(ed_flights_iterrows) pd_index, pd_row = next(pd_flights_iterrows) + assert_index_equal(ed_index, pd_index) assert_series_equal(ed_row, pd_row) + for pd_index, pd_row in pd_flights_iterrows: + + ed_index, ed_row = next(ed_flights_iterrows) + + assert_index_equal(pd_index, ed_index) + assert_series_equal(pd_row, ed_row) + def test_itertuples(self): - ed_flights_itertuples = self.ed_flights().itertuples(name=None) - pd_flights_itertuples = self.pd_flights().itertuples(name=None) - assert len(ed_flights_itertuples) == len(pd_flights_itertuples) + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + ed_flights_itertuples = ed_flights.itertuples(name=None) + pd_flights_itertuples = pd_flights.itertuples(name=None) + + assert len(list(ed_flights_itertuples)) == len(list(pd_flights_itertuples)) + + for ed_row in 
ed_flights_itertuples: - for i in len(ed_flights_itertuples): - ed_row = next(ed_flights_itertuples) pd_row = next(pd_flights_itertuples) + assert ed_row == pd_row + + for pd_row in pd_flights_itertuples: + + ed_row = next(ed_flights_itertuples) + + assert pd_row == ed_row From 9a0a9da033e4bdb66ca58d2e0c37b705cd846b02 Mon Sep 17 00:00:00 2001 From: Jabin Kong <1059978534@qq.com> Date: Thu, 19 Aug 2021 14:37:27 +0800 Subject: [PATCH 03/11] Remove Operations.iterrows and Operations.itertuples --- eland/operations.py | 58 --------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/eland/operations.py b/eland/operations.py index 784d31a4..8faddbe2 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -1196,64 +1196,6 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] ) - def iterrows( - self, query_compiler: "QueryCompiler" - ) -> Iterable[Tuple[Union[str, Tuple[str, ...]], pd.Series]]: - query_params, post_processing = self._resolve_tasks(query_compiler) - result_size, sort_params = Operations._query_params_to_size_and_sort( - query_params - ) - - script_fields = query_params.script_fields - query = Query(query_params.query) - - body = query.to_search_body() - if script_fields is not None: - body["script_fields"] = script_fields - - # Only return requested field_names and add them to body - _source = query_compiler.get_field_names(include_scripted_fields=False) - body["_source"] = _source if _source else False - - if sort_params: - body["sort"] = [sort_params] - - for hits in _search_yield_hits( - query_compiler=query_compiler, body=body, max_number_of_hits=result_size - ): - df = query_compiler._es_results_to_pandas(hits) - df = self._apply_df_post_processing(df, post_processing) - yield from df.iterrows() - - def itertuples( - self, query_compiler: "QueryCompiler", index: bool, name: Union[str, None] - ) -> Iterable[Tuple[Any, ...]]: - 
query_params, post_processing = self._resolve_tasks(query_compiler) - result_size, sort_params = Operations._query_params_to_size_and_sort( - query_params - ) - - script_fields = query_params.script_fields - query = Query(query_params.query) - - body = query.to_search_body() - if script_fields is not None: - body["script_fields"] = script_fields - - # Only return requested field_names and add them to body - _source = query_compiler.get_field_names(include_scripted_fields=False) - body["_source"] = _source if _source else False - - if sort_params: - body["sort"] = [sort_params] - - for hits in _search_yield_hits( - query_compiler=query_compiler, body=body, max_number_of_hits=result_size - ): - df = query_compiler._es_results_to_pandas(hits) - df = self._apply_df_post_processing(df, post_processing) - yield from df.itertuples(index=index, name=name) - def to_pandas( self, query_compiler: "QueryCompiler", show_progress: bool = False ) -> pd.DataFrame: From 37c3e31dc0e9fa6fb68cb561c6b03337d011a398 Mon Sep 17 00:00:00 2001 From: Jabin Kong <1059978534@qq.com> Date: Thu, 19 Aug 2021 14:46:31 +0800 Subject: [PATCH 04/11] Remove unused import --- eland/operations.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eland/operations.py b/eland/operations.py index 8faddbe2..29d66bef 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -23,7 +23,6 @@ Any, Dict, Generator, - Iterable, List, Optional, Sequence, From 24618113a7645cd73e90c2ebda7d69723a502b31 Mon Sep 17 00:00:00 2001 From: Jabin Kong <1059978534@qq.com> Date: Fri, 20 Aug 2021 11:48:02 +0800 Subject: [PATCH 05/11] Format docstrings --- eland/dataframe.py | 82 ++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index 5b4875c8..32093991 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1463,31 +1463,34 @@ def iterrows(self) -> Iterable[Tuple[Union[str, Tuple[str, ...]], pd.Series]]: Examples -------- - 
>>> df = ed.DataFrame('localhost:9200', 'flights') - >>> df.head() - AvgTicketPrice Cancelled ... dayOfWeek timestamp - 0 841.265642 False ... 0 2018-01-01 00:00:00 - 1 882.982662 False ... 0 2018-01-01 18:27:00 - 2 190.636904 False ... 0 2018-01-01 17:11:14 - 3 181.694216 True ... 0 2018-01-01 10:33:28 - 4 730.041778 False ... 0 2018-01-01 05:13:00 + >>> df = ed.DataFrame('localhost:9200', 'flights', columns=['AvgTicketPrice', 'Cancelled']).head() + >>> df + AvgTicketPrice Cancelled + 0 841.265642 False + 1 882.982662 False + 2 190.636904 False + 3 181.694216 True + 4 730.041778 False - [5 rows x 27 columns] + [5 rows x 2 columns] - >>> for index, row in df.iterrows() + >>> for index, row in df.iterrows(): ... print(row) - - AvgTicketPrice 841.265642 - Cancelled False - dayOfWeek 0 - timestamp 2018-01-01 00:00:00 + AvgTicketPrice 841.265642 + Cancelled False Name: 0, dtype: object - - AvgTicketPrice 882.982662 - Cancelled False - dayOfWeek 0 - timestamp 2018-01-01 18:27:00 + AvgTicketPrice 882.982662 + Cancelled False Name: 1, dtype: object + AvgTicketPrice 190.636904 + Cancelled False + Name: 2, dtype: object + AvgTicketPrice 181.694216 + Cancelled True + Name: 3, dtype: object + AvgTicketPrice 730.041778 + Cancelled False + Name: 4, dtype: object """ for df in self._query_compiler.yield_pandas_dataframe(): yield from df.iterrows() @@ -1518,33 +1521,42 @@ def itertuples( Examples -------- - >>> df = ed.DataFrame('localhost:9200', 'flights') - >>> df.head() - AvgTicketPrice Cancelled ... dayOfWeek timestamp - 0 841.265642 False ... 0 2018-01-01 00:00:00 - 1 882.982662 False ... 0 2018-01-01 18:27:00 - 2 190.636904 False ... 0 2018-01-01 17:11:14 - 3 181.694216 True ... 0 2018-01-01 10:33:28 - 4 730.041778 False ... 
0 2018-01-01 05:13:00 + >>> df = ed.DataFrame('localhost:9200', 'flights', columns=['AvgTicketPrice', 'Cancelled']).head() + >>> df + AvgTicketPrice Cancelled + 0 841.265642 False + 1 882.982662 False + 2 190.636904 False + 3 181.694216 True + 4 730.041778 False - [5 rows x 27 columns] + [5 rows x 2 columns] >>> for row in df.itertuples(): ... print(row) - Eland(Index='0', AvgTicketPrice=841.265642, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 00:00:00') - Eland(Index='1', AvgTicketPrice=882.982662, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 18:27:00') + Eland(Index='0', AvgTicketPrice=841.2656419677076, Cancelled=False) + Eland(Index='1', AvgTicketPrice=882.9826615595518, Cancelled=False) + Eland(Index='2', AvgTicketPrice=190.6369038508356, Cancelled=False) + Eland(Index='3', AvgTicketPrice=181.69421554118, Cancelled=True) + Eland(Index='4', AvgTicketPrice=730.041778346198, Cancelled=False) By setting the `index` parameter to False we can remove the index as the first element of the tuple: >>> for row in df.itertuples(index=False): ... print(row) - Eland(AvgTicketPrice=841.265642, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 00:00:00') - Eland(AvgTicketPrice=882.982662, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 18:27:00') + Eland(AvgTicketPrice=841.2656419677076, Cancelled=False) + Eland(AvgTicketPrice=882.9826615595518, Cancelled=False) + Eland(AvgTicketPrice=190.6369038508356, Cancelled=False) + Eland(AvgTicketPrice=181.69421554118, Cancelled=True) + Eland(AvgTicketPrice=730.041778346198, Cancelled=False) With the `name` parameter set we set a custom name for the yielded namedtuples: >>> for row in df.itertuples(name='Flight'): ... 
print(row) - Flight(Index='0', AvgTicketPrice=841.265642, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 00:00:00') - Flight(Index='1', AvgTicketPrice=882.982662, Cancelled=False, ..., dayOfWeek=0, timestamp='2018-01-01 18:27:00') + Flight(Index='0', AvgTicketPrice=841.2656419677076, Cancelled=False) + Flight(Index='1', AvgTicketPrice=882.9826615595518, Cancelled=False) + Flight(Index='2', AvgTicketPrice=190.6369038508356, Cancelled=False) + Flight(Index='3', AvgTicketPrice=181.69421554118, Cancelled=True) + Flight(Index='4', AvgTicketPrice=730.041778346198, Cancelled=False) """ for df in self._query_compiler.yield_pandas_dataframe(): yield from df.itertuples(index=index, name=name) From 408f01a9cc4c9c98f446327573cb3c6ac2b2911f Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Fri, 20 Aug 2021 07:38:32 -0500 Subject: [PATCH 06/11] Rename to search_yield_pandas_dataframes() --- eland/operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eland/operations.py b/eland/operations.py index 29d66bef..0fad8ef2 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -1211,7 +1211,7 @@ def to_csv( df = self._es_results(query_compiler, show_progress) return df.to_csv(**kwargs) # type: ignore[no-any-return] - def search_yield_pandas_dataframe( + def search_yield_pandas_dataframes( self, query_compiler: "QueryCompiler" ) -> Generator["pd.DataFrame", None, None]: query_params, post_processing = self._resolve_tasks(query_compiler) From 8169879c8fa7163f1095d1982aae3f70a19d3bb2 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Fri, 20 Aug 2021 07:39:19 -0500 Subject: [PATCH 07/11] Rename to search_yield_pandas_dataframes() --- eland/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index 32093991..d579379c 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1492,7 +1492,7 @@ def iterrows(self) -> Iterable[Tuple[Union[str, Tuple[str, ...]], 
pd.Series]]: Cancelled False Name: 4, dtype: object """ - for df in self._query_compiler.yield_pandas_dataframe(): + for df in self._query_compiler.search_yield_pandas_dataframes(): yield from df.iterrows() def itertuples( @@ -1558,7 +1558,7 @@ def itertuples( Flight(Index='3', AvgTicketPrice=181.69421554118, Cancelled=True) Flight(Index='4', AvgTicketPrice=730.041778346198, Cancelled=False) """ - for df in self._query_compiler.yield_pandas_dataframe(): + for df in self._query_compiler.search_yield_pandas_dataframes(): yield from df.itertuples(index=index, name=name) def aggregate( From 3bee54c72da04ed0a1ee1b0afe0ab5942c081efa Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Fri, 20 Aug 2021 07:39:57 -0500 Subject: [PATCH 08/11] Rename to search_yield_pandas_dataframes() --- eland/query_compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eland/query_compiler.py b/eland/query_compiler.py index 79f2abae..4a818049 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -528,8 +528,8 @@ def to_csv(self, **kwargs) -> Optional[str]: """ return self._operations.to_csv(self, **kwargs) - def yield_pandas_dataframe(self) -> Generator["pd.DataFrame", None, None]: - return self._operations.search_yield_pandas_dataframe(self) + def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]: + return self._operations.search_yield_pandas_dataframes(self) # __getitem__ methods def getitem_column_array(self, key, numeric=False): From be0ce73671f0438f60d2084992a82cd036594218 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Fri, 20 Aug 2021 07:46:41 -0500 Subject: [PATCH 09/11] Update test_iterrows_itertuples_pytest.py --- .../test_iterrows_itertuples_pytest.py | 32 +++++-------------- 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/tests/dataframe/test_iterrows_itertuples_pytest.py b/tests/dataframe/test_iterrows_itertuples_pytest.py index 2400417f..90523f9a 100644 --- 
a/tests/dataframe/test_iterrows_itertuples_pytest.py +++ b/tests/dataframe/test_iterrows_itertuples_pytest.py @@ -30,39 +30,23 @@ def test_iterrows(self): ed_flights_iterrows = ed_flights.iterrows() pd_flights_iterrows = pd_flights.iterrows() - assert len(list(ed_flights_iterrows)) == len(list(pd_flights_iterrows)) - for ed_index, ed_row in ed_flights_iterrows: - pd_index, pd_row = next(pd_flights_iterrows) assert_index_equal(ed_index, pd_index) assert_series_equal(ed_row, pd_row) - for pd_index, pd_row in pd_flights_iterrows: - - ed_index, ed_row = next(ed_flights_iterrows) - - assert_index_equal(pd_index, ed_index) - assert_series_equal(pd_row, ed_row) + # Assert that both are the same length and are exhausted. + with pytest.raises(StopIteration): + next(ed_flights_iterrows) + with pytest.raises(StopIteration): + next(pd_flights_iterrows) def test_itertuples(self): ed_flights = self.ed_flights() pd_flights = self.pd_flights() - ed_flights_itertuples = ed_flights.itertuples(name=None) - pd_flights_itertuples = pd_flights.itertuples(name=None) - - assert len(list(ed_flights_itertuples)) == len(list(pd_flights_itertuples)) - - for ed_row in ed_flights_itertuples: - - pd_row = next(pd_flights_itertuples) - - assert ed_row == pd_row - - for pd_row in pd_flights_itertuples: - - ed_row = next(ed_flights_itertuples) + ed_flights_itertuples = list(ed_flights.itertuples(name=None)) + pd_flights_itertuples = list(pd_flights.itertuples(name=None)) - assert pd_row == ed_row + assert ed_flights_itertuples == pd_flights_itertuples From 3445ca8733481df0d87c22ecc0b608ec1c13d151 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Fri, 20 Aug 2021 07:49:22 -0500 Subject: [PATCH 10/11] Add pytest import --- tests/dataframe/test_iterrows_itertuples_pytest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/dataframe/test_iterrows_itertuples_pytest.py b/tests/dataframe/test_iterrows_itertuples_pytest.py index 90523f9a..5021dd0e 100644 --- 
a/tests/dataframe/test_iterrows_itertuples_pytest.py +++ b/tests/dataframe/test_iterrows_itertuples_pytest.py @@ -17,6 +17,7 @@ # File called _pytest for PyCharm compatability +import pytest from pandas.testing import assert_index_equal, assert_series_equal from tests.common import TestData From 152e0b65415538293d6ef62eeb24d47eef07a27c Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Fri, 20 Aug 2021 08:20:25 -0500 Subject: [PATCH 11/11] Fix test cases to use proper assertions --- .../test_iterrows_itertuples_pytest.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/dataframe/test_iterrows_itertuples_pytest.py b/tests/dataframe/test_iterrows_itertuples_pytest.py index 5021dd0e..9dc495e9 100644 --- a/tests/dataframe/test_iterrows_itertuples_pytest.py +++ b/tests/dataframe/test_iterrows_itertuples_pytest.py @@ -18,7 +18,7 @@ # File called _pytest for PyCharm compatability import pytest -from pandas.testing import assert_index_equal, assert_series_equal +from pandas.testing import assert_series_equal from tests.common import TestData @@ -34,7 +34,7 @@ def test_iterrows(self): for ed_index, ed_row in ed_flights_iterrows: pd_index, pd_row = next(pd_flights_iterrows) - assert_index_equal(ed_index, pd_index) + assert ed_index == pd_index assert_series_equal(ed_row, pd_row) # Assert that both are the same length and are exhausted. @@ -50,4 +50,15 @@ def test_itertuples(self): ed_flights_itertuples = list(ed_flights.itertuples(name=None)) pd_flights_itertuples = list(pd_flights.itertuples(name=None)) - assert ed_flights_itertuples == pd_flights_itertuples + def assert_tuples_almost_equal(left, right): + # Shim which uses pytest.approx() for floating point values inside tuples. + assert len(left) == len(right) + assert all( + (lt == rt) # Not floats? 
Use == + if not isinstance(lt, float) and not isinstance(rt, float) + else (lt == pytest.approx(rt)) # If both are floats use pytest.approx() + for lt, rt in zip(left, right) + ) + + for ed_tuple, pd_tuple in zip(ed_flights_itertuples, pd_flights_itertuples): + assert_tuples_almost_equal(ed_tuple, pd_tuple)