Add support for es_match() to DataFrame and Series

elastic · Oct 29, 2020 · cb4cd08 · cb4cd08
1 parent 92a8040
commit cb4cd08
Show file tree

Hide file tree

Showing 11 changed files with 537 additions and 20 deletions.
diff --git a/docs/sphinx/reference/api/eland.DataFrame.es_match.rst b/docs/sphinx/reference/api/eland.DataFrame.es_match.rst
@@ -0,0 +1,6 @@
+eland.DataFrame.es_match
+========================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.es_match
diff --git a/docs/sphinx/reference/api/eland.Series.es_match.rst b/docs/sphinx/reference/api/eland.Series.es_match.rst
@@ -0,0 +1,6 @@
+eland.Series.es_match
+=====================
+
+.. currentmodule:: eland
+
+.. automethod:: Series.es_match
diff --git a/docs/sphinx/reference/dataframe.rst b/docs/sphinx/reference/dataframe.rst
@@ -111,6 +111,7 @@ Elasticsearch Functions
    :toctree: api/
 
    DataFrame.es_info
+   DataFrame.es_match
    DataFrame.es_query
    DataFrame.es_dtypes
 

diff --git a/docs/sphinx/reference/series.rst b/docs/sphinx/reference/series.rst
@@ -115,5 +115,6 @@ Elasticsearch Functions
    :toctree: api/
 
    Series.es_info
+   Series.es_match
    Series.es_dtype
    Series.es_dtypes
diff --git a/eland/dataframe.py b/eland/dataframe.py
@@ -19,7 +19,7 @@
 import sys
 import warnings
 from io import StringIO
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Any, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -632,6 +632,103 @@ def es_info(self):
     def info_es(self):
         return self.es_info()
 
+    def es_match(
+        self,
+        text: str,
+        *,
+        columns: Optional[Union[str, Sequence[str]]] = None,
+        match_phrase: bool = False,
+        must_not_match: bool = False,
+        multi_match_type: Optional[str] = None,
+        match_only_text_fields: bool = True,
+        analyzer: Optional[str] = None,
+        fuzziness: Optional[Union[int, str]] = None,
+        **kwargs: Any,
+    ) -> "DataFrame":
+        """Filters data with an Elasticsearch ``match``, ``match_phrase``, or
+        ``multi_match`` query depending on the given parameters and columns.
+
+        Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_
+
+        By default all fields of type 'text' within Elasticsearch are queried.
+        Specific columns can be selected via the ``columns`` parameter, or a
+        single column can be filtered on with :py:meth:`eland.Series.es_match`
+
+        All additional keyword arguments are passed in the body of the match query.
+
+        Parameters
+        ----------
+        text: str
+            String of text to search for
+        columns: str, Sequence[str], optional
+            Column or columns to search over. Defaults to all 'text' fields in Elasticsearch
+        match_phrase: bool, default False
+            If True will use ``match_phrase`` instead of ``match`` query which takes into account
+            the order of the ``text`` parameter.
+        must_not_match: bool, default False
+            If True will apply a boolean NOT (~) to the
+            query. Instead of requiring a match the query
+            will require text to not match.
+        multi_match_type: str, optional
+            If given and matching against multiple columns will set the ``multi_match.type`` setting
+        match_only_text_fields: bool, default True
+            When True this function will raise an error if any non-text fields
+            are queried, because fields that aren't analyzed may not match as expected.
+            Set to False to skip this preventative check.
+        analyzer: str, optional
+            Specify which analyzer to use for the match query
+        fuzziness: int, str, optional
+            Specify the fuzziness option for the match query
+        **kwargs
+            Additional options to place in the body of the match query
+
+        Returns
+        -------
+        DataFrame
+            A filtered :py:class:`eland.DataFrame` with the given match query
+
+        Raises
+        ------
+        ValueError
+            If there are no columns to search, if a non-text column is queried
+            while ``match_only_text_fields=True``, if ``multi_match_type`` is given
+            with a single column, or if ``match_phrase=True`` is combined with a
+            ``multi_match_type`` other than ``'phrase'``.
+
+        Examples
+        --------
+        >>> df = ed.DataFrame("localhost:9200", "ecommerce")
+        >>> df.es_match("Men's", columns=["category"])
+                                                      category currency  ...   type     user
+        0                                     [Men's Clothing]      EUR  ...  order    eddie
+        4                  [Men's Clothing, Men's Accessories]      EUR  ...  order    eddie
+        6                                     [Men's Clothing]      EUR  ...  order   oliver
+        7     [Men's Clothing, Men's Accessories, Men's Shoes]      EUR  ...  order      abd
+        11                 [Men's Accessories, Men's Clothing]      EUR  ...  order    eddie
+        ...                                                ...      ...  ...    ...      ...
+        4663                     [Men's Shoes, Men's Clothing]      EUR  ...  order    samir
+        4667                     [Men's Clothing, Men's Shoes]      EUR  ...  order   sultan
+        4671                                  [Men's Clothing]      EUR  ...  order      jim
+        4672                                  [Men's Clothing]      EUR  ...  order    yahya
+        4674             [Women's Accessories, Men's Clothing]      EUR  ...  order  jackson
+        <BLANKLINE>
+        [2310 rows x 45 columns]
+        """
+        # Determine which columns will be used
+        if columns is None:
+            # The field mapping is only needed when the caller didn't pick columns
+            columns = [
+                column
+                for column, es_dtype in self.es_dtypes.to_dict().items()
+                if es_dtype == "text"
+            ]
+        elif isinstance(columns, str):
+            columns = [columns]
+        else:
+            columns = list(columns)
+
+        qc = self._query_compiler
+        # Named 'query_filter' so the builtin filter() isn't shadowed
+        query_filter = qc.es_match(
+            text,
+            columns,
+            match_phrase=match_phrase,
+            match_only_text_fields=match_only_text_fields,
+            multi_match_type=multi_match_type,
+            analyzer=analyzer,
+            fuzziness=fuzziness,
+            **kwargs,
+        )
+        if must_not_match:
+            query_filter = ~query_filter
+        return DataFrame(_query_compiler=qc._update_query(query_filter))
+
     def es_query(self, query) -> "DataFrame":
         """Applies an Elasticsearch DSL query to the current DataFrame.
 

diff --git a/eland/operations.py b/eland/operations.py
@@ -995,24 +995,12 @@ def _es_results(self, query_compiler, collector):
         is_scan = False
         if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
             if size > 0:
-                try:
-
-                    es_results = query_compiler._client.search(
-                        index=query_compiler._index_pattern,
-                        size=size,
-                        sort=sort_params,
-                        body=body,
-                    )
-                except Exception:
-                    # Catch all ES errors and print debug (currently to stdout)
-                    error = {
-                        "index": query_compiler._index_pattern,
-                        "size": size,
-                        "sort": sort_params,
-                        "body": body,
-                    }
-                    print("Elasticsearch error:", error)
-                    raise
+                es_results = query_compiler._client.search(
+                    index=query_compiler._index_pattern,
+                    size=size,
+                    sort=sort_params,
+                    body=body,
+                )
         else:
             is_scan = True
             es_results = scan(

diff --git a/eland/query_compiler.py b/eland/query_compiler.py
@@ -17,7 +17,7 @@
 
 import copy
 from datetime import datetime
-from typing import TYPE_CHECKING, List, Optional, Sequence
+from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Union
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@@ -430,6 +430,77 @@ def sample(self, n=None, frac=None, random_state=None):
 
         return result
 
+    def es_match(
+        self,
+        text: str,
+        columns: Sequence[str],
+        *,
+        match_phrase: bool = False,
+        match_only_text_fields: bool = True,
+        multi_match_type: Optional[str] = None,
+        analyzer: Optional[str] = None,
+        fuzziness: Optional[Union[int, str]] = None,
+        **kwargs: Any,
+    ) -> QueryFilter:
+        """Builds the filter for an Elasticsearch full-text match query.
+
+        A single column produces a ``match`` (or ``match_phrase``) query,
+        multiple columns produce a ``multi_match`` query over ``fields``.
+
+        Parameters
+        ----------
+        text: str
+            String of text to search for
+        columns: Sequence[str]
+            Columns to search over, must not be empty. Names containing
+            ``*`` are treated as wildcards and skip the text-field check.
+        match_phrase: bool, default False
+            Use ``match_phrase`` for a single column, or ``multi_match``
+            with ``type='phrase'`` for multiple columns.
+        match_only_text_fields: bool, default True
+            When True raise an error if any non-wildcard column isn't of
+            type 'text'. When False the check is skipped and the
+            ``lenient`` option defaults to True unless given in ``kwargs``.
+        multi_match_type: str, optional
+            Value for the ``multi_match.type`` option, only valid when
+            searching more than one column
+        analyzer: str, optional
+            Analyzer option for the query
+        fuzziness: int, str, optional
+            Fuzziness option for the query
+        **kwargs
+            Additional options merged into the body of the query
+
+        Returns
+        -------
+        QueryFilter
+            Filter wrapping the generated query
+
+        Raises
+        ------
+        ValueError
+            If ``columns`` is empty, if a non-text column is queried while
+            ``match_only_text_fields=True``, if ``multi_match_type`` is given
+            with a single column, or if ``match_phrase=True`` is combined
+            with a ``multi_match_type`` other than ``'phrase'``.
+        """
+        if len(columns) < 1:
+            raise ValueError("columns can't be empty")
+
+        # Build the base options for the 'match_*' query
+        options = {"query": text}
+        if analyzer is not None:
+            options["analyzer"] = analyzer
+        if fuzziness is not None:
+            options["fuzziness"] = fuzziness
+        options.update(kwargs)
+
+        if match_only_text_fields:
+            # Refuse to query non-text columns, wildcards can't be checked
+            es_dtypes = self.es_dtypes.to_dict()
+            non_text_columns = {
+                column: es_dtypes[column]
+                for column in columns
+                if "*" not in column and es_dtypes[column] != "text"
+            }
+            if non_text_columns:
+                described = ", ".join(
+                    [k + "=" + v for k, v in non_text_columns.items()]
+                )
+                raise ValueError(
+                    f"Attempting to run es_match() on non-text fields "
+                    f"({described}) "
+                    f"means that these fields may not be analyzed properly. "
+                    f"Consider reindexing these fields as text or use 'match_only_text_fields=False' "
+                    f"to use match anyways"
+                )
+        else:
+            # Non-text fields may be queried, so let Elasticsearch ignore
+            # format-based errors unless the caller set 'lenient' explicitly
+            options.setdefault("lenient", True)
+
+        # If only one column use 'match'
+        # otherwise use 'multi_match' with 'fields'
+        if len(columns) == 1:
+            if multi_match_type is not None:
+                raise ValueError(
+                    "multi_match_type parameter only valid "
+                    "when searching more than one column"
+                )
+            query = {"match_phrase" if match_phrase else "match": {columns[0]: options}}
+        else:
+            options["fields"] = columns
+            if match_phrase:
+                if multi_match_type not in ("phrase", None):
+                    raise ValueError(
+                        f"match_phrase=True and multi_match_type={multi_match_type!r} "
+                        f"are not compatible. Must be multi_match_type='phrase'"
+                    )
+                multi_match_type = "phrase"
+            if multi_match_type is not None:
+                options["type"] = multi_match_type
+
+            query = {"multi_match": options}
+        return QueryFilter(query)
+
     def es_query(self, query):
         return self._update_query(QueryFilter(query))
 

diff --git a/eland/series.py b/eland/series.py
@@ -55,6 +55,7 @@
     LessEqual,
     NotFilter,
     NotNull,
+    QueryFilter,
     ScriptFilter,
 )
 from eland.ndframe import NDFrame
@@ -636,6 +637,74 @@ def filter(
         )
         return Series(_query_compiler=new_query_compiler)
 
+    def es_match(
+        self,
+        text: str,
+        *,
+        match_phrase: bool = False,
+        match_only_text_fields: bool = True,
+        analyzer: Optional[str] = None,
+        fuzziness: Optional[Union[int, str]] = None,
+        **kwargs: Any,
+    ) -> QueryFilter:
+        """Builds an Elasticsearch ``match`` or ``match_phrase`` filter
+        for this column, depending on the given parameters.
+
+        Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_
+
+        Any extra keyword arguments are placed in the body of the match query.
+
+        Parameters
+        ----------
+        text: str
+            The text to search for
+        match_phrase: bool, default False
+            When True a ``match_phrase`` query is used instead of ``match``,
+            so the order of the words in ``text`` is taken into account.
+        match_only_text_fields: bool, default True
+            When True an error is raised if the column isn't a text field,
+            because fields that aren't analyzed may not match as expected.
+            Set to False to skip this preventative check.
+        analyzer: str, optional
+            Analyzer to use for the match query
+        fuzziness: int, str, optional
+            Fuzziness option for the match query
+        **kwargs
+            Additional options for the body of the match query
+
+        Returns
+        -------
+        QueryFilter
+            Boolean filter that can be combined with other filters
+            and then passed to DataFrame[...].
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...   "localhost:9200", "ecommerce",
+        ...   columns=["category", "taxful_total_price"]
+        ... )
+        >>> df[
+        ...     df.category.es_match("Men's")
+        ...     & (df.taxful_total_price > 200.0)
+        ... ].head(5)
+                                       category  taxful_total_price
+        13                     [Men's Clothing]              266.96
+        33                     [Men's Clothing]              221.98
+        54                     [Men's Clothing]              234.98
+        93   [Men's Shoes, Women's Accessories]              239.98
+        273                       [Men's Shoes]              214.98
+        <BLANKLINE>
+        [5 rows x 2 columns]
+        """
+        # A Series maps to exactly one column, so search only that one
+        only_column = [self.name]
+        compiler = self._query_compiler
+        return compiler.es_match(
+            text,
+            columns=only_column,
+            match_phrase=match_phrase,
+            match_only_text_fields=match_only_text_fields,
+            analyzer=analyzer,
+            fuzziness=fuzziness,
+            **kwargs,
+        )
+
     def es_info(self) -> str:
         buf = StringIO()
 

diff --git a/eland/tests/dataframe/test_es_match_pytest.py b/eland/tests/dataframe/test_es_match_pytest.py
@@ -0,0 +1,41 @@
+#  Licensed to Elasticsearch B.V. under one or more contributor
+#  license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright
+#  ownership. Elasticsearch B.V. licenses this file to you under
+#  the Apache License, Version 2.0 (the "License"); you may
+#  not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+# 	http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+# File called _pytest for PyCharm compatibility
+
+from eland.tests.common import TestData
+
+
+class TestEsMatch(TestData):
+    """Checks DataFrame.es_match() on the frame returned by ed_ecommerce()."""
+
+    def test_match(self):
+        # No columns are given, so every 'text' field is searched
+        matched = self.ed_ecommerce().es_match("Men's")
+        categories = list(matched.category.to_pandas())
+
+        assert len(categories) > 0
+        # Every returned row needs at least one category mentioning "Men's"
+        for row in categories:
+            assert any("Men's" in category for category in row)
+
+    def test_must_not_match(self):
+        ecommerce = self.ed_ecommerce()
+
+        # Chain a negated match with a regular match
+        without_mens = ecommerce.es_match("Men's", must_not_match=True)
+        womens_only = without_mens.es_match("Women's")
+        categories = list(womens_only.category.to_pandas())
+
+        assert len(categories) > 0
+        for row in categories:
+            assert all("Men's" not in category for category in row)
+        for row in categories:
+            assert any("Women's" in category for category in row)