Skip to content

Commit

Permalink
Add support for es_match() to DataFrame and Series
Browse files Browse the repository at this point in the history
  • Loading branch information
sethmlarson committed Oct 29, 2020
1 parent 92a8040 commit cb4cd08
Show file tree
Hide file tree
Showing 11 changed files with 537 additions and 20 deletions.
6 changes: 6 additions & 0 deletions docs/sphinx/reference/api/eland.DataFrame.es_match.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
eland.DataFrame.es_match
========================

.. currentmodule:: eland

.. automethod:: DataFrame.es_match
6 changes: 6 additions & 0 deletions docs/sphinx/reference/api/eland.Series.es_match.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
eland.Series.es_match
=====================

.. currentmodule:: eland

.. automethod:: Series.es_match
1 change: 1 addition & 0 deletions docs/sphinx/reference/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ Elasticsearch Functions
:toctree: api/

DataFrame.es_info
DataFrame.es_match
DataFrame.es_query
DataFrame.es_dtypes

Expand Down
1 change: 1 addition & 0 deletions docs/sphinx/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,5 +115,6 @@ Elasticsearch Functions
:toctree: api/

Series.es_info
Series.es_match
Series.es_dtype
Series.es_dtypes
99 changes: 98 additions & 1 deletion eland/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import sys
import warnings
from io import StringIO
from typing import List, Optional, Sequence, Tuple, Union
from typing import Any, List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -632,6 +632,103 @@ def es_info(self):
def info_es(self):
    """Alias that returns the result of :py:meth:`es_info` unchanged."""
    return self.es_info()

def es_match(
    self,
    text: str,
    *,
    columns: Optional[Union[str, Sequence[str]]] = None,
    match_phrase: bool = False,
    must_not_match: bool = False,
    multi_match_type: Optional[str] = None,
    match_only_text_fields: bool = True,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> "DataFrame":
    """Filters data with an Elasticsearch ``match``, ``match_phrase``, or
    ``multi_match`` query depending on the given parameters and columns.

    Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_

    By default all fields of type 'text' within Elasticsearch are queried,
    otherwise specific columns can be specified via the ``columns`` parameter
    or a single column can be filtered on with :py:meth:`eland.Series.es_match`

    All additional keyword arguments are passed in the body of the match query.

    Parameters
    ----------
    text: str
        String of text to search for
    columns: str, List[str], optional
        Column or list of columns to search over.
        Defaults to all 'text' fields in Elasticsearch
    match_phrase: bool, default False
        If True will use ``match_phrase`` instead of ``match`` query which
        takes into account the order of the ``text`` parameter.
    must_not_match: bool, default False
        If True will apply a boolean NOT (~) to the
        query. Instead of requiring a match the query
        will require text to not match.
    multi_match_type: str, optional
        If given and matching against multiple columns will set the
        ``multi_match.type`` setting
    match_only_text_fields: bool, default True
        When True this function will raise an error if any non-text fields
        are queried, because fields that aren't analyzed may not match as
        expected. Set to False to skip this preventative check.
    analyzer: str, optional
        Specify which analyzer to use for the match query
    fuzziness: int, str, optional
        Specify the fuzziness option for the match query

    Returns
    -------
    DataFrame
        A filtered :py:class:`eland.DataFrame` with the given match query

    Examples
    --------
    >>> df = ed.DataFrame("localhost:9200", "ecommerce")
    >>> df.es_match("Men's", columns=["category"])
    category currency ... type user
    0 [Men's Clothing] EUR ... order eddie
    4 [Men's Clothing, Men's Accessories] EUR ... order eddie
    6 [Men's Clothing] EUR ... order oliver
    7 [Men's Clothing, Men's Accessories, Men's Shoes] EUR ... order abd
    11 [Men's Accessories, Men's Clothing] EUR ... order eddie
    ... ... ... ... ... ...
    4663 [Men's Shoes, Men's Clothing] EUR ... order samir
    4667 [Men's Clothing, Men's Shoes] EUR ... order sultan
    4671 [Men's Clothing] EUR ... order jim
    4672 [Men's Clothing] EUR ... order yahya
    4674 [Women's Accessories, Men's Clothing] EUR ... order jackson
    <BLANKLINE>
    [2310 rows x 45 columns]
    """
    # Determine which columns will be used. The Elasticsearch dtypes are
    # only looked up when the caller didn't name the columns explicitly.
    if columns is None:
        columns = [
            column
            for column, es_dtype in self.es_dtypes.to_dict().items()
            if es_dtype == "text"
        ]
    elif isinstance(columns, str):
        columns = [columns]
    else:
        # Copy so the query never aliases the caller's sequence
        columns = list(columns)

    qc = self._query_compiler
    # Named 'match_filter' so the builtin filter() isn't shadowed
    match_filter = qc.es_match(
        text,
        columns,
        match_phrase=match_phrase,
        match_only_text_fields=match_only_text_fields,
        multi_match_type=multi_match_type,
        analyzer=analyzer,
        fuzziness=fuzziness,
        **kwargs,
    )
    if must_not_match:
        match_filter = ~match_filter
    return DataFrame(_query_compiler=qc._update_query(match_filter))

def es_query(self, query) -> "DataFrame":
"""Applies an Elasticsearch DSL query to the current DataFrame.
Expand Down
24 changes: 6 additions & 18 deletions eland/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,24 +995,12 @@ def _es_results(self, query_compiler, collector):
is_scan = False
if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
if size > 0:
try:

es_results = query_compiler._client.search(
index=query_compiler._index_pattern,
size=size,
sort=sort_params,
body=body,
)
except Exception:
# Catch all ES errors and print debug (currently to stdout)
error = {
"index": query_compiler._index_pattern,
"size": size,
"sort": sort_params,
"body": body,
}
print("Elasticsearch error:", error)
raise
es_results = query_compiler._client.search(
index=query_compiler._index_pattern,
size=size,
sort=sort_params,
body=body,
)
else:
is_scan = True
es_results = scan(
Expand Down
73 changes: 72 additions & 1 deletion eland/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import copy
from datetime import datetime
from typing import TYPE_CHECKING, List, Optional, Sequence
from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Union

import numpy as np # type: ignore
import pandas as pd # type: ignore
Expand Down Expand Up @@ -430,6 +430,77 @@ def sample(self, n=None, frac=None, random_state=None):

return result

def es_match(
    self,
    text: str,
    columns: Sequence[str],
    *,
    match_phrase: bool = False,
    match_only_text_fields: bool = True,
    multi_match_type: Optional[str] = None,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> QueryFilter:
    """Build a ``QueryFilter`` holding an Elasticsearch full-text query.

    A single column produces a ``match`` query (or ``match_phrase`` when
    ``match_phrase=True``); several columns produce a ``multi_match`` query
    with the columns given as ``fields``.

    Parameters
    ----------
    text: str
        String of text to search for, sent as the ``query`` option
    columns: Sequence[str]
        Columns to search over, must not be empty
    match_phrase: bool, default False
        Use ``match_phrase`` for one column, or ``multi_match`` with
        ``type='phrase'`` for several columns
    match_only_text_fields: bool, default True
        When True raise an error if any non-wildcard column isn't of
        Elasticsearch type 'text'. When False the check is skipped and
        ``lenient`` defaults to True in the query options.
    multi_match_type: str, optional
        Value for the ``multi_match`` ``type`` option, only valid when
        searching more than one column
    analyzer: str, optional
        Value for the ``analyzer`` option of the query
    fuzziness: int, str, optional
        Value for the ``fuzziness`` option of the query
    **kwargs
        Additional options merged into the query body, these override
        the options built from the named parameters

    Returns
    -------
    QueryFilter
        Filter wrapping the generated query

    Raises
    ------
    ValueError
        If ``columns`` is empty, if a non-text column is queried while
        ``match_only_text_fields`` is True, if ``multi_match_type`` is
        given for a single column, or if ``match_phrase=True`` is combined
        with a ``multi_match_type`` other than ``'phrase'``
    """
    if len(columns) < 1:
        raise ValueError("columns can't be empty")

    # Build the base options for the 'match_*' query
    options = {"query": text}
    if analyzer is not None:
        options["analyzer"] = analyzer
    if fuzziness is not None:
        options["fuzziness"] = fuzziness
    options.update(kwargs)

    if match_only_text_fields:
        # Refuse to query columns that aren't mapped as 'text'.
        # The dtypes are only needed for this check so fetch them here.
        es_dtypes = self.es_dtypes.to_dict()
        non_text_columns = {}
        for column in columns:
            # Don't worry about wildcards
            if "*" in column:
                continue

            es_dtype = es_dtypes[column]
            if es_dtype != "text":
                non_text_columns[column] = es_dtype
        if non_text_columns:
            described = ", ".join(
                f"{column}={es_dtype}"
                for column, es_dtype in non_text_columns.items()
            )
            raise ValueError(
                f"Attempting to run es_match() on non-text fields "
                f"({described}) "
                f"means that these fields may not be analyzed properly. "
                f"Consider reindexing these fields as text or use 'match_only_text_fields=False' "
                f"to use match anyways"
            )
    else:
        # Respect an explicit 'lenient' passed through kwargs
        options.setdefault("lenient", True)

    # If only one column use 'match'
    # otherwise use 'multi_match' with 'fields'
    if len(columns) == 1:
        if multi_match_type is not None:
            raise ValueError(
                "multi_match_type parameter only valid "
                "when searching more than one column"
            )
        query = {"match_phrase" if match_phrase else "match": {columns[0]: options}}
    else:
        options["fields"] = columns
        if match_phrase:
            if multi_match_type not in ("phrase", None):
                raise ValueError(
                    f"match_phrase=True and multi_match_type={multi_match_type!r} "
                    f"are not compatible. Must be multi_match_type='phrase'"
                )
            multi_match_type = "phrase"
        if multi_match_type is not None:
            options["type"] = multi_match_type

        query = {"multi_match": options}
    return QueryFilter(query)

def es_query(self, query):
    """Wrap ``query`` in a ``QueryFilter`` and return the result of
    passing that filter to ``self._update_query()``.
    """
    return self._update_query(QueryFilter(query))

Expand Down
69 changes: 69 additions & 0 deletions eland/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
LessEqual,
NotFilter,
NotNull,
QueryFilter,
ScriptFilter,
)
from eland.ndframe import NDFrame
Expand Down Expand Up @@ -636,6 +637,74 @@ def filter(
)
return Series(_query_compiler=new_query_compiler)

def es_match(
    self,
    text: str,
    *,
    match_phrase: bool = False,
    match_only_text_fields: bool = True,
    analyzer: Optional[str] = None,
    fuzziness: Optional[Union[int, str]] = None,
    **kwargs: Any,
) -> QueryFilter:
    """Build a filter from an Elasticsearch ``match`` or ``match_phrase``
    query against the column named by this Series.

    Read more about `Full-Text Queries in Elasticsearch <https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html>`_

    Any extra keyword arguments are forwarded to the query compiler along
    with the named options.

    Parameters
    ----------
    text: str
        String of text to search for
    match_phrase: bool, default False
        If True a ``match_phrase`` query is used instead of ``match``,
        which takes the order of the ``text`` parameter into account.
    match_only_text_fields: bool, default True
        When True an error is raised if a non-text field is queried,
        because fields that aren't analyzed may not match as expected.
        Set to False to skip this preventative check.
    analyzer: str, optional
        Analyzer to use for the match query
    fuzziness: int, str, optional
        Fuzziness option for the match query

    Returns
    -------
    QueryFilter
        Boolean filter to be combined with other filters and
        then passed to DataFrame[...].

    Examples
    --------
    >>> df = ed.DataFrame(
    ...     "localhost:9200", "ecommerce",
    ...     columns=["category", "taxful_total_price"]
    ... )
    >>> df[
    ...     df.category.es_match("Men's")
    ...     & (df.taxful_total_price > 200.0)
    ... ].head(5)
    category taxful_total_price
    13 [Men's Clothing] 266.96
    33 [Men's Clothing] 221.98
    54 [Men's Clothing] 234.98
    93 [Men's Shoes, Women's Accessories] 239.98
    273 [Men's Shoes] 214.98
    <BLANKLINE>
    [5 rows x 2 columns]
    """
    compiler = self._query_compiler
    # A Series is a single column, so the search targets just its name
    target_columns = [self.name]
    return compiler.es_match(
        text,
        columns=target_columns,
        match_phrase=match_phrase,
        match_only_text_fields=match_only_text_fields,
        analyzer=analyzer,
        fuzziness=fuzziness,
        **kwargs,
    )

def es_info(self) -> str:
buf = StringIO()

Expand Down
41 changes: 41 additions & 0 deletions eland/tests/dataframe/test_es_match_pytest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# File called _pytest for PyCharm compatibility

from eland.tests.common import TestData


class TestEsMatch(TestData):
    """Checks DataFrame.es_match() against the ecommerce test DataFrame."""

    def test_match(self):
        # Every returned row must have a category mentioning "Men's"
        ecommerce = self.ed_ecommerce()
        matched = ecommerce.es_match("Men's")
        rows = list(matched.category.to_pandas())

        assert len(rows) > 0
        for row in rows:
            assert any("Men's" in value for value in row)

    def test_must_not_match(self):
        # Exclude "Men's" first, then require "Women's" among what's left
        ecommerce = self.ed_ecommerce()
        without_mens = ecommerce.es_match("Men's", must_not_match=True)
        womens_only = without_mens.es_match("Women's")
        rows = list(womens_only.category.to_pandas())

        assert len(rows) > 0
        for row in rows:
            assert all("Men's" not in value for value in row)
        for row in rows:
            assert any("Women's" in value for value in row)
Loading

0 comments on commit cb4cd08

Please sign in to comment.