Skip to content

Commit

Permalink
Adds Dataset.query() method, analogous to pandas DataFrame.query() (#…
Browse files Browse the repository at this point in the history
…4984)

* initial work on Dataset.query

* dataset query: test backends, engines, parsers; add docstring

* add error test

* unfortunate typo

* test three dims

* refine tests

* fix error message

Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com>

* add requires decorators

* revert change, should be func name

* improve Dataset.query tests

* add DataArray.query

* add query to API docs

* add query to whats new

* fix black, mypy

* refine test parameterisation and requirements

Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com>
  • Loading branch information
alimanfoo and max-sixty authored Mar 16, 2021
1 parent 14b288b commit 37fe544
Show file tree
Hide file tree
Showing 8 changed files with 346 additions and 0 deletions.
1 change: 1 addition & 0 deletions ci/requirements/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies:
- nc-time-axis
- netcdf4
- numba
- numexpr
- numpy
- pandas
- pint
Expand Down
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ Indexing
Dataset.set_index
Dataset.reset_index
Dataset.reorder_levels
Dataset.query

Missing value handling
----------------------
Expand Down Expand Up @@ -321,6 +322,7 @@ Indexing
DataArray.set_index
DataArray.reset_index
DataArray.reorder_levels
DataArray.query

Missing value handling
----------------------
Expand Down
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ v0.17.1 (unreleased)

New Features
~~~~~~~~~~~~

- Add :py:meth:`Dataset.query` and :py:meth:`DataArray.query` which enable indexing
of datasets and data arrays by evaluating query expressions against the values of the
data variables (:pull:`4984`). By `Alistair Miles <https://github.com/alimanfoo>`_.
- Allow passing ``combine_attrs`` to :py:meth:`Dataset.merge` (:pull:`4895`).
By `Justus Magin <https://github.com/keewis>`_.
- Support for `dask.graph_manipulation
Expand Down
64 changes: 64 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4354,6 +4354,70 @@ def argmax(
else:
return self._replace_maybe_drop_dims(result)

def query(
    self,
    queries: Mapping[Hashable, Any] = None,
    parser: str = "pandas",
    engine: str = None,
    missing_dims: str = "raise",
    **queries_kwargs: Any,
) -> "DataArray":
    """Return a new data array indexed along the specified
    dimension(s), where the indexers are given as strings containing
    Python expressions to be evaluated against the values in the array.

    Parameters
    ----------
    queries : dict, optional
        A dict with keys matching dimensions and values given by strings
        containing Python expressions to be evaluated against the values
        in the array. The expressions will be evaluated using the pandas
        eval() function, and can contain any valid Python expressions but
        cannot contain any Python statements.
    parser : {"pandas", "python"}, default: "pandas"
        The parser to use to construct the syntax tree from the expression.
        The default of 'pandas' parses code slightly different than standard
        Python. Alternatively, you can parse an expression using the 'python'
        parser to retain strict Python semantics.
    engine : {"python", "numexpr", None}, default: None
        The engine used to evaluate the expression. Supported engines are:

        - None: tries to use numexpr, falls back to python
        - "numexpr": evaluates expressions using numexpr
        - "python": performs operations as if you had eval'd in top level
          python

    missing_dims : {"raise", "warn", "ignore"}, default: "raise"
        What to do if dimensions that should be selected from are not
        present in the DataArray:

        - "raise": raise an exception
        - "warn": raise a warning, and ignore the missing dimensions
        - "ignore": ignore the missing dimensions

    **queries_kwargs : {dim: query, ...}, optional
        The keyword arguments form of ``queries``.
        One of queries or queries_kwargs must be provided.

    Returns
    -------
    obj : DataArray
        A new DataArray with the same contents as this data array, indexed
        by the results of the appropriate queries.

    See Also
    --------
    DataArray.isel
    Dataset.query
    pandas.eval
    """

    # Delegate to Dataset.query: wrap this array in a temporary dataset so
    # that the query expressions can refer to the array by its name.
    # NOTE(review): the result is looked up via ``self.name``, so this
    # presumably requires a named array -- confirm against
    # ``_to_dataset_whole``.
    ds = self._to_dataset_whole(shallow_copy=True)
    selected = ds.query(
        queries=queries,
        parser=parser,
        engine=engine,
        missing_dims=missing_dims,
        **queries_kwargs,
    )
    return selected[self.name]

# NOTE: keep this assignment at the end, after everything else that mentions
# `str`. Binding the name `str` to the accessor makes mypy confuse it with
# the builtin `str` in whatever follows; see
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
str = utils.UncachedAccessor(StringAccessor)
Expand Down
73 changes: 73 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7001,5 +7001,78 @@ def argmax(self, dim=None, **kwargs):
"Dataset.argmin() with a sequence or ... for dim"
)

def query(
    self,
    queries: Mapping[Hashable, Any] = None,
    parser: str = "pandas",
    engine: str = None,
    missing_dims: str = "raise",
    **queries_kwargs: Any,
) -> "Dataset":
    """Return a new dataset with each array indexed along the specified
    dimension(s), where the indexers are given as strings containing
    Python expressions to be evaluated against the data variables in the
    dataset.

    Parameters
    ----------
    queries : dict, optional
        A dict with keys matching dimensions and values given by strings
        containing Python expressions to be evaluated against the data
        variables in the dataset. The expressions will be evaluated using
        the pandas eval() function, and can contain any valid Python
        expressions but cannot contain any Python statements.
    parser : {"pandas", "python"}, default: "pandas"
        The parser to use to construct the syntax tree from the expression.
        The default of 'pandas' parses code slightly different than standard
        Python. Alternatively, you can parse an expression using the 'python'
        parser to retain strict Python semantics.
    engine : {"python", "numexpr", None}, default: None
        The engine used to evaluate the expression. Supported engines are:

        - None: tries to use numexpr, falls back to python
        - "numexpr": evaluates expressions using numexpr
        - "python": performs operations as if you had eval'd in top level
          python

    missing_dims : {"raise", "warn", "ignore"}, default: "raise"
        What to do if dimensions that should be selected from are not
        present in the Dataset:

        - "raise": raise an exception
        - "warn": raise a warning, and ignore the missing dimensions
        - "ignore": ignore the missing dimensions

    **queries_kwargs : {dim: query, ...}, optional
        The keyword arguments form of ``queries``.
        One of queries or queries_kwargs must be provided.

    Returns
    -------
    obj : Dataset
        A new Dataset with the same contents as this dataset, except each
        array and dimension is indexed by the results of the appropriate
        queries.

    Raises
    ------
    ValueError
        If the query given for any dimension is not a string.

    See Also
    --------
    Dataset.isel
    pandas.eval
    """

    # allow queries to be given either as a dict or as kwargs
    queries = either_dict_or_kwargs(queries, queries_kwargs, "query")

    # validate every query before evaluating any of them, so a bad entry
    # is reported without first paying for the evaluation of the others
    for dim, expr in queries.items():
        if not isinstance(expr, str):
            msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given"
            raise ValueError(msg)

    # evaluate each query with this dataset as the name resolver, so that
    # variable names in the expression refer to this dataset's contents
    indexers = {
        dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine)
        for dim, expr in queries.items()
    }

    # apply the selection
    return self.isel(indexers, missing_dims=missing_dims)


ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)
1 change: 1 addition & 0 deletions xarray/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def LooseVersion(vstring):
has_cartopy, requires_cartopy = _importorskip("cartopy")
# Need Pint 0.15 for __dask_tokenize__ tests for Quantity wrapped Dask Arrays
has_pint_0_15, requires_pint_0_15 = _importorskip("pint", minversion="0.15")
has_numexpr, requires_numexpr = _importorskip("numexpr")

# some special cases
has_scipy_or_netCDF4 = has_scipy or has_netCDF4
Expand Down
70 changes: 70 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.core.computation.ops import UndefinedVariableError
from pandas.tseries.frequencies import to_offset

import xarray as xr
Expand Down Expand Up @@ -39,6 +40,7 @@
requires_dask,
requires_iris,
requires_numbagg,
requires_numexpr,
requires_scipy,
requires_sparse,
source_ndarray,
Expand Down Expand Up @@ -4620,6 +4622,74 @@ def test_pad_reflect(self, mode, reflect_type):
assert actual.shape == (7, 4, 9)
assert_identical(actual, expected)

@pytest.mark.parametrize("parser", ["pandas", "python"])
@pytest.mark.parametrize(
    "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])]
)
@pytest.mark.parametrize(
    "backend", ["numpy", pytest.param("dask", marks=[requires_dask])]
)
def test_query(self, backend, engine, parser):
    """Check DataArray.query against the equivalent boolean-mask isel calls."""

    # raw fixtures; the seed is fixed so the random arrays are reproducible
    np.random.seed(42)
    a = np.arange(0, 10, 1)
    b = np.random.randint(0, 100, size=10)
    c = np.linspace(0, 1, 20)
    d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype(
        object
    )

    # pick the array backend once, then build the DataArrays uniformly
    if backend == "numpy":
        data_a, data_b, data_c, data_d = a, b, c, d
    elif backend == "dask":
        import dask.array as da

        data_a = da.from_array(a, chunks=3)
        data_b = da.from_array(b, chunks=3)
        data_c = da.from_array(c, chunks=7)
        data_d = da.from_array(d, chunks=12)

    aa = DataArray(data=data_a, dims=["x"], name="a")
    bb = DataArray(data=data_b, dims=["x"], name="b")
    cc = DataArray(data=data_c, dims=["y"], name="c")
    dd = DataArray(data=data_d, dims=["z"], name="d")

    # evaluation options shared by every successful query below
    eval_opts = dict(engine=engine, parser=parser)

    # one dimension, one variable, query passed as a keyword
    actual = aa.query(x="a > 5", **eval_opts)
    assert_identical(aa.isel(x=(a > 5)), actual)

    # same query, passed as a dict
    actual = aa.query(dict(x="a > 5"), **eval_opts)
    assert_identical(aa.isel(dict(x=(a > 5))), actual)

    # integer data drawn at random
    actual = bb.query(x="b > 50", **eval_opts)
    assert_identical(bb.isel(x=(b > 50)), actual)

    # floating point data along a different dimension
    actual = cc.query(y="c < .5", **eval_opts)
    assert_identical(cc.isel(y=(c < 0.5)), actual)

    # string comparison
    if parser == "pandas":
        # N.B., this query currently only works with the pandas parser
        # xref https://github.com/pandas-dev/pandas/issues/40436
        actual = dd.query(z='d == "bar"', **eval_opts)
        assert_identical(dd.isel(z=(d == "bar")), actual)

    # invalid usage must be rejected
    with pytest.raises(ValueError):
        aa.query("a > 5")  # must be dict or kwargs
    with pytest.raises(ValueError):
        aa.query(x=(a > 5))  # must be query string
    with pytest.raises(UndefinedVariableError):
        aa.query(x="spam > 50")  # name not present


class TestReduce:
@pytest.fixture(autouse=True)
Expand Down
Loading

0 comments on commit 37fe544

Please sign in to comment.