Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds Dataset.query() method, analogous to pandas DataFrame.query() #4984

Merged
merged 16 commits into from
Mar 16, 2021
Merged
31 changes: 31 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6980,5 +6980,36 @@ def argmax(self, dim=None, axis=None, **kwargs):
"Dataset.argmin() with a sequence or ... for dim"
)

def query(
    self,
    queries: Mapping[Hashable, Any] = None,
    parser: str = "pandas",
    engine: str = None,
    missing_dims: str = "raise",
    **queries_kwargs: Any,
) -> "Dataset":
    """Return a new dataset indexed along the given dimension(s), where each
    indexer is obtained by evaluating a string expression against the
    variables of this dataset.

    This is analogous to :py:meth:`pandas.DataFrame.query`.

    Parameters
    ----------
    queries : dict, optional
        Mapping from dimension name to an expression string. Each
        expression is evaluated with :py:func:`pandas.eval`, using this
        dataset as the name resolver, and the result is used as the
        positional indexer for that dimension. May be given as keyword
        arguments instead (see ``queries_kwargs``).
    parser : {"pandas", "python"}, default: "pandas"
        Parser used to build the syntax tree from each expression;
        forwarded to :py:func:`pandas.eval`.
    engine : {"python", "numexpr", None}, default: None
        Engine used to evaluate each expression; forwarded to
        :py:func:`pandas.eval`.
    missing_dims : {"raise", "warn", "ignore"}, default: "raise"
        Forwarded unchanged to :py:meth:`Dataset.isel`, which decides what
        happens when a queried dimension is not present in the dataset.
    **queries_kwargs : str
        The keyword-argument form of ``queries``.

    Returns
    -------
    Dataset
        The result of ``self.isel(...)`` applied with the evaluated
        indexers.

    Raises
    ------
    ValueError
        If the expression given for any dimension is not a string.

    See Also
    --------
    Dataset.isel
    pandas.eval

    Examples
    --------
    For a dataset ``ds`` with variables ``a`` and ``b`` along dimension
    ``x``, keep the positions where both conditions hold:

    >>> ds.query(x="(a > 5) & (b > 50)")  # doctest: +SKIP
    """

    # allow queries to be given either as a dict or as kwargs
    queries = either_dict_or_kwargs(queries, queries_kwargs, "query")

    # Reject non-string expressions up front, before anything is evaluated.
    # Missing-dimension handling is deliberately left to isel() via
    # ``missing_dims``.
    for dim, expr in queries.items():
        if not isinstance(expr, str):
            msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given"
            raise ValueError(msg)

    # Evaluate each expression with the dataset itself as the resolver, so
    # names in the expression are looked up among the dataset's variables.
    indexers = {
        dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine)
        for dim, expr in queries.items()
    }

    # Indexers are not validated here; isel() is left to accept or reject them.
    return self.isel(indexers, missing_dims=missing_dims)


# NOTE(review): presumably attaches operator and reduction methods to Dataset
# (array_only=False) — confirm against ops.inject_all_ops_and_reduce_methods.
ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)
57 changes: 57 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5807,6 +5807,63 @@ def test_astype_attrs(self):
assert not data.astype(float, keep_attrs=False).attrs
assert not data.astype(float, keep_attrs=False).var1.attrs

def test_query_single_dim(self):
    """Check Dataset.query() against the equivalent boolean-mask isel() call.

    Covers single- and multi-variable expressions on one dimension, the
    ``and`` keyword form, and queries on two dimensions at once, with the
    queries passed both as keyword arguments and as a dict.
    """

    # fixture: "a" and "b" lie along x, "c" along y, "d" along both
    np.random.seed(42)
    a = np.arange(0, 10, 1)
    b = np.random.randint(0, 100, size=10)
    c = np.linspace(0, 1, 20)
    d = np.arange(0, 200).reshape(10, 20)
    data_vars = {
        "a": ("x", a),
        "b": ("x", b),
        "c": ("y", c),
        "d": (("x", "y"), d),
    }
    ds = Dataset(data_vars)

    # each case: (pass as a dict?, expression per dim, expected mask per dim)
    cases = [
        # single dim, single variable -- as kwargs, then as a dict
        (False, {"x": "a > 5"}, {"x": (a > 5)}),
        (True, {"x": "a > 5"}, {"x": (a > 5)}),
        (False, {"x": "b > 50"}, {"x": (b > 50)}),
        (False, {"y": "c < .5"}, {"y": (c < 0.5)}),
        # single dim, multiple variables
        (False, {"x": "(a > 5) & (b > 50)"}, {"x": ((a > 5) & (b > 50))}),
        # "and" form, supported through the pandas query parser
        (False, {"x": "(a > 5) and (b > 50)"}, {"x": ((a > 5) & (b > 50))}),
        # multiple dims -- as kwargs, then as a dict
        (False, {"x": "a > 5", "y": "c < .5"}, {"x": (a > 5), "y": (c < 0.5)}),
        (True, {"x": "a > 5", "y": "c < .5"}, {"x": (a > 5), "y": (c < 0.5)}),
    ]

    for as_dict, exprs, masks in cases:
        if as_dict:
            actual = ds.query(exprs)
            expect = ds.isel(masks)
        else:
            actual = ds.query(**exprs)
            expect = ds.isel(**masks)
        assert_identical(expect, actual)

    # TODO test error handling

    # TODO test dask data variables


# Py.test tests

Expand Down