From 8b542f8e4a703770617612788cdd3d3d5fea14dc Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 2 Mar 2021 11:03:57 +0000 Subject: [PATCH 01/15] initial work on Dataset.query --- xarray/core/dataset.py | 31 ++++++++++++++++++++ xarray/tests/test_dataset.py | 57 ++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9faf74dd4bc..2f81e952194 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6980,5 +6980,36 @@ def argmax(self, dim=None, axis=None, **kwargs): "Dataset.argmin() with a sequence or ... for dim" ) + def query( + self, + queries: Mapping[Hashable, Any] = None, + parser: str = "pandas", + engine: str = None, + missing_dims: str = "raise", + **queries_kwargs: Any, + ) -> "Dataset": + """TODO docstring""" + + # allow queries to be given either as a dict or as kwargs + queries = either_dict_or_kwargs(queries, queries_kwargs, "query") + + # check queries + for dim, expr in queries.items(): + if not isinstance(expr, str): + msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + # TODO check missing dims here, or delegate to isel? + + # evaluate the queries to create the indexers + indexers = { + dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine) + for dim, expr in queries.items() + } + + # TODO any validation of indexers? Or just let isel try to handle it? + + # apply the selection + return self.isel(indexers, missing_dims=missing_dims) + ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2118bc8b780..cc1ff2433bb 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5807,6 +5807,63 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs + def test_query_single_dim(self): + """Test querying a single dimension.""" + + # setup test data + np.random.seed(42) + a = np.arange(0, 10, 1) + b = np.random.randint(0, 100, size=10) + c = np.linspace(0, 1, 20) + d = np.arange(0, 200).reshape(10, 20) + ds = Dataset( + {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)} + ) + + # query single dim, single variable + actual = ds.query(x="a > 5") + expect = ds.isel(x=(a > 5)) + assert_identical(expect, actual) + + # query single dim, single variable, via dict + actual = ds.query(dict(x="a > 5")) + expect = ds.isel(dict(x=(a > 5))) + assert_identical(expect, actual) + + # query single dim, single variable + actual = ds.query(x="b > 50") + expect = ds.isel(x=(b > 50)) + assert_identical(expect, actual) + + # query single dim, single variable + actual = ds.query(y="c < .5") + expect = ds.isel(y=(c < 0.5)) + assert_identical(expect, actual) + + # query single dim, multiple variables + actual = ds.query(x="(a > 5) & (b > 50)") + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) + + # support pandas query parser + actual = ds.query(x="(a > 5) and (b > 50)") + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) + + # query multiple dims via kwargs + actual = ds.query(x="a > 5", y="c < .5") + expect = ds.isel(x=(a > 5), y=(c < 0.5)) + assert_identical(expect, actual) + + # query multiple dims via dict + actual = ds.query(dict(x="a > 5", y="c < .5")) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) + assert_identical(expect, actual) + + # TODO test error handling + + # 
TODO test dask data variables + # Py.test tests From a41e80539310bc8c9e7416e3e57fd02d0455e5df Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 12 Mar 2021 18:13:03 +0000 Subject: [PATCH 02/15] dataset query: test backends, engines, parsers; add docstring --- ci/requirements/environment.yml | 1 + xarray/core/dataset.py | 50 ++++++++++++++++++++++++++--- xarray/tests/test_dataset.py | 56 ++++++++++++++++++++++----------- 3 files changed, 85 insertions(+), 22 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 36147c64c03..57498fa5700 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -22,6 +22,7 @@ dependencies: - nc-time-axis - netcdf4 - numba + - numexpr - numpy - pandas - pint diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2f81e952194..d6b44559635 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6988,7 +6988,52 @@ def query( missing_dims: str = "raise", **queries_kwargs: Any, ) -> "Dataset": - """TODO docstring""" + """Return a new dataset with each array indexed along the specified + dimension(s), where the indexers are given as strings containing + Python expressions to be evaluated against the data variables in the + dataset. + + Parameters + ---------- + queries : dict, optional + A dic with keys matching dimensions and values given by strings + containing Python expressions to be evaluated against the data variables + in the dataset. The expressions will be evaluated using the pandas + eval() function, and can contain any valid Python expressions but cannot + contain any Python statements. + parser : {"pandas", "python"}, default: "pandas" + The parser to use to construct the syntax tree from the expression. + The default of 'pandas' parses code slightly different than standard + Python. Alternatively, you can parse an expression using the 'python' + parser to retain strict Python semantics. + engine: {"python", "numexpr", None}, default: None + The engine used to evaluate the expression. Supported engines are: + - None: tries to use numexpr, falls back to python + - "numexpr": evaluates expressions using numexpr + - "python": performs operations as if you had eval’d in top level python + missing_dims : {"raise", "warn", "ignore"}, default: "raise" + What to do if dimensions that should be selected from are not present in the + Dataset: + - "raise": raise an exception + - "warning": raise a warning, and ignore the missing dimensions + - "ignore": ignore the missing dimensions + **queries_kwargs : {dim: query, ...}, optional + The keyword arguments form of ``queries``. + One of queries or queries_kwargs must be provided. + + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + array and dimension is indexed by the results of the appropriate + queries. + + See Also + -------- + Dataset.isel + pandas.eval + + """ # allow queries to be given either as a dict or as kwargs queries = either_dict_or_kwargs(queries, queries_kwargs, "query") @@ -6998,7 +7043,6 @@ def query( if not isinstance(expr, str): msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given" raise ValueError(msg) - # TODO check missing dims here, or delegate to isel? # evaluate the queries to create the indexers indexers = { @@ -7006,8 +7050,6 @@ def query( for dim, expr in queries.items() } - # TODO any validation of indexers? Or just let isel try to handle it? 
- # apply the selection return self.isel(indexers, missing_dims=missing_dims) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index cc1ff2433bb..8cd8f5bfb1c 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5807,8 +5807,11 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs - def test_query_single_dim(self): - """Test querying a single dimension.""" + @pytest.mark.parametrize("parser", ["pandas", "python"]) + @pytest.mark.parametrize("engine", ["python", "numexpr", None]) + @pytest.mark.parametrize("backend", ["numpy", "dask"]) + def test_query(self, backend, engine, parser): + """Test querying a dataset.""" # setup test data np.random.seed(42) @@ -5816,53 +5819,70 @@ def test_query_single_dim(self): b = np.random.randint(0, 100, size=10) c = np.linspace(0, 1, 20) d = np.arange(0, 200).reshape(10, 20) - ds = Dataset( - {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)} - ) + if backend == "numpy": + ds = Dataset( + {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)} + ) + elif backend == "dask": + ds = Dataset( + { + "a": ("x", da.from_array(a, chunks=3)), + "b": ("x", da.from_array(b, chunks=3)), + "c": ("y", da.from_array(c, chunks=7)), + "d": (("x", "y"), da.from_array(d, chunks=(3, 7))), + } + ) # query single dim, single variable - actual = ds.query(x="a > 5") + actual = ds.query(x="a > 5", engine=engine, parser=parser) expect = ds.isel(x=(a > 5)) assert_identical(expect, actual) # query single dim, single variable, via dict - actual = ds.query(dict(x="a > 5")) + actual = ds.query(dict(x="a > 5"), engine=engine, parser=parser) expect = ds.isel(dict(x=(a > 5))) assert_identical(expect, actual) # query single dim, single variable - actual = ds.query(x="b > 50") + actual = ds.query(x="b > 50", engine=engine, parser=parser) expect = ds.isel(x=(b > 50)) assert_identical(expect, actual) # query single dim, single variable - actual = ds.query(y="c < .5") + actual = ds.query(y="c < .5", engine=engine, parser=parser) expect = ds.isel(y=(c < 0.5)) assert_identical(expect, actual) # query single dim, multiple variables - actual = ds.query(x="(a > 5) & (b > 50)") + actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser) expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) # support pandas query parser - actual = ds.query(x="(a > 5) and (b > 50)") - expect = ds.isel(x=((a > 5) & (b > 50))) - assert_identical(expect, actual) + if parser == "pandas": + actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) # query multiple dims via kwargs - actual = ds.query(x="a > 5", y="c < .5") + actual = ds.query(x="a > 5", y="c < .5", engine=engine, parser=parser) expect = ds.isel(x=(a > 5), y=(c < 0.5)) assert_identical(expect, actual) # query multiple dims via dict - actual = ds.query(dict(x="a > 5", y="c < .5")) + actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser) expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) assert_identical(expect, actual) - # TODO test error handling - - # TODO test dask data variables + # test error handling + with pytest.raises(ValueError): + ds.query("a > 5") # must be dict + with pytest.raises(IndexError): + ds.query(y="a > 5") # wrong length dimension + with pytest.raises(IndexError): + ds.query(x="c < .5") # wrong length dimension + with 
pytest.raises(IndexError): + ds.query(x="d > 100") # wrong number of dimensions # Py.test tests From 907f226aa70afed7f199133dad5a36caf83a7829 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 12 Mar 2021 18:17:49 +0000 Subject: [PATCH 03/15] add error test --- xarray/tests/test_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8cd8f5bfb1c..9f488c17712 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5876,7 +5876,9 @@ def test_query(self, backend, engine, parser): # test error handling with pytest.raises(ValueError): - ds.query("a > 5") # must be dict + ds.query("a > 5") # must be dict or kwargs + with pytest.raises(ValueError): + ds.query(x=(a > 5)) # must be query string with pytest.raises(IndexError): ds.query(y="a > 5") # wrong length dimension with pytest.raises(IndexError): From 621243778dbbc1a4ceb14cfff46c404a7afb50fc Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 12 Mar 2021 18:19:49 +0000 Subject: [PATCH 04/15] unfortunate typo --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d6b44559635..c6e846b7e14 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6996,7 +6996,7 @@ def query( Parameters ---------- queries : dict, optional - A dic with keys matching dimensions and values given by strings + A dict with keys matching dimensions and values given by strings containing Python expressions to be evaluated against the data variables in the dataset. The expressions will be evaluated using the pandas eval() function, and can contain any valid Python expressions but cannot From a5e59324181bb5c44d9a4748458316f8c5523891 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 13 Mar 2021 23:40:47 +0000 Subject: [PATCH 05/15] test three dims --- xarray/tests/test_dataset.py | 39 ++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 9f488c17712..1c3ccfd46e4 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5819,9 +5819,18 @@ def test_query(self, backend, engine, parser): b = np.random.randint(0, 100, size=10) c = np.linspace(0, 1, 20) d = np.arange(0, 200).reshape(10, 20) + e = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + object + ) if backend == "numpy": ds = Dataset( - {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)} + { + "a": ("x", a), + "b": ("x", b), + "c": ("y", c), + "d": (("x", "y"), d), + "e": ("z", e), + } ) elif backend == "dask": ds = Dataset( @@ -5830,6 +5839,7 @@ def test_query(self, backend, engine, parser): "b": ("x", da.from_array(b, chunks=3)), "c": ("y", da.from_array(c, chunks=7)), "d": (("x", "y"), da.from_array(d, chunks=(3, 7))), + "e": ("z", da.from_array(e, chunks=12)), } ) @@ -5853,12 +5863,19 @@ def test_query(self, backend, engine, parser): expect = ds.isel(y=(c < 0.5)) assert_identical(expect, actual) + # query single dim, single string variable + # N.B., this query raises NotImplemented for the Python parser, not clear why (same behaviour in pandas) + if parser == "pandas": + actual = ds.query(z='e == "foo"', engine=engine, parser=parser) + expect = ds.isel(z=(e == "foo")) + assert_identical(expect, actual) + # query single dim, multiple variables actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser) 
expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) - # support pandas query parser + # check pandas query parser if parser == "pandas": actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) expect = ds.isel(x=((a > 5) & (b > 50))) @@ -5869,11 +5886,29 @@ def test_query(self, backend, engine, parser): expect = ds.isel(x=(a > 5), y=(c < 0.5)) assert_identical(expect, actual) + # query multiple dims via kwargs + if parser == "pandas": + actual = ds.query( + x="a > 5", y="c < .5", z="e == 'foo'", engine=engine, parser=parser + ) + expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(e == "foo")) + assert_identical(expect, actual) + # query multiple dims via dict actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser) expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) assert_identical(expect, actual) + # query multiple dims via dict + if parser == "pandas": + actual = ds.query( + dict(x="a > 5", y="c < .5", z="e == 'foo'"), + engine=engine, + parser=parser, + ) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(e == "foo"))) + assert_identical(expect, actual) + # test error handling with pytest.raises(ValueError): ds.query("a > 5") # must be dict or kwargs From 372394647e38a1a900e4ceac58291510dca41e2b Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sun, 14 Mar 2021 22:41:16 +0000 Subject: [PATCH 06/15] refine tests --- xarray/tests/test_dataset.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 1c3ccfd46e4..b18fd008063 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5818,18 +5818,20 @@ def test_query(self, backend, engine, parser): a = np.arange(0, 10, 1) b = np.random.randint(0, 100, size=10) c = np.linspace(0, 1, 20) - d = np.arange(0, 200).reshape(10, 20) - e = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( object ) + e = np.arange(0, 10 * 20).reshape(10, 20) + f = np.random.normal(0, 1, size=(10, 20, 30)) if backend == "numpy": ds = Dataset( { "a": ("x", a), "b": ("x", b), "c": ("y", c), - "d": (("x", "y"), d), - "e": ("z", e), + "d": ("z", d), + "e": (("x", "y"), e), + "f": (("x", "y", "z"), f), } ) elif backend == "dask": @@ -5838,8 +5840,9 @@ def test_query(self, backend, engine, parser): "a": ("x", da.from_array(a, chunks=3)), "b": ("x", da.from_array(b, chunks=3)), "c": ("y", da.from_array(c, chunks=7)), - "d": (("x", "y"), da.from_array(d, chunks=(3, 7))), - "e": ("z", da.from_array(e, chunks=12)), + "d": ("z", da.from_array(d, chunks=12)), + "e": (("x", "y"), da.from_array(e, chunks=(3, 7))), + "f": (("x", "y", "z"), da.from_array(f, chunks=(3, 7, 12))), } ) @@ -5864,10 +5867,11 @@ def test_query(self, backend, engine, parser): assert_identical(expect, actual) # query single dim, single string variable - # N.B., this query raises NotImplemented for the Python parser, not clear why (same behaviour in pandas) if parser == "pandas": - actual = ds.query(z='e == "foo"', engine=engine, parser=parser) - expect = ds.isel(z=(e == "foo")) + # N.B., this query currently only works with the pandas parser + # xref https://github.com/pandas-dev/pandas/issues/40436 + actual = ds.query(z='d == "bar"', engine=engine, parser=parser) + expect = ds.isel(z=(d == "bar")) assert_identical(expect, actual) # query single dim, multiple variables @@ -5875,7 +5879,7 @@ def test_query(self, backend, engine, 
parser): expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) - # check pandas query parser + # check pandas query syntax is supported if parser == "pandas": actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) expect = ds.isel(x=((a > 5) & (b > 50))) @@ -5889,9 +5893,9 @@ def test_query(self, backend, engine, parser): # query multiple dims via kwargs if parser == "pandas": actual = ds.query( - x="a > 5", y="c < .5", z="e == 'foo'", engine=engine, parser=parser + x="a > 5", y="c < .5", z="d == 'bar'", engine=engine, parser=parser ) - expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(e == "foo")) + expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(d == "bar")) assert_identical(expect, actual) # query multiple dims via dict @@ -5902,11 +5906,11 @@ def test_query(self, backend, engine, parser): # query multiple dims via dict if parser == "pandas": actual = ds.query( - dict(x="a > 5", y="c < .5", z="e == 'foo'"), + dict(x="a > 5", y="c < .5", z="d == 'bar'"), engine=engine, parser=parser, ) - expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(e == "foo"))) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(d == "bar"))) assert_identical(expect, actual) # test error handling @@ -5919,7 +5923,7 @@ def test_query(self, backend, engine, parser): with pytest.raises(IndexError): ds.query(x="c < .5") # wrong length dimension with pytest.raises(IndexError): - ds.query(x="d > 100") # wrong number of dimensions + ds.query(x="e > 100") # wrong number of dimensions # Py.test tests From 2d4a74dbe818a66d1de9610b3ae3d3a3814a4d77 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 09:28:07 +0000 Subject: [PATCH 07/15] fix error message Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index c6e846b7e14..1762e17573e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7036,7 +7036,7 @@ def query( """ # allow queries to be given either as a dict or as kwargs - queries = either_dict_or_kwargs(queries, queries_kwargs, "query") + queries = either_dict_or_kwargs(queries, queries_kwargs, "queries") # check queries for dim, expr in queries.items(): From 0ba3db3c12e3e678c1525e645a8428764b0d56cf Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 09:34:22 +0000 Subject: [PATCH 08/15] add requires decorators --- xarray/tests/__init__.py | 1 + xarray/tests/test_dataset.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 4b47e1d2c7e..aebcb0f2b8d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -83,6 +83,7 @@ def LooseVersion(vstring): has_cartopy, requires_cartopy = _importorskip("cartopy") # Need Pint 0.15 for __dask_tokenize__ tests for Quantity wrapped Dask Arrays has_pint_0_15, requires_pint_0_15 = _importorskip("pint", minversion="0.15") +has_numexpr, requires_numexpr = _importorskip("numexpr") # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index b18fd008063..9a463870368 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -46,6 +46,7 @@ requires_numbagg, requires_scipy, requires_sparse, + requires_numexpr, source_ndarray, ) @@ -5807,6 +5808,8 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, 
keep_attrs=False).var1.attrs + @requires_dask + @requires_numexpr @pytest.mark.parametrize("parser", ["pandas", "python"]) @pytest.mark.parametrize("engine", ["python", "numexpr", None]) @pytest.mark.parametrize("backend", ["numpy", "dask"]) From 9dddc135edc038dd6dc948fe7545cb10ab650835 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:15:04 +0000 Subject: [PATCH 09/15] revert change, should be func name --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1762e17573e..c6e846b7e14 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7036,7 +7036,7 @@ def query( """ # allow queries to be given either as a dict or as kwargs - queries = either_dict_or_kwargs(queries, queries_kwargs, "queries") + queries = either_dict_or_kwargs(queries, queries_kwargs, "query") # check queries for dim, expr in queries.items(): From c3f322c075ff2c5125069ee506cf7d7e1a988bdb Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:15:26 +0000 Subject: [PATCH 10/15] improve Dataset.query tests --- xarray/tests/test_dataset.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 9a463870368..256f4873cc7 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -9,6 +9,7 @@ import pandas as pd import pytest from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.computation.ops import UndefinedVariableError import xarray as xr from xarray import ( @@ -5882,6 +5883,11 @@ def test_query(self, backend, engine, parser): expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) + # query single dim, multiple variables with computation + actual = ds.query(x="(a * b) > 250", engine=engine, parser=parser) + expect = ds.isel(x=(a * b) > 250) + assert_identical(expect, actual) + # check pandas query syntax is supported if parser == "pandas": actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) @@ -5927,6 +5933,8 @@ def test_query(self, backend, engine, parser): ds.query(x="c < .5") # wrong length dimension with pytest.raises(IndexError): ds.query(x="e > 100") # wrong number of dimensions + with pytest.raises(UndefinedVariableError): + ds.query(x="spam > 50") # name not present # Py.test tests From 0eec9821add87003d7968c45ec18da6429b193dd Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:15:40 +0000 Subject: [PATCH 11/15] add DataArray.query --- xarray/core/dataarray.py | 58 +++++++++++++++++++++++++++++ xarray/tests/test_dataarray.py | 67 ++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e6209b0604b..3c33afeb69d 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4356,6 +4356,64 @@ def argmax( # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = utils.UncachedAccessor(StringAccessor) + def query( + self, + queries: Mapping[Hashable, Any] = None, + parser: str = "pandas", + engine: str = None, + missing_dims: str = "raise", + **queries_kwargs: Any, + ) -> "DataArray": + """Return a new data array indexed along the specified + dimension(s), where the indexers are given as strings containing + Python expressions to be evaluated against the values in the array. 
+ + Parameters + ---------- + queries : dict, optional + A dict with keys matching dimensions and values given by strings + containing Python expressions to be evaluated against the data variables + in the dataset. The expressions will be evaluated using the pandas + eval() function, and can contain any valid Python expressions but cannot + contain any Python statements. + parser : {"pandas", "python"}, default: "pandas" + The parser to use to construct the syntax tree from the expression. + The default of 'pandas' parses code slightly different than standard + Python. Alternatively, you can parse an expression using the 'python' + parser to retain strict Python semantics. + engine: {"python", "numexpr", None}, default: None + The engine used to evaluate the expression. Supported engines are: + - None: tries to use numexpr, falls back to python + - "numexpr": evaluates expressions using numexpr + - "python": performs operations as if you had eval’d in top level python + missing_dims : {"raise", "warn", "ignore"}, default: "raise" + What to do if dimensions that should be selected from are not present in the + Dataset: + - "raise": raise an exception + - "warning": raise a warning, and ignore the missing dimensions + - "ignore": ignore the missing dimensions + **queries_kwargs : {dim: query, ...}, optional + The keyword arguments form of ``queries``. + One of queries or queries_kwargs must be provided. + + Returns + ------- + obj : DataArray + A new DataArray with the same contents as this dataset, indexed by + the results of the appropriate queries. + + See Also + -------- + DataArray.isel + Dataset.query + pandas.eval + + """ + + ds = self._to_dataset_whole(shallow_copy=True) + ds = ds.query(queries=queries, parser=parser, engine=engine, missing_dims=missing_dims, **queries_kwargs) + return ds[self.name] + # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index b28a53023ed..a905a31de63 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.core.computation.ops import UndefinedVariableError import pytest import xarray as xr @@ -40,6 +41,7 @@ requires_numbagg, requires_scipy, requires_sparse, + requires_numexpr, source_ndarray, ) @@ -4615,6 +4617,71 @@ def test_pad_reflect(self, mode, reflect_type): assert actual.shape == (7, 4, 9) assert_identical(actual, expected) + @requires_dask + @requires_numexpr + @pytest.mark.parametrize("parser", ["pandas", "python"]) + @pytest.mark.parametrize("engine", ["python", "numexpr", None]) + @pytest.mark.parametrize("backend", ["numpy", "dask"]) + def test_query(self, backend, engine, parser): + """Test querying a dataset.""" + + # setup test data + np.random.seed(42) + a = np.arange(0, 10, 1) + b = np.random.randint(0, 100, size=10) + c = np.linspace(0, 1, 20) + d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + object + ) + if backend == "numpy": + aa = DataArray(data=a, dims=["x"], name="a") + bb = DataArray(data=b, dims=["x"], name="b") + cc = DataArray(data=c, dims=["y"], name="c") + dd = DataArray(data=d, dims=["z"], name="d") + + elif backend == "dask": + import dask.array as da + aa = DataArray(data=da.from_array(a, chunks=3), dims=["x"], name="a") + bb = DataArray(data=da.from_array(b, chunks=3), dims=["x"], name="b") + cc = DataArray(data=da.from_array(c, chunks=7), 
dims=["y"], name="c") + dd = DataArray(data=da.from_array(d, chunks=12), dims=["z"], name="d") + + # query single dim, single variable + actual = aa.query(x="a > 5", engine=engine, parser=parser) + expect = aa.isel(x=(a > 5)) + assert_identical(expect, actual) + + # query single dim, single variable, via dict + actual = aa.query(dict(x="a > 5"), engine=engine, parser=parser) + expect = aa.isel(dict(x=(a > 5))) + assert_identical(expect, actual) + + # query single dim, single variable + actual = bb.query(x="b > 50", engine=engine, parser=parser) + expect = bb.isel(x=(b > 50)) + assert_identical(expect, actual) + + # query single dim, single variable + actual = cc.query(y="c < .5", engine=engine, parser=parser) + expect = cc.isel(y=(c < 0.5)) + assert_identical(expect, actual) + + # query single dim, single string variable + if parser == "pandas": + # N.B., this query currently only works with the pandas parser + # xref https://github.com/pandas-dev/pandas/issues/40436 + actual = dd.query(z='d == "bar"', engine=engine, parser=parser) + expect = dd.isel(z=(d == "bar")) + assert_identical(expect, actual) + + # test error handling + with pytest.raises(ValueError): + aa.query("a > 5") # must be dict or kwargs + with pytest.raises(ValueError): + aa.query(x=(a > 5)) # must be query string + with pytest.raises(UndefinedVariableError): + aa.query(x="spam > 50") # name not present + class TestReduce: @pytest.fixture(autouse=True) From cfe03d3093a4b716a4ed67b544d0be93e79dd993 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:17:29 +0000 Subject: [PATCH 12/15] add query to API docs --- doc/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 9add7a96109..baec11eaf98 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -138,6 +138,7 @@ Indexing Dataset.set_index Dataset.reset_index Dataset.reorder_levels + Dataset.query Missing value handling ---------------------- @@ -321,6 +322,7 @@ Indexing DataArray.set_index DataArray.reset_index DataArray.reorder_levels + DataArray.query Missing value handling ---------------------- From d78604115d08a6f7f6fbf2f2be14634e201b6612 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:21:31 +0000 Subject: [PATCH 13/15] add query to whats new --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index eed4e16eb62..a5f351ad007 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,10 @@ v0.17.1 (unreleased) New Features ~~~~~~~~~~~~ +- Add :py:meth:`Dataset.query` and :py:meth:`DataArray.query` which enable indexing + of datasets and data arrays by evaluating query expressions against the values of the + data variables (:pull:`4984`). By `Alistair Miles `_. 
+ Breaking changes ~~~~~~~~~~~~~~~~ From 48de755405fe06fd9e0f158009a2e578e245e3e0 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 14:20:16 +0000 Subject: [PATCH 14/15] fix black, mypy --- xarray/core/dataarray.py | 16 +++++++++++----- xarray/tests/test_dataarray.py | 5 +++-- xarray/tests/test_dataset.py | 4 ++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 79e89de8acd..b2fc14f4ba7 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4354,10 +4354,6 @@ def argmax( else: return self._replace_maybe_drop_dims(result) - # this needs to be at the end, or mypy will confuse with `str` - # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names - str = utils.UncachedAccessor(StringAccessor) - def query( self, queries: Mapping[Hashable, Any] = None, @@ -4413,9 +4409,19 @@ def query( """ ds = self._to_dataset_whole(shallow_copy=True) - ds = ds.query(queries=queries, parser=parser, engine=engine, missing_dims=missing_dims, **queries_kwargs) + ds = ds.query( + queries=queries, + parser=parser, + engine=engine, + missing_dims=missing_dims, + **queries_kwargs, + ) return ds[self.name] + # this needs to be at the end, or mypy will confuse with `str` + # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names + str = utils.UncachedAccessor(StringAccessor) + # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 74d4c1123d0..bfe6e497b47 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6,8 +6,8 @@ import numpy as np import pandas as pd -from pandas.core.computation.ops import UndefinedVariableError import pytest +from pandas.core.computation.ops import UndefinedVariableError from pandas.tseries.frequencies import to_offset import xarray as xr @@ -40,9 +40,9 @@ requires_dask, requires_iris, requires_numbagg, + requires_numexpr, requires_scipy, requires_sparse, - requires_numexpr, source_ndarray, ) @@ -4646,6 +4646,7 @@ def test_query(self, backend, engine, parser): elif backend == "dask": import dask.array as da + aa = DataArray(data=da.from_array(a, chunks=3), dims=["x"], name="a") bb = DataArray(data=da.from_array(b, chunks=3), dims=["x"], name="b") cc = DataArray(data=da.from_array(c, chunks=7), dims=["y"], name="c") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 00c720857df..52073ace566 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import pytest -from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.computation.ops import UndefinedVariableError +from pandas.core.indexes.datetimes import DatetimeIndex from pandas.tseries.frequencies import to_offset import xarray as xr @@ -46,9 +46,9 @@ requires_cftime, requires_dask, requires_numbagg, + requires_numexpr, requires_scipy, requires_sparse, - requires_numexpr, source_ndarray, ) From 8fcb02e6663ba24392a2ef7540a76a05fd382cc2 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 15:21:06 +0000 Subject: [PATCH 15/15] refine test parameterisation and requirements --- xarray/tests/test_dataarray.py | 10 ++++++---- xarray/tests/test_dataset.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git 
a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index bfe6e497b47..c38c3656eaf 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4622,11 +4622,13 @@ def test_pad_reflect(self, mode, reflect_type): assert actual.shape == (7, 4, 9) assert_identical(actual, expected) - @requires_dask - @requires_numexpr @pytest.mark.parametrize("parser", ["pandas", "python"]) - @pytest.mark.parametrize("engine", ["python", "numexpr", None]) - @pytest.mark.parametrize("backend", ["numpy", "dask"]) + @pytest.mark.parametrize( + "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] + ) + @pytest.mark.parametrize( + "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] + ) def test_query(self, backend, engine, parser): """Test querying a dataset.""" diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 52073ace566..52df7603034 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5809,11 +5809,13 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs - @requires_dask - @requires_numexpr @pytest.mark.parametrize("parser", ["pandas", "python"]) - @pytest.mark.parametrize("engine", ["python", "numexpr", None]) - @pytest.mark.parametrize("backend", ["numpy", "dask"]) + @pytest.mark.parametrize( + "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] + ) + @pytest.mark.parametrize( + "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] + ) def test_query(self, backend, engine, parser): """Test querying a dataset."""
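
For reference, a minimal usage sketch of the query API introduced by this series. It follows the method signatures defined in the patches above (dimension=expression keyword arguments, with the optional parser/engine/missing_dims keywords forwarded to pandas.eval); the example data itself is illustrative only, loosely mirroring the test fixtures.

import numpy as np
import xarray as xr

# A small dataset with two variables along dimension "x".
ds = xr.Dataset(
    {
        "a": ("x", np.arange(10)),
        "b": ("x", np.random.randint(0, 100, size=10)),
    }
)

# Keep only the positions along "x" where the expression over the data
# variables evaluates to True; internally this is a boolean isel() along "x".
subset = ds.query(x="(a > 5) & (b > 50)")

# Queries can also be given as a dict, with parser/engine passed through
# to pandas.eval().
subset = ds.query({"x": "a > 5"}, parser="pandas", engine=None)

# DataArray.query evaluates the expression against the array's own values,
# referenced by the array's name.
da = xr.DataArray(np.arange(10), dims="x", name="a")
da_subset = da.query(x="a > 5")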