From 8b542f8e4a703770617612788cdd3d3d5fea14dc Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 2 Mar 2021 11:03:57 +0000 Subject: [PATCH 01/15] initial work on Dataset.query --- xarray/core/dataset.py | 31 ++++++++++++++++++++ xarray/tests/test_dataset.py | 57 ++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9faf74dd4bc..2f81e952194 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6980,5 +6980,36 @@ def argmax(self, dim=None, axis=None, **kwargs): "Dataset.argmin() with a sequence or ... for dim" ) + def query( + self, + queries: Mapping[Hashable, Any] = None, + parser: str = "pandas", + engine: str = None, + missing_dims: str = "raise", + **queries_kwargs: Any, + ) -> "Dataset": + """TODO docstring""" + + # allow queries to be given either as a dict or as kwargs + queries = either_dict_or_kwargs(queries, queries_kwargs, "query") + + # check queries + for dim, expr in queries.items(): + if not isinstance(expr, str): + msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + # TODO check missing dims here, or delegate to isel? + + # evaluate the queries to create the indexers + indexers = { + dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine) + for dim, expr in queries.items() + } + + # TODO any validation of indexers? Or just let isel try to handle it? + + # apply the selection + return self.isel(indexers, missing_dims=missing_dims) + ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2118bc8b780..cc1ff2433bb 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5807,6 +5807,63 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs + def test_query_single_dim(self): + """Test querying a single dimension.""" + + # setup test data + np.random.seed(42) + a = np.arange(0, 10, 1) + b = np.random.randint(0, 100, size=10) + c = np.linspace(0, 1, 20) + d = np.arange(0, 200).reshape(10, 20) + ds = Dataset( + {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)} + ) + + # query single dim, single variable + actual = ds.query(x="a > 5") + expect = ds.isel(x=(a > 5)) + assert_identical(expect, actual) + + # query single dim, single variable, via dict + actual = ds.query(dict(x="a > 5")) + expect = ds.isel(dict(x=(a > 5))) + assert_identical(expect, actual) + + # query single dim, single variable + actual = ds.query(x="b > 50") + expect = ds.isel(x=(b > 50)) + assert_identical(expect, actual) + + # query single dim, single variable + actual = ds.query(y="c < .5") + expect = ds.isel(y=(c < 0.5)) + assert_identical(expect, actual) + + # query single dim, multiple variables + actual = ds.query(x="(a > 5) & (b > 50)") + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) + + # support pandas query parser + actual = ds.query(x="(a > 5) and (b > 50)") + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) + + # query multiple dims via kwargs + actual = ds.query(x="a > 5", y="c < .5") + expect = ds.isel(x=(a > 5), y=(c < 0.5)) + assert_identical(expect, actual) + + # query multiple dims via dict + actual = ds.query(dict(x="a > 5", y="c < .5")) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) + assert_identical(expect, actual) + + # TODO test error handling + + # 
TODO test dask data variables + # Py.test tests From a41e80539310bc8c9e7416e3e57fd02d0455e5df Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 12 Mar 2021 18:13:03 +0000 Subject: [PATCH 02/15] dataset query: test backends, engines, parsers; add docstring --- ci/requirements/environment.yml | 1 + xarray/core/dataset.py | 50 ++++++++++++++++++++++++++--- xarray/tests/test_dataset.py | 56 ++++++++++++++++++++++----------- 3 files changed, 85 insertions(+), 22 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 36147c64c03..57498fa5700 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -22,6 +22,7 @@ dependencies: - nc-time-axis - netcdf4 - numba + - numexpr - numpy - pandas - pint diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2f81e952194..d6b44559635 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6988,7 +6988,52 @@ def query( missing_dims: str = "raise", **queries_kwargs: Any, ) -> "Dataset": - """TODO docstring""" + """Return a new dataset with each array indexed along the specified + dimension(s), where the indexers are given as strings containing + Python expressions to be evaluated against the data variables in the + dataset. + + Parameters + ---------- + queries : dict, optional + A dic with keys matching dimensions and values given by strings + containing Python expressions to be evaluated against the data variables + in the dataset. The expressions will be evaluated using the pandas + eval() function, and can contain any valid Python expressions but cannot + contain any Python statements. + parser : {"pandas", "python"}, default: "pandas" + The parser to use to construct the syntax tree from the expression. + The default of 'pandas' parses code slightly different than standard + Python. Alternatively, you can parse an expression using the 'python' + parser to retain strict Python semantics. + engine: {"python", "numexpr", None}, default: None + The engine used to evaluate the expression. Supported engines are: + - None: tries to use numexpr, falls back to python + - "numexpr": evaluates expressions using numexpr + - "python": performs operations as if you had eval’d in top level python + missing_dims : {"raise", "warn", "ignore"}, default: "raise" + What to do if dimensions that should be selected from are not present in the + Dataset: + - "raise": raise an exception + - "warning": raise a warning, and ignore the missing dimensions + - "ignore": ignore the missing dimensions + **queries_kwargs : {dim: query, ...}, optional + The keyword arguments form of ``queries``. + One of queries or queries_kwargs must be provided. + + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + array and dimension is indexed by the results of the appropriate + queries. + + See Also + -------- + Dataset.isel + pandas.eval + + """ # allow queries to be given either as a dict or as kwargs queries = either_dict_or_kwargs(queries, queries_kwargs, "query") @@ -6998,7 +7043,6 @@ def query( if not isinstance(expr, str): msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given" raise ValueError(msg) - # TODO check missing dims here, or delegate to isel? # evaluate the queries to create the indexers indexers = { @@ -7006,8 +7050,6 @@ def query( for dim, expr in queries.items() } - # TODO any validation of indexers? Or just let isel try to handle it? 
- # apply the selection return self.isel(indexers, missing_dims=missing_dims) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index cc1ff2433bb..8cd8f5bfb1c 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5807,8 +5807,11 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs - def test_query_single_dim(self): - """Test querying a single dimension.""" + @pytest.mark.parametrize("parser", ["pandas", "python"]) + @pytest.mark.parametrize("engine", ["python", "numexpr", None]) + @pytest.mark.parametrize("backend", ["numpy", "dask"]) + def test_query(self, backend, engine, parser): + """Test querying a dataset.""" # setup test data np.random.seed(42) @@ -5816,53 +5819,70 @@ def test_query_single_dim(self): b = np.random.randint(0, 100, size=10) c = np.linspace(0, 1, 20) d = np.arange(0, 200).reshape(10, 20) - ds = Dataset( - {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)} - ) + if backend == "numpy": + ds = Dataset( + {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)} + ) + elif backend == "dask": + ds = Dataset( + { + "a": ("x", da.from_array(a, chunks=3)), + "b": ("x", da.from_array(b, chunks=3)), + "c": ("y", da.from_array(c, chunks=7)), + "d": (("x", "y"), da.from_array(d, chunks=(3, 7))), + } + ) # query single dim, single variable - actual = ds.query(x="a > 5") + actual = ds.query(x="a > 5", engine=engine, parser=parser) expect = ds.isel(x=(a > 5)) assert_identical(expect, actual) # query single dim, single variable, via dict - actual = ds.query(dict(x="a > 5")) + actual = ds.query(dict(x="a > 5"), engine=engine, parser=parser) expect = ds.isel(dict(x=(a > 5))) assert_identical(expect, actual) # query single dim, single variable - actual = ds.query(x="b > 50") + actual = ds.query(x="b > 50", engine=engine, parser=parser) expect = ds.isel(x=(b > 50)) assert_identical(expect, actual) # query single dim, single variable - actual = ds.query(y="c < .5") + actual = ds.query(y="c < .5", engine=engine, parser=parser) expect = ds.isel(y=(c < 0.5)) assert_identical(expect, actual) # query single dim, multiple variables - actual = ds.query(x="(a > 5) & (b > 50)") + actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser) expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) # support pandas query parser - actual = ds.query(x="(a > 5) and (b > 50)") - expect = ds.isel(x=((a > 5) & (b > 50))) - assert_identical(expect, actual) + if parser == "pandas": + actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) # query multiple dims via kwargs - actual = ds.query(x="a > 5", y="c < .5") + actual = ds.query(x="a > 5", y="c < .5", engine=engine, parser=parser) expect = ds.isel(x=(a > 5), y=(c < 0.5)) assert_identical(expect, actual) # query multiple dims via dict - actual = ds.query(dict(x="a > 5", y="c < .5")) + actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser) expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) assert_identical(expect, actual) - # TODO test error handling - - # TODO test dask data variables + # test error handling + with pytest.raises(ValueError): + ds.query("a > 5") # must be dict + with pytest.raises(IndexError): + ds.query(y="a > 5") # wrong length dimension + with pytest.raises(IndexError): + ds.query(x="c < .5") # wrong length dimension + with 
pytest.raises(IndexError): + ds.query(x="d > 100") # wrong number of dimensions # Py.test tests From 907f226aa70afed7f199133dad5a36caf83a7829 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 12 Mar 2021 18:17:49 +0000 Subject: [PATCH 03/15] add error test --- xarray/tests/test_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8cd8f5bfb1c..9f488c17712 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5876,7 +5876,9 @@ def test_query(self, backend, engine, parser): # test error handling with pytest.raises(ValueError): - ds.query("a > 5") # must be dict + ds.query("a > 5") # must be dict or kwargs + with pytest.raises(ValueError): + ds.query(x=(a > 5)) # must be query string with pytest.raises(IndexError): ds.query(y="a > 5") # wrong length dimension with pytest.raises(IndexError): From 621243778dbbc1a4ceb14cfff46c404a7afb50fc Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 12 Mar 2021 18:19:49 +0000 Subject: [PATCH 04/15] unfortunate typo --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d6b44559635..c6e846b7e14 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6996,7 +6996,7 @@ def query( Parameters ---------- queries : dict, optional - A dic with keys matching dimensions and values given by strings + A dict with keys matching dimensions and values given by strings containing Python expressions to be evaluated against the data variables in the dataset. The expressions will be evaluated using the pandas eval() function, and can contain any valid Python expressions but cannot From a5e59324181bb5c44d9a4748458316f8c5523891 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 13 Mar 2021 23:40:47 +0000 Subject: [PATCH 05/15] test three dims --- xarray/tests/test_dataset.py | 39 ++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 9f488c17712..1c3ccfd46e4 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5819,9 +5819,18 @@ def test_query(self, backend, engine, parser): b = np.random.randint(0, 100, size=10) c = np.linspace(0, 1, 20) d = np.arange(0, 200).reshape(10, 20) + e = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + object + ) if backend == "numpy": ds = Dataset( - {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)} + { + "a": ("x", a), + "b": ("x", b), + "c": ("y", c), + "d": (("x", "y"), d), + "e": ("z", e), + } ) elif backend == "dask": ds = Dataset( @@ -5830,6 +5839,7 @@ def test_query(self, backend, engine, parser): "b": ("x", da.from_array(b, chunks=3)), "c": ("y", da.from_array(c, chunks=7)), "d": (("x", "y"), da.from_array(d, chunks=(3, 7))), + "e": ("z", da.from_array(e, chunks=12)), } ) @@ -5853,12 +5863,19 @@ def test_query(self, backend, engine, parser): expect = ds.isel(y=(c < 0.5)) assert_identical(expect, actual) + # query single dim, single string variable + # N.B., this query raises NotImplemented for the Python parser, not clear why (same behaviour in pandas) + if parser == "pandas": + actual = ds.query(z='e == "foo"', engine=engine, parser=parser) + expect = ds.isel(z=(e == "foo")) + assert_identical(expect, actual) + # query single dim, multiple variables actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser) 
expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) - # support pandas query parser + # check pandas query parser if parser == "pandas": actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) expect = ds.isel(x=((a > 5) & (b > 50))) @@ -5869,11 +5886,29 @@ def test_query(self, backend, engine, parser): expect = ds.isel(x=(a > 5), y=(c < 0.5)) assert_identical(expect, actual) + # query multiple dims via kwargs + if parser == "pandas": + actual = ds.query( + x="a > 5", y="c < .5", z="e == 'foo'", engine=engine, parser=parser + ) + expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(e == "foo")) + assert_identical(expect, actual) + # query multiple dims via dict actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser) expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) assert_identical(expect, actual) + # query multiple dims via dict + if parser == "pandas": + actual = ds.query( + dict(x="a > 5", y="c < .5", z="e == 'foo'"), + engine=engine, + parser=parser, + ) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(e == "foo"))) + assert_identical(expect, actual) + # test error handling with pytest.raises(ValueError): ds.query("a > 5") # must be dict or kwargs From 372394647e38a1a900e4ceac58291510dca41e2b Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sun, 14 Mar 2021 22:41:16 +0000 Subject: [PATCH 06/15] refine tests --- xarray/tests/test_dataset.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 1c3ccfd46e4..b18fd008063 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5818,18 +5818,20 @@ def test_query(self, backend, engine, parser): a = np.arange(0, 10, 1) b = np.random.randint(0, 100, size=10) c = np.linspace(0, 1, 20) - d = np.arange(0, 200).reshape(10, 20) - e = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( object ) + e = np.arange(0, 10 * 20).reshape(10, 20) + f = np.random.normal(0, 1, size=(10, 20, 30)) if backend == "numpy": ds = Dataset( { "a": ("x", a), "b": ("x", b), "c": ("y", c), - "d": (("x", "y"), d), - "e": ("z", e), + "d": ("z", d), + "e": (("x", "y"), e), + "f": (("x", "y", "z"), f), } ) elif backend == "dask": @@ -5838,8 +5840,9 @@ def test_query(self, backend, engine, parser): "a": ("x", da.from_array(a, chunks=3)), "b": ("x", da.from_array(b, chunks=3)), "c": ("y", da.from_array(c, chunks=7)), - "d": (("x", "y"), da.from_array(d, chunks=(3, 7))), - "e": ("z", da.from_array(e, chunks=12)), + "d": ("z", da.from_array(d, chunks=12)), + "e": (("x", "y"), da.from_array(e, chunks=(3, 7))), + "f": (("x", "y", "z"), da.from_array(f, chunks=(3, 7, 12))), } ) @@ -5864,10 +5867,11 @@ def test_query(self, backend, engine, parser): assert_identical(expect, actual) # query single dim, single string variable - # N.B., this query raises NotImplemented for the Python parser, not clear why (same behaviour in pandas) if parser == "pandas": - actual = ds.query(z='e == "foo"', engine=engine, parser=parser) - expect = ds.isel(z=(e == "foo")) + # N.B., this query currently only works with the pandas parser + # xref https://github.com/pandas-dev/pandas/issues/40436 + actual = ds.query(z='d == "bar"', engine=engine, parser=parser) + expect = ds.isel(z=(d == "bar")) assert_identical(expect, actual) # query single dim, multiple variables @@ -5875,7 +5879,7 @@ def test_query(self, backend, engine, 
parser): expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) - # check pandas query parser + # check pandas query syntax is supported if parser == "pandas": actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) expect = ds.isel(x=((a > 5) & (b > 50))) @@ -5889,9 +5893,9 @@ def test_query(self, backend, engine, parser): # query multiple dims via kwargs if parser == "pandas": actual = ds.query( - x="a > 5", y="c < .5", z="e == 'foo'", engine=engine, parser=parser + x="a > 5", y="c < .5", z="d == 'bar'", engine=engine, parser=parser ) - expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(e == "foo")) + expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(d == "bar")) assert_identical(expect, actual) # query multiple dims via dict @@ -5902,11 +5906,11 @@ def test_query(self, backend, engine, parser): # query multiple dims via dict if parser == "pandas": actual = ds.query( - dict(x="a > 5", y="c < .5", z="e == 'foo'"), + dict(x="a > 5", y="c < .5", z="d == 'bar'"), engine=engine, parser=parser, ) - expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(e == "foo"))) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(d == "bar"))) assert_identical(expect, actual) # test error handling @@ -5919,7 +5923,7 @@ def test_query(self, backend, engine, parser): with pytest.raises(IndexError): ds.query(x="c < .5") # wrong length dimension with pytest.raises(IndexError): - ds.query(x="d > 100") # wrong number of dimensions + ds.query(x="e > 100") # wrong number of dimensions # Py.test tests From 2d4a74dbe818a66d1de9610b3ae3d3a3814a4d77 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 09:28:07 +0000 Subject: [PATCH 07/15] fix error message Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index c6e846b7e14..1762e17573e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7036,7 +7036,7 @@ def query( """ # allow queries to be given either as a dict or as kwargs - queries = either_dict_or_kwargs(queries, queries_kwargs, "query") + queries = either_dict_or_kwargs(queries, queries_kwargs, "queries") # check queries for dim, expr in queries.items(): From 0ba3db3c12e3e678c1525e645a8428764b0d56cf Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 09:34:22 +0000 Subject: [PATCH 08/15] add requires decorators --- xarray/tests/__init__.py | 1 + xarray/tests/test_dataset.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 4b47e1d2c7e..aebcb0f2b8d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -83,6 +83,7 @@ def LooseVersion(vstring): has_cartopy, requires_cartopy = _importorskip("cartopy") # Need Pint 0.15 for __dask_tokenize__ tests for Quantity wrapped Dask Arrays has_pint_0_15, requires_pint_0_15 = _importorskip("pint", minversion="0.15") +has_numexpr, requires_numexpr = _importorskip("numexpr") # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index b18fd008063..9a463870368 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -46,6 +46,7 @@ requires_numbagg, requires_scipy, requires_sparse, + requires_numexpr, source_ndarray, ) @@ -5807,6 +5808,8 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, 
keep_attrs=False).var1.attrs + @requires_dask + @requires_numexpr @pytest.mark.parametrize("parser", ["pandas", "python"]) @pytest.mark.parametrize("engine", ["python", "numexpr", None]) @pytest.mark.parametrize("backend", ["numpy", "dask"]) From 9dddc135edc038dd6dc948fe7545cb10ab650835 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:15:04 +0000 Subject: [PATCH 09/15] revert change, should be func name --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1762e17573e..c6e846b7e14 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7036,7 +7036,7 @@ def query( """ # allow queries to be given either as a dict or as kwargs - queries = either_dict_or_kwargs(queries, queries_kwargs, "queries") + queries = either_dict_or_kwargs(queries, queries_kwargs, "query") # check queries for dim, expr in queries.items(): From c3f322c075ff2c5125069ee506cf7d7e1a988bdb Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:15:26 +0000 Subject: [PATCH 10/15] improve Dataset.query tests --- xarray/tests/test_dataset.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 9a463870368..256f4873cc7 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -9,6 +9,7 @@ import pandas as pd import pytest from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.computation.ops import UndefinedVariableError import xarray as xr from xarray import ( @@ -5882,6 +5883,11 @@ def test_query(self, backend, engine, parser): expect = ds.isel(x=((a > 5) & (b > 50))) assert_identical(expect, actual) + # query single dim, multiple variables with computation + actual = ds.query(x="(a * b) > 250", engine=engine, parser=parser) + expect = ds.isel(x=(a * b) > 250) + assert_identical(expect, actual) + # check pandas query syntax is supported if parser == "pandas": actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) @@ -5927,6 +5933,8 @@ def test_query(self, backend, engine, parser): ds.query(x="c < .5") # wrong length dimension with pytest.raises(IndexError): ds.query(x="e > 100") # wrong number of dimensions + with pytest.raises(UndefinedVariableError): + ds.query(x="spam > 50") # name not present # Py.test tests From 0eec9821add87003d7968c45ec18da6429b193dd Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:15:40 +0000 Subject: [PATCH 11/15] add DataArray.query --- xarray/core/dataarray.py | 58 +++++++++++++++++++++++++++++ xarray/tests/test_dataarray.py | 67 ++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e6209b0604b..3c33afeb69d 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4356,6 +4356,64 @@ def argmax( # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = utils.UncachedAccessor(StringAccessor) + def query( + self, + queries: Mapping[Hashable, Any] = None, + parser: str = "pandas", + engine: str = None, + missing_dims: str = "raise", + **queries_kwargs: Any, + ) -> "DataArray": + """Return a new data array indexed along the specified + dimension(s), where the indexers are given as strings containing + Python expressions to be evaluated against the values in the array. 
+ + Parameters + ---------- + queries : dict, optional + A dict with keys matching dimensions and values given by strings + containing Python expressions to be evaluated against the data variables + in the dataset. The expressions will be evaluated using the pandas + eval() function, and can contain any valid Python expressions but cannot + contain any Python statements. + parser : {"pandas", "python"}, default: "pandas" + The parser to use to construct the syntax tree from the expression. + The default of 'pandas' parses code slightly different than standard + Python. Alternatively, you can parse an expression using the 'python' + parser to retain strict Python semantics. + engine: {"python", "numexpr", None}, default: None + The engine used to evaluate the expression. Supported engines are: + - None: tries to use numexpr, falls back to python + - "numexpr": evaluates expressions using numexpr + - "python": performs operations as if you had eval’d in top level python + missing_dims : {"raise", "warn", "ignore"}, default: "raise" + What to do if dimensions that should be selected from are not present in the + Dataset: + - "raise": raise an exception + - "warning": raise a warning, and ignore the missing dimensions + - "ignore": ignore the missing dimensions + **queries_kwargs : {dim: query, ...}, optional + The keyword arguments form of ``queries``. + One of queries or queries_kwargs must be provided. + + Returns + ------- + obj : DataArray + A new DataArray with the same contents as this dataset, indexed by + the results of the appropriate queries. + + See Also + -------- + DataArray.isel + Dataset.query + pandas.eval + + """ + + ds = self._to_dataset_whole(shallow_copy=True) + ds = ds.query(queries=queries, parser=parser, engine=engine, missing_dims=missing_dims, **queries_kwargs) + return ds[self.name] + # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index b28a53023ed..a905a31de63 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.core.computation.ops import UndefinedVariableError import pytest import xarray as xr @@ -40,6 +41,7 @@ requires_numbagg, requires_scipy, requires_sparse, + requires_numexpr, source_ndarray, ) @@ -4615,6 +4617,71 @@ def test_pad_reflect(self, mode, reflect_type): assert actual.shape == (7, 4, 9) assert_identical(actual, expected) + @requires_dask + @requires_numexpr + @pytest.mark.parametrize("parser", ["pandas", "python"]) + @pytest.mark.parametrize("engine", ["python", "numexpr", None]) + @pytest.mark.parametrize("backend", ["numpy", "dask"]) + def test_query(self, backend, engine, parser): + """Test querying a dataset.""" + + # setup test data + np.random.seed(42) + a = np.arange(0, 10, 1) + b = np.random.randint(0, 100, size=10) + c = np.linspace(0, 1, 20) + d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + object + ) + if backend == "numpy": + aa = DataArray(data=a, dims=["x"], name="a") + bb = DataArray(data=b, dims=["x"], name="b") + cc = DataArray(data=c, dims=["y"], name="c") + dd = DataArray(data=d, dims=["z"], name="d") + + elif backend == "dask": + import dask.array as da + aa = DataArray(data=da.from_array(a, chunks=3), dims=["x"], name="a") + bb = DataArray(data=da.from_array(b, chunks=3), dims=["x"], name="b") + cc = DataArray(data=da.from_array(c, chunks=7), 
dims=["y"], name="c") + dd = DataArray(data=da.from_array(d, chunks=12), dims=["z"], name="d") + + # query single dim, single variable + actual = aa.query(x="a > 5", engine=engine, parser=parser) + expect = aa.isel(x=(a > 5)) + assert_identical(expect, actual) + + # query single dim, single variable, via dict + actual = aa.query(dict(x="a > 5"), engine=engine, parser=parser) + expect = aa.isel(dict(x=(a > 5))) + assert_identical(expect, actual) + + # query single dim, single variable + actual = bb.query(x="b > 50", engine=engine, parser=parser) + expect = bb.isel(x=(b > 50)) + assert_identical(expect, actual) + + # query single dim, single variable + actual = cc.query(y="c < .5", engine=engine, parser=parser) + expect = cc.isel(y=(c < 0.5)) + assert_identical(expect, actual) + + # query single dim, single string variable + if parser == "pandas": + # N.B., this query currently only works with the pandas parser + # xref https://github.com/pandas-dev/pandas/issues/40436 + actual = dd.query(z='d == "bar"', engine=engine, parser=parser) + expect = dd.isel(z=(d == "bar")) + assert_identical(expect, actual) + + # test error handling + with pytest.raises(ValueError): + aa.query("a > 5") # must be dict or kwargs + with pytest.raises(ValueError): + aa.query(x=(a > 5)) # must be query string + with pytest.raises(UndefinedVariableError): + aa.query(x="spam > 50") # name not present + class TestReduce: @pytest.fixture(autouse=True) From cfe03d3093a4b716a4ed67b544d0be93e79dd993 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:17:29 +0000 Subject: [PATCH 12/15] add query to API docs --- doc/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 9add7a96109..baec11eaf98 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -138,6 +138,7 @@ Indexing Dataset.set_index Dataset.reset_index Dataset.reorder_levels + Dataset.query Missing value handling ---------------------- @@ -321,6 +322,7 @@ Indexing DataArray.set_index DataArray.reset_index DataArray.reorder_levels + DataArray.query Missing value handling ---------------------- From d78604115d08a6f7f6fbf2f2be14634e201b6612 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 11:21:31 +0000 Subject: [PATCH 13/15] add query to whats new --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index eed4e16eb62..a5f351ad007 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,10 @@ v0.17.1 (unreleased) New Features ~~~~~~~~~~~~ +- Add :py:meth:`Dataset.query` and :py:meth:`DataArray.query` which enable indexing + of datasets and data arrays by evaluating query expressions against the values of the + data variables (:pull:`4984`). By `Alistair Miles `_. 
+ Breaking changes ~~~~~~~~~~~~~~~~ From 48de755405fe06fd9e0f158009a2e578e245e3e0 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 14:20:16 +0000 Subject: [PATCH 14/15] fix black, mypy --- xarray/core/dataarray.py | 16 +++++++++++----- xarray/tests/test_dataarray.py | 5 +++-- xarray/tests/test_dataset.py | 4 ++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 79e89de8acd..b2fc14f4ba7 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4354,10 +4354,6 @@ def argmax( else: return self._replace_maybe_drop_dims(result) - # this needs to be at the end, or mypy will confuse with `str` - # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names - str = utils.UncachedAccessor(StringAccessor) - def query( self, queries: Mapping[Hashable, Any] = None, @@ -4413,9 +4409,19 @@ def query( """ ds = self._to_dataset_whole(shallow_copy=True) - ds = ds.query(queries=queries, parser=parser, engine=engine, missing_dims=missing_dims, **queries_kwargs) + ds = ds.query( + queries=queries, + parser=parser, + engine=engine, + missing_dims=missing_dims, + **queries_kwargs, + ) return ds[self.name] + # this needs to be at the end, or mypy will confuse with `str` + # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names + str = utils.UncachedAccessor(StringAccessor) + # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 74d4c1123d0..bfe6e497b47 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6,8 +6,8 @@ import numpy as np import pandas as pd -from pandas.core.computation.ops import UndefinedVariableError import pytest +from pandas.core.computation.ops import UndefinedVariableError from pandas.tseries.frequencies import to_offset import xarray as xr @@ -40,9 +40,9 @@ requires_dask, requires_iris, requires_numbagg, + requires_numexpr, requires_scipy, requires_sparse, - requires_numexpr, source_ndarray, ) @@ -4646,6 +4646,7 @@ def test_query(self, backend, engine, parser): elif backend == "dask": import dask.array as da + aa = DataArray(data=da.from_array(a, chunks=3), dims=["x"], name="a") bb = DataArray(data=da.from_array(b, chunks=3), dims=["x"], name="b") cc = DataArray(data=da.from_array(c, chunks=7), dims=["y"], name="c") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 00c720857df..52073ace566 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import pytest -from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.computation.ops import UndefinedVariableError +from pandas.core.indexes.datetimes import DatetimeIndex from pandas.tseries.frequencies import to_offset import xarray as xr @@ -46,9 +46,9 @@ requires_cftime, requires_dask, requires_numbagg, + requires_numexpr, requires_scipy, requires_sparse, - requires_numexpr, source_ndarray, ) From 8fcb02e6663ba24392a2ef7540a76a05fd382cc2 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 16 Mar 2021 15:21:06 +0000 Subject: [PATCH 15/15] refine test parameterisation and requirements --- xarray/tests/test_dataarray.py | 10 ++++++---- xarray/tests/test_dataset.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git 
a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index bfe6e497b47..c38c3656eaf 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4622,11 +4622,13 @@ def test_pad_reflect(self, mode, reflect_type): assert actual.shape == (7, 4, 9) assert_identical(actual, expected) - @requires_dask - @requires_numexpr @pytest.mark.parametrize("parser", ["pandas", "python"]) - @pytest.mark.parametrize("engine", ["python", "numexpr", None]) - @pytest.mark.parametrize("backend", ["numpy", "dask"]) + @pytest.mark.parametrize( + "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] + ) + @pytest.mark.parametrize( + "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] + ) def test_query(self, backend, engine, parser): """Test querying a dataset.""" diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 52073ace566..52df7603034 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5809,11 +5809,13 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs - @requires_dask - @requires_numexpr @pytest.mark.parametrize("parser", ["pandas", "python"]) - @pytest.mark.parametrize("engine", ["python", "numexpr", None]) - @pytest.mark.parametrize("backend", ["numpy", "dask"]) + @pytest.mark.parametrize( + "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] + ) + @pytest.mark.parametrize( + "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] + ) def test_query(self, backend, engine, parser): """Test querying a dataset."""
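
For reference, a minimal usage sketch of the query API introduced by this series. It follows the method signatures defined in the patches above (dimension=expression keyword arguments, with the optional parser/engine/missing_dims keywords forwarded to pandas.eval); the example data itself is illustrative only, loosely mirroring the test fixtures.

import numpy as np
import xarray as xr

# A small dataset with two variables along dimension "x".
ds = xr.Dataset(
    {
        "a": ("x", np.arange(10)),
        "b": ("x", np.random.randint(0, 100, size=10)),
    }
)

# Keep only the positions along "x" where the expression over the data
# variables evaluates to True; internally this is a boolean isel() along "x".
subset = ds.query(x="(a > 5) & (b > 50)")

# Queries can also be given as a dict, with parser/engine passed through
# to pandas.eval().
subset = ds.query({"x": "a > 5"}, parser="pandas", engine=None)

# DataArray.query evaluates the expression against the array's own values,
# referenced by the array's name.
da = xr.DataArray(np.arange(10), dims="x", name="a")
da_subset = da.query(x="a > 5")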