diff --git a/.github/actions/detect-ci-trigger/action.yaml b/.github/actions/detect-ci-trigger/action.yaml deleted file mode 100644 index c255d0c57cc..00000000000 --- a/.github/actions/detect-ci-trigger/action.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: Detect CI Trigger -description: | - Detect a keyword used to control the CI in the subject line of a commit message. -inputs: - keyword: - description: | - The keyword to detect. - required: true -outputs: - trigger-found: - description: | - true if the keyword has been found in the subject line of the commit message - value: ${{ steps.detect-trigger.outputs.CI_TRIGGERED }} -runs: - using: "composite" - steps: - - name: detect trigger - id: detect-trigger - run: | - bash $GITHUB_ACTION_PATH/script.sh ${{ github.event_name }} ${{ inputs.keyword }} - shell: bash - - name: show detection result - run: | - echo "::group::final summary" - echo "commit message: ${{ steps.detect-trigger.outputs.COMMIT_MESSAGE }}" - echo "trigger keyword: ${{ inputs.keyword }}" - echo "trigger found: ${{ steps.detect-trigger.outputs.CI_TRIGGERED }}" - echo "::endgroup::" - shell: bash diff --git a/.github/actions/detect-ci-trigger/script.sh b/.github/actions/detect-ci-trigger/script.sh deleted file mode 100644 index c98175a5a08..00000000000 --- a/.github/actions/detect-ci-trigger/script.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash -event_name="$1" -keyword="$2" - -echo "::group::fetch a sufficient number of commits" -echo "skipped" -# git log -n 5 2>&1 -# if [[ "$event_name" == "pull_request" ]]; then -# ref=$(git log -1 --format='%H') -# git -c protocol.version=2 fetch --deepen=2 --no-tags --prune --progress -q origin $ref 2>&1 -# git log FETCH_HEAD -# git checkout FETCH_HEAD -# else -# echo "nothing to do." -# fi -# git log -n 5 2>&1 -echo "::endgroup::" - -echo "::group::extracting the commit message" -echo "event name: $event_name" -if [[ "$event_name" == "pull_request" ]]; then - ref="HEAD^2" -else - ref="HEAD" -fi - -commit_message="$(git log -n 1 --pretty=format:%s "$ref")" - -if [[ $(echo $commit_message | wc -l) -le 1 ]]; then - echo "commit message: '$commit_message'" -else - echo -e "commit message:\n--- start ---\n$commit_message\n--- end ---" -fi -echo "::endgroup::" - -echo "::group::scanning for the keyword" -echo "searching for: '$keyword'" -if echo "$commit_message" | grep -qF "$keyword"; then - result="true" -else - result="false" -fi -echo "keyword detected: $result" -echo "::endgroup::" - -echo "::set-output name=COMMIT_MESSAGE::$commit_message" -echo "::set-output name=CI_TRIGGERED::$result" diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 3579e18dbff..92c7226f81d 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -19,7 +19,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: ./.github/actions/detect-ci-trigger + - uses: xarray-contrib/ci-trigger@v1 id: detect-trigger with: keyword: "[skip-ci]" diff --git a/.github/workflows/ci-pre-commit-autoupdate.yaml b/.github/workflows/ci-pre-commit-autoupdate.yaml new file mode 100644 index 00000000000..784fd05bcb4 --- /dev/null +++ b/.github/workflows/ci-pre-commit-autoupdate.yaml @@ -0,0 +1,41 @@ +name: "pre-commit autoupdate CI" + +on: + schedule: + - cron: "0 0 * * 0" # every Sunday at 00:00 UTC + workflow_dispatch: + + +jobs: + autoupdate: + name: 'pre-commit autoupdate' + runs-on: ubuntu-latest + if: github.repository == 'pydata/xarray' + steps: + - name: checkout + uses: 
actions/checkout@v2 + - name: Cache pip and pre-commit + uses: actions/cache@v2 + with: + path: | + ~/.cache/pre-commit + ~/.cache/pip + key: ${{ runner.os }}-pre-commit-autoupdate + - name: setup python + uses: actions/setup-python@v2 + - name: upgrade pip + run: python -m pip install --upgrade pip + - name: install pre-commit + run: python -m pip install --upgrade pre-commit + - name: version info + run: python -m pip list + - name: autoupdate + uses: technote-space/create-pr-action@837dbe469b39f08d416889369a52e2a993625c84 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + EXECUTE_COMMANDS: | + python -m pre_commit autoupdate + COMMIT_MESSAGE: 'pre-commit: autoupdate hook versions' + PR_TITLE: 'pre-commit: autoupdate hook versions' + PR_BRANCH_PREFIX: 'pre-commit/' + PR_BRANCH_NAME: 'autoupdate-${PR_ID}' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7d7326eb5c2..e8fd881e707 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -19,7 +19,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: ./.github/actions/detect-ci-trigger + - uses: xarray-contrib/ci-trigger@v1 id: detect-trigger with: keyword: "[skip-ci]" diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index dda762878c5..bba7c04a9c2 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -21,7 +21,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: ./.github/actions/detect-ci-trigger + - uses: xarray-contrib/ci-trigger@v1 id: detect-trigger with: keyword: "[test-upstream]" diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml index 9455ef2f127..fc32d35837b 100644 --- a/ci/requirements/environment-windows.yml +++ b/ci/requirements/environment-windows.yml @@ -8,10 +8,10 @@ dependencies: # - cdms2 # Not available on Windows # - cfgrib # Causes Python interpreter crash on Windows: https://github.com/pydata/xarray/pull/3340 - cftime - - dask<2021.02.0 + - dask - distributed - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - iris diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 7261b5b6954..36147c64c03 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -3,16 +3,17 @@ channels: - conda-forge - nodefaults dependencies: + - aiobotocore - boto3 - bottleneck - cartopy - cdms2 - cfgrib - cftime - - dask<2021.02.0 + - dask - distributed - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - iris diff --git a/ci/requirements/py38-all-but-dask.yml b/ci/requirements/py38-all-but-dask.yml index 51ec48cc6b1..3f82990f3b5 100644 --- a/ci/requirements/py38-all-but-dask.yml +++ b/ci/requirements/py38-all-but-dask.yml @@ -4,6 +4,8 @@ channels: - nodefaults dependencies: - python=3.8 + - black + - aiobotocore - boto3 - bottleneck - cartopy @@ -12,7 +14,7 @@ dependencies: - cftime - coveralls - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - lxml # Optional dep of pydap diff --git a/doc/conf.py b/doc/conf.py index 14b28b4e471..def4bb0b229 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -415,7 +415,7 @@ "numpy": ("https://numpy.org/doc/stable", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), "numba": ("https://numba.pydata.org/numba-doc/latest", None), - "matplotlib": ("https://matplotlib.org", None), + "matplotlib": ("https://matplotlib.org/stable/", None), "dask": ("https://docs.dask.org/en/latest", None), "cftime": ("https://unidata.github.io/cftime", 
None), "rasterio": ("https://rasterio.readthedocs.io/en/latest", None), diff --git a/doc/examples/ERA5-GRIB-example.ipynb b/doc/examples/ERA5-GRIB-example.ipynb index b82a07a64e6..1c6be5f6634 100644 --- a/doc/examples/ERA5-GRIB-example.ipynb +++ b/doc/examples/ERA5-GRIB-example.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "GRIB format is commonly used to disemminate atmospheric model data. With Xarray and the cfgrib engine, GRIB data can easily be analyzed and visualized." + "GRIB format is commonly used to disseminate atmospheric model data. With Xarray and the cfgrib engine, GRIB data can easily be analyzed and visualized." ] }, { diff --git a/doc/examples/ROMS_ocean_model.ipynb b/doc/examples/ROMS_ocean_model.ipynb index 74536bbe28f..b699c4d5ba9 100644 --- a/doc/examples/ROMS_ocean_model.ipynb +++ b/doc/examples/ROMS_ocean_model.ipynb @@ -120,7 +120,7 @@ "source": [ "### A naive vertical slice\n", "\n", - "Create a slice using the s-coordinate as the vertical dimension is typically not very informative." + "Creating a slice using the s-coordinate as the vertical dimension is typically not very informative." ] }, { diff --git a/doc/examples/area_weighted_temperature.ipynb b/doc/examples/area_weighted_temperature.ipynb index de705966583..7299b50b1b3 100644 --- a/doc/examples/area_weighted_temperature.ipynb +++ b/doc/examples/area_weighted_temperature.ipynb @@ -20,7 +20,7 @@ "Author: [Mathias Hauser](https://github.com/mathause/)\n", "\n", "\n", - "We use the `air_temperature` example dataset to calculate the area-weighted temperature over its domain. This dataset has a regular latitude/ longitude grid, thus the gridcell area decreases towards the pole. For this grid we can use the cosine of the latitude as proxy for the grid cell area.\n" + "We use the `air_temperature` example dataset to calculate the area-weighted temperature over its domain. This dataset has a regular latitude/ longitude grid, thus the grid cell area decreases towards the pole. For this grid we can use the cosine of the latitude as proxy for the grid cell area.\n" ] }, { diff --git a/doc/examples/monthly-means.ipynb b/doc/examples/monthly-means.ipynb index bc88f4a9fc9..3490fc9a4fe 100644 --- a/doc/examples/monthly-means.ipynb +++ b/doc/examples/monthly-means.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Calculating Seasonal Averages from Timeseries of Monthly Means \n", + "Calculating Seasonal Averages from Time Series of Monthly Means \n", "=====\n", "\n", "Author: [Joe Hamman](https://github.com/jhamman/)\n", @@ -60,10 +60,10 @@ "source": [ "#### Now for the heavy lifting:\n", "We first have to come up with the weights,\n", - "- calculate the month lengths for each monthly data record\n", + "- calculate the month length for each monthly data record\n", "- calculate weights using `groupby('time.season')`\n", "\n", - "Finally, we just need to multiply our weights by the `Dataset` and sum allong the time dimension. Creating a `DataArray` for the month length is as easy as using the `days_in_month` accessor on the time coordinate. The calendar type, in this case `'noleap'`, is automatically considered in this operation." + "Finally, we just need to multiply our weights by the `Dataset` and sum along the time dimension. Creating a `DataArray` for the month length is as easy as using the `days_in_month` accessor on the time coordinate. The calendar type, in this case `'noleap'`, is automatically considered in this operation." 
] }, { diff --git a/doc/indexing.rst b/doc/indexing.rst index 78766b8fd81..14af176c428 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -395,6 +395,22 @@ These methods may also be applied to ``Dataset`` objects ds = da.to_dataset(name="bar") ds.isel(x=xr.DataArray([0, 1, 2], dims=["points"])) +Vectorized indexing may be used to extract information from the nearest +grid cells of interest, for example, the nearest climate model grid cells +to a collection specified weather station latitudes and longitudes. + +.. ipython:: python + + ds = xr.tutorial.open_dataset("air_temperature") + + # Define target latitude and longitude (where weather stations might be) + target_lon = xr.DataArray([200, 201, 202, 205], dims="points") + target_lat = xr.DataArray([31, 41, 42, 42], dims="points") + + # Retrieve data at the grid cells nearest to the target latitudes and longitudes + da = ds["air"].sel(lon=target_lon, lat=target_lat, method="nearest") + da + .. tip:: If you are lazily loading your data from disk, not every form of vectorized diff --git a/doc/io.rst b/doc/io.rst index 2e46879929b..b97f1f5a699 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -890,17 +890,44 @@ Cloud Storage Buckets It is possible to read and write xarray datasets directly from / to cloud storage buckets using zarr. This example uses the `gcsfs`_ package to provide -a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then -pass to xarray:: +an interface to `Google Cloud Storage`_. + +From v0.16.2: general `fsspec`_ URLs are parsed and the store set up for you +automatically when reading, such that you can open a dataset in a single +call. You should include any arguments to the storage backend as the +key ``storage_options``, part of ``backend_kwargs``. + +.. code:: python + + ds_gcs = xr.open_dataset( + "gcs:///path.zarr", + backend_kwargs={ + "storage_options": {"project": "", "token": None} + }, + engine="zarr", + ) + + +This also works with ``open_mfdataset``, allowing you to pass a list of paths or +a URL to be interpreted as a glob string. + +For older versions, and for writing, you must explicitly set up a ``MutableMapping`` +instance and pass this, as follows: + +.. code:: python import gcsfs - fs = gcsfs.GCSFileSystem(project='', token=None) - gcsmap = gcsfs.mapping.GCSMap('', gcs=fs, check=True, create=False) + + fs = gcsfs.GCSFileSystem(project="", token=None) + gcsmap = gcsfs.mapping.GCSMap("", gcs=fs, check=True, create=False) # write to the bucket ds.to_zarr(store=gcsmap) # read it back ds_gcs = xr.open_zarr(gcsmap) +(or use the utility function ``fsspec.get_mapper()``). + +.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ .. _Zarr: http://zarr.readthedocs.io/ .. _Amazon S3: https://aws.amazon.com/s3/ .. _Google Cloud Storage: https://cloud.google.com/storage/ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9cdfe1517c4..f5344aa266e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -74,6 +74,11 @@ New Features in the form of kwargs as well as a dict, like most similar methods. By `Maximilian Roos `_. +- :py:func:`open_dataset` and :py:func:`open_mfdataset` now accept ``fsspec`` URLs + (including globs for the latter) for ``engine="zarr"``, and so allow reading from + many remote and other file systems (:pull:`4461`) + By `Martin Durant `_ + Bug fixes ~~~~~~~~~ - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger computations anymore if :py:meth:`Dataset.weighted` or :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). 
By `Julius Busecke `_. @@ -111,6 +116,8 @@ Bug fixes By `Leif Denby `_. - Fix time encoding bug associated with using cftime versions greater than 1.4.0 with xarray (:issue:`4870`, :pull:`4871`). By `Spencer Clark `_. +- Fix decoding of vlen strings using h5py versions greater than 3.0.0 with h5netcdf backend (:issue:`4570`, :pull:`4893`). + By `Kai Mühlbauer `_. Documentation ~~~~~~~~~~~~~ diff --git a/setup.cfg b/setup.cfg index 72d28d3ca6f..231865d7788 100644 --- a/setup.cfg +++ b/setup.cfg @@ -185,6 +185,8 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-distributed.*] ignore_missing_imports = True +[mypy-fsspec.*] +ignore_missing_imports = True [mypy-h5netcdf.*] ignore_missing_imports = True [mypy-h5py.*] diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 81314588784..0791d1cdaf1 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -643,7 +643,9 @@ def open_dataarray( backend_kwargs: dict, optional A dictionary of keyword arguments to pass on to the backend. This may be useful when backend options would improve performance or - allow user control of dataset processing. + allow user control of dataset processing. If using fsspec URLs, + include the key "storage_options" to pass arguments to the + storage layer. use_cftime: bool, optional Only relevant if encoded dates come from a standard calendar (e.g. "gregorian", "proleptic_gregorian", "standard", or not @@ -869,14 +871,33 @@ def open_mfdataset( .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance """ if isinstance(paths, str): - if is_remote_uri(paths): + if is_remote_uri(paths) and engine == "zarr": + try: + from fsspec.core import get_fs_token_paths + except ImportError as e: + raise ImportError( + "The use of remote URLs for opening zarr requires the package fsspec" + ) from e + + fs, _, _ = get_fs_token_paths( + paths, + mode="rb", + storage_options=kwargs.get("backend_kwargs", {}).get( + "storage_options", {} + ), + expand=False, + ) + paths = fs.glob(fs._strip_protocol(paths)) # finds directories + paths = [fs.get_mapper(path) for path in paths] + elif is_remote_uri(paths): raise ValueError( "cannot do wild-card matching for paths that are remote URLs: " "{!r}. Instead, supply paths as an explicit list of strings.".format( paths ) ) - paths = sorted(glob(_normalize_path(paths))) + else: + paths = sorted(glob(_normalize_path(paths))) else: paths = [str(p) if isinstance(p, Path) else p for p in paths] diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index aa892c4f89c..5766b34d9bd 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -131,6 +131,7 @@ def open( autoclose=False, invalid_netcdf=None, phony_dims=None, + decode_vlen_strings=True, ): if isinstance(filename, bytes): @@ -157,6 +158,10 @@ def open( "h5netcdf backend keyword argument 'phony_dims' needs " "h5netcdf >= 0.8.0." 
) + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): + kwargs["decode_vlen_strings"] = decode_vlen_strings if lock is None: if mode == "r": @@ -358,6 +363,7 @@ def open_dataset( lock=None, invalid_netcdf=None, phony_dims=None, + decode_vlen_strings=True, ): store = H5NetCDFStore.open( @@ -367,6 +373,7 @@ def open_dataset( lock=lock, invalid_netcdf=invalid_netcdf, phony_dims=phony_dims, + decode_vlen_strings=decode_vlen_strings, ) store_entrypoint = StoreBackendEntrypoint() diff --git a/xarray/backends/plugins.py b/xarray/backends/plugins.py index b8cd2bf6378..88c29306d18 100644 --- a/xarray/backends/plugins.py +++ b/xarray/backends/plugins.py @@ -8,19 +8,19 @@ from .common import BACKEND_ENTRYPOINTS +STANDARD_BACKENDS_ORDER = ["netcdf4", "h5netcdf", "scipy"] -def remove_duplicates(backend_entrypoints): + +def remove_duplicates(pkg_entrypoints): # sort and group entrypoints by name - backend_entrypoints = sorted(backend_entrypoints, key=lambda ep: ep.name) - backend_entrypoints_grouped = itertools.groupby( - backend_entrypoints, key=lambda ep: ep.name - ) + pkg_entrypoints = sorted(pkg_entrypoints, key=lambda ep: ep.name) + pkg_entrypoints_grouped = itertools.groupby(pkg_entrypoints, key=lambda ep: ep.name) # check if there are multiple entrypoints for the same name - unique_backend_entrypoints = [] - for name, matches in backend_entrypoints_grouped: + unique_pkg_entrypoints = [] + for name, matches in pkg_entrypoints_grouped: matches = list(matches) - unique_backend_entrypoints.append(matches[0]) + unique_pkg_entrypoints.append(matches[0]) matches_len = len(matches) if matches_len > 1: selected_module_name = matches[0].module_name @@ -30,7 +30,7 @@ def remove_duplicates(backend_entrypoints): f"\n {all_module_names}.\n It will be used: {selected_module_name}.", RuntimeWarning, ) - return unique_backend_entrypoints + return unique_pkg_entrypoints def detect_parameters(open_dataset): @@ -51,13 +51,13 @@ def detect_parameters(open_dataset): return tuple(parameters_list) -def create_engines_dict(backend_entrypoints): - engines = {} - for backend_ep in backend_entrypoints: - name = backend_ep.name - backend = backend_ep.load() - engines[name] = backend - return engines +def backends_dict_from_pkg(pkg_entrypoints): + backend_entrypoints = {} + for pkg_ep in pkg_entrypoints: + name = pkg_ep.name + backend = pkg_ep.load() + backend_entrypoints[name] = backend + return backend_entrypoints def set_missing_parameters(backend_entrypoints): @@ -67,11 +67,23 @@ def set_missing_parameters(backend_entrypoints): backend.open_dataset_parameters = detect_parameters(open_dataset) -def build_engines(entrypoints): +def sort_backends(backend_entrypoints): + ordered_backends_entrypoints = {} + for be_name in STANDARD_BACKENDS_ORDER: + if backend_entrypoints.get(be_name, None) is not None: + ordered_backends_entrypoints[be_name] = backend_entrypoints.pop(be_name) + ordered_backends_entrypoints.update( + {name: backend_entrypoints[name] for name in sorted(backend_entrypoints)} + ) + return ordered_backends_entrypoints + + +def build_engines(pkg_entrypoints): backend_entrypoints = BACKEND_ENTRYPOINTS.copy() - pkg_entrypoints = remove_duplicates(entrypoints) - external_backend_entrypoints = create_engines_dict(pkg_entrypoints) + pkg_entrypoints = remove_duplicates(pkg_entrypoints) + external_backend_entrypoints = backends_dict_from_pkg(pkg_entrypoints) backend_entrypoints.update(external_backend_entrypoints) + 
backend_entrypoints = sort_backends(backend_entrypoints) set_missing_parameters(backend_entrypoints) engines = {} for name, backend in backend_entrypoints.items(): @@ -81,8 +93,8 @@ def build_engines(entrypoints): @functools.lru_cache(maxsize=1) def list_engines(): - entrypoints = pkg_resources.iter_entry_points("xarray.backends") - return build_engines(entrypoints) + pkg_entrypoints = pkg_resources.iter_entry_points("xarray.backends") + return build_engines(pkg_entrypoints) def guess_engine(store_spec): diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 04fdeac6450..074572169ce 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,5 +1,6 @@ import os import pathlib +from distutils.version import LooseVersion import numpy as np @@ -295,6 +296,7 @@ def open_group( consolidated=False, consolidate_on_close=False, chunk_store=None, + storage_options=None, append_dim=None, write_region=None, ): @@ -303,7 +305,15 @@ def open_group( if isinstance(store, pathlib.Path): store = os.fspath(store) - open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group) + open_kwargs = dict( + mode=mode, + synchronizer=synchronizer, + path=group, + ) + if LooseVersion(zarr.__version__) >= "2.5.0": + open_kwargs["storage_options"] = storage_options + elif storage_options: + raise ValueError("Storage options only compatible with zarr>=2.5.0") if chunk_store: open_kwargs["chunk_store"] = chunk_store @@ -537,6 +547,7 @@ def open_zarr( consolidated=False, overwrite_encoded_chunks=False, chunk_store=None, + storage_options=None, decode_timedelta=None, use_cftime=None, **kwargs, @@ -649,6 +660,7 @@ def open_zarr( "consolidated": consolidated, "overwrite_encoded_chunks": overwrite_encoded_chunks, "chunk_store": chunk_store, + "storage_options": storage_options, } ds = open_dataset( @@ -687,6 +699,7 @@ def open_dataset( consolidated=False, consolidate_on_close=False, chunk_store=None, + storage_options=None, ): store = ZarrStore.open_group( filename_or_obj, @@ -696,6 +709,7 @@ def open_dataset( consolidated=consolidated, consolidate_on_close=consolidate_on_close, chunk_store=chunk_store, + storage_options=storage_options, ) store_entrypoint = StoreBackendEntrypoint() diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7ca5ff50eba..34354da61e2 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1094,6 +1094,26 @@ def isel( -------- Dataset.isel DataArray.sel + + Examples + -------- + >>> da = xr.DataArray(np.arange(25).reshape(5, 5), dims=("x", "y")) + >>> da + + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24]]) + Dimensions without coordinates: x, y + + >>> tgt_x = xr.DataArray(np.arange(0, 5), dims="points") + >>> tgt_y = xr.DataArray(np.arange(0, 5), dims="points") + >>> da = da.isel(x=tgt_x, y=tgt_y) + >>> da + + array([ 0, 6, 12, 18, 24]) + Dimensions without coordinates: points """ indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "isel") @@ -1202,6 +1222,34 @@ def sel( Dataset.sel DataArray.isel + Examples + -------- + >>> da = xr.DataArray( + ... np.arange(25).reshape(5, 5), + ... coords={"x": np.arange(5), "y": np.arange(5)}, + ... dims=("x", "y"), + ... 
) + >>> da + + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24]]) + Coordinates: + * x (x) int64 0 1 2 3 4 + * y (y) int64 0 1 2 3 4 + + >>> tgt_x = xr.DataArray(np.linspace(0, 4, num=5), dims="points") + >>> tgt_y = xr.DataArray(np.linspace(0, 4, num=5), dims="points") + >>> da = da.sel(x=tgt_x, y=tgt_y, method="nearest") + >>> da + + array([ 0, 6, 12, 18, 24]) + Coordinates: + x (points) int64 0 1 2 3 4 + y (points) int64 0 1 2 3 4 + Dimensions without coordinates: points """ ds = self._to_temp_dataset().sel( indexers=indexers, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 7d51adb5244..066a2f690b0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -866,13 +866,12 @@ def __dask_postcompute__(self): import dask info = [ - (True, k, v.__dask_postcompute__()) + (k, None) + v.__dask_postcompute__() if dask.is_dask_collection(v) - else (False, k, v) + else (k, v, None, None) for k, v in self._variables.items() ] - args = ( - info, + construct_direct_args = ( self._coord_names, self._dims, self._attrs, @@ -880,19 +879,18 @@ def __dask_postcompute__(self): self._encoding, self._close, ) - return self._dask_postcompute, args + return self._dask_postcompute, (info, construct_direct_args) def __dask_postpersist__(self): import dask info = [ - (True, k, v.__dask_postpersist__()) + (k, None, v.__dask_keys__()) + v.__dask_postpersist__() if dask.is_dask_collection(v) - else (False, k, v) + else (k, v, None, None, None) for k, v in self._variables.items() ] - args = ( - info, + construct_direct_args = ( self._coord_names, self._dims, self._attrs, @@ -900,45 +898,37 @@ def __dask_postpersist__(self): self._encoding, self._close, ) - return self._dask_postpersist, args + return self._dask_postpersist, (info, construct_direct_args) @staticmethod - def _dask_postcompute(results, info, *args): + def _dask_postcompute(results, info, construct_direct_args): variables = {} - results2 = list(results[::-1]) - for is_dask, k, v in info: - if is_dask: - func, args2 = v - r = results2.pop() - result = func(r, *args2) + results_iter = iter(results) + for k, v, rebuild, rebuild_args in info: + if v is None: + variables[k] = rebuild(next(results_iter), *rebuild_args) else: - result = v - variables[k] = result + variables[k] = v - final = Dataset._construct_direct(variables, *args) + final = Dataset._construct_direct(variables, *construct_direct_args) return final @staticmethod - def _dask_postpersist(dsk, info, *args): + def _dask_postpersist(dsk, info, construct_direct_args): + from dask.optimization import cull + variables = {} # postpersist is called in both dask.optimize and dask.persist # When persisting, we want to filter out unrelated keys for # each Variable's task graph. 
- is_persist = len(dsk) == len(info) - for is_dask, k, v in info: - if is_dask: - func, args2 = v - if is_persist: - name = args2[1][0] - dsk2 = {k: v for k, v in dsk.items() if k[0] == name} - else: - dsk2 = dsk - result = func(dsk2, *args2) + for k, v, dask_keys, rebuild, rebuild_args in info: + if v is None: + dsk2, _ = cull(dsk, dask_keys) + variables[k] = rebuild(dsk2, *rebuild_args) else: - result = v - variables[k] = result + variables[k] = v - return Dataset._construct_direct(variables, *args) + return Dataset._construct_direct(variables, *construct_direct_args) def compute(self, **kwargs) -> "Dataset": """Manually trigger loading and/or computation of this dataset's data diff --git a/xarray/core/utils.py b/xarray/core/utils.py index ced688f32dd..9648458ec6d 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -645,7 +645,12 @@ def close_on_error(f): def is_remote_uri(path: str) -> bool: - return bool(re.search(r"^https?\://", path)) + """Finds URLs of the form protocol:// or protocol:: + + This also matches for http[s]://, which were the only remote URLs + supported in <=v0.16.2. + """ + return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path)) def read_magic_number(filename_or_obj, count=8): diff --git a/xarray/testing.py b/xarray/testing.py index e8b5f04ef85..2129b1e1aa4 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -27,6 +27,8 @@ def ensure_warnings(func): # -> make sure that does not happen in the assert_* functions @functools.wraps(func) def wrapper(*args, **kwargs): + __tracebackhide__ = True + with warnings.catch_warnings(): warnings.simplefilter("always") diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index a7761aefa3d..4b47e1d2c7e 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -74,6 +74,7 @@ def LooseVersion(vstring): has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis") has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") +has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_cfgrib, requires_cfgrib = _importorskip("cfgrib") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3750c0715ae..aefb91478cb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -54,6 +54,7 @@ requires_cfgrib, requires_cftime, requires_dask, + requires_fsspec, requires_h5netcdf, requires_netCDF4, requires_pseudonetcdf, @@ -2578,13 +2579,19 @@ def test_open_dataset_group(self): v = group.createVariable("x", "int") v[...] 
= 42 - h5 = h5netcdf.File(tmp_file, mode="r") + kwargs = {} + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): + kwargs = dict(decode_vlen_strings=True) + + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5["g"]) with open_dataset(store) as ds: expected = Dataset({"x": ((), 42)}) assert_identical(expected, ds) - h5 = h5netcdf.File(tmp_file, mode="r") + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5, group="g") with open_dataset(store) as ds: expected = Dataset({"x": ((), 42)}) @@ -2599,7 +2606,13 @@ def test_deepcopy(self): v = nc.createVariable("y", np.int32, ("x",)) v[:] = np.arange(10) - h5 = h5netcdf.File(tmp_file, mode="r") + kwargs = {} + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): + kwargs = dict(decode_vlen_strings=True) + + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5) with open_dataset(store) as ds: copied = ds.copy(deep=True) @@ -3040,10 +3053,17 @@ def test_open_mfdataset(self): with raises_regex(IOError, "no files to open"): open_mfdataset("foo-bar-baz-*.nc") - with raises_regex(ValueError, "wild-card"): open_mfdataset("http://some/remote/uri") + @requires_fsspec + def test_open_mfdataset_no_files(self): + pytest.importorskip("aiobotocore") + + # glob is attempted as of #4823, but finds no files + with raises_regex(OSError, "no files"): + open_mfdataset("http://some/remote/uri", engine="zarr") + def test_open_mfdataset_2d(self): original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))}) with create_tmp_file() as tmp1: @@ -4799,6 +4819,48 @@ def test_extract_zarr_variable_encoding(): ) +@requires_zarr +@requires_fsspec +def test_open_fsspec(): + import fsspec + import zarr + + if not hasattr(zarr.storage, "FSStore") or not hasattr( + zarr.storage.FSStore, "getitems" + ): + pytest.skip("zarr too old") + + ds = open_dataset(os.path.join(os.path.dirname(__file__), "data", "example_1.nc")) + + m = fsspec.filesystem("memory") + mm = m.get_mapper("out1.zarr") + ds.to_zarr(mm) # old interface + ds0 = ds.copy() + ds0["time"] = ds.time + pd.to_timedelta("1 day") + mm = m.get_mapper("out2.zarr") + ds0.to_zarr(mm) # old interface + + # single dataset + url = "memory://out2.zarr" + ds2 = open_dataset(url, engine="zarr") + assert ds0 == ds2 + + # single dataset with caching + url = "simplecache::memory://out2.zarr" + ds2 = open_dataset(url, engine="zarr") + assert ds0 == ds2 + + # multi dataset + url = "memory://out*.zarr" + ds2 = open_mfdataset(url, engine="zarr") + assert xr.concat([ds, ds0], dim="time") == ds2 + + # multi dataset with caching + url = "simplecache::memory://out*.zarr" + ds2 = open_mfdataset(url, engine="zarr") + assert xr.concat([ds, ds0], dim="time") == ds2 + + @requires_h5netcdf def test_load_single_value_h5netcdf(tmp_path): """Test that numeric single-element vector attributes are handled fine. 
diff --git a/xarray/tests/test_plugins.py b/xarray/tests/test_plugins.py index 64a1c563dba..0cda2901cee 100644 --- a/xarray/tests/test_plugins.py +++ b/xarray/tests/test_plugins.py @@ -58,13 +58,13 @@ def test_remove_duplicates_warnings(dummy_duplicated_entrypoints): @mock.patch("pkg_resources.EntryPoint.load", mock.MagicMock(return_value=None)) -def test_create_engines_dict(): +def test_backends_dict_from_pkg(): specs = [ "engine1 = xarray.tests.test_plugins:backend_1", "engine2 = xarray.tests.test_plugins:backend_2", ] entrypoints = [pkg_resources.EntryPoint.parse(spec) for spec in specs] - engines = plugins.create_engines_dict(entrypoints) + engines = plugins.backends_dict_from_pkg(entrypoints) assert len(engines) == 2 assert engines.keys() == set(("engine1", "engine2")) @@ -111,8 +111,38 @@ def test_build_engines(): "cfgrib = xarray.tests.test_plugins:backend_1" ) backend_entrypoints = plugins.build_engines([dummy_pkg_entrypoint]) + assert isinstance(backend_entrypoints["cfgrib"], DummyBackendEntrypoint1) assert backend_entrypoints["cfgrib"].open_dataset_parameters == ( "filename_or_obj", "decoder", ) + + +@mock.patch( + "pkg_resources.EntryPoint.load", + mock.MagicMock(return_value=DummyBackendEntrypoint1), +) +def test_build_engines_sorted(): + dummy_pkg_entrypoints = [ + pkg_resources.EntryPoint.parse( + "dummy2 = xarray.tests.test_plugins:backend_1", + ), + pkg_resources.EntryPoint.parse( + "dummy1 = xarray.tests.test_plugins:backend_1", + ), + ] + backend_entrypoints = plugins.build_engines(dummy_pkg_entrypoints) + backend_entrypoints = list(backend_entrypoints) + + indices = [] + for be in plugins.STANDARD_BACKENDS_ORDER: + try: + index = backend_entrypoints.index(be) + backend_entrypoints.pop(index) + indices.append(index) + except ValueError: + pass + + assert set(indices) < {0, -1} + assert list(backend_entrypoints) == sorted(backend_entrypoints)
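The headline change in this patch is that open_dataset and open_mfdataset accept fsspec URLs for engine="zarr". The short sketch below is not part of the patch itself; it is modelled on the test_open_fsspec test and the doc/io.rst text above, and assumes xarray with this change applied plus fsspec, dask (needed by open_mfdataset), and a zarr new enough to provide zarr.storage.FSStore (>= 2.5.0). The "memory://demo/part*.zarr" store names and the tiny example datasets are illustrative only.

# Sketch of the fsspec-URL support added in this patch, using an in-memory
# fsspec filesystem so it runs without any cloud credentials.
import fsspec
import numpy as np
import pandas as pd
import xarray as xr

# Build two small datasets that differ only in their time coordinate.
times = pd.date_range("2021-01-01", periods=3)
ds1 = xr.Dataset({"air": ("time", np.arange(3.0))}, coords={"time": times})
ds2 = ds1.assign_coords(time=ds1.time + pd.to_timedelta("3 days"))

# Write them to zarr stores on an in-memory fsspec filesystem using the
# older, explicit-mapper interface (still required for writing).
mem = fsspec.filesystem("memory")
ds1.to_zarr(mem.get_mapper("demo/part1.zarr"))
ds2.to_zarr(mem.get_mapper("demo/part2.zarr"))

# New in this patch: open a single store directly from an fsspec URL ...
single = xr.open_dataset("memory://demo/part1.zarr", engine="zarr")

# ... or glob over several stores with open_mfdataset; the matching datasets
# are combined along their coordinates (here, concatenated along time).
combined = xr.open_mfdataset("memory://demo/part*.zarr", engine="zarr")
print(single)
print(combined)

For a real remote store the same calls apply, with credentials passed to the storage layer as backend_kwargs={"storage_options": {...}}, as described in the doc/io.rst hunk above.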