Commit 8ce7175
Merge remote-tracking branch 'upstream/master' into fix/4107
* upstream/master: (24 commits)
  Compatibility with dask 2021.02.0 (pydata#4884)
  Ensure maximum accuracy when encoding and decoding cftime.datetime values (pydata#4758)
  Fix `bounds_error=True` ignored with 1D interpolation (pydata#4855)
  add a drop_conflicts strategy for merging attrs (pydata#4827)
  update pre-commit hooks (mypy) (pydata#4883)
  ensure warnings cannot become errors in assert_ (pydata#4864)
  update pre-commit hooks (pydata#4874)
  small fixes for the docstrings of swap_dims and integrate (pydata#4867)
  Modify _encode_datetime_with_cftime for compatibility with cftime > 1.4.0 (pydata#4871)
  vélin (pydata#4872)
  don't skip the doctests CI (pydata#4869)
  fix da.pad example for numpy 1.20 (pydata#4865)
  temporarily pin dask (pydata#4873)
  Add units if "unit" is in the attrs. (pydata#4850)
  speed up the repr for big MultiIndex objects (pydata#4846)
  dim -> coord in DataArray.integrate (pydata#3993)
  WIP: backend interface, now it uses subclassing  (pydata#4836)
  weighted: small improvements (pydata#4818)
  Update related-projects.rst (pydata#4844)
  iris update doc url (pydata#4845)
  ...
dcherian committed Feb 12, 2021
2 parents 1a1f431 + 2a34bfb commit 8ce7175
Showing 69 changed files with 1,483 additions and 892 deletions.
8 changes: 0 additions & 8 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -5,11 +5,3 @@
 - [ ] Passes `pre-commit run --all-files`
 - [ ] User visible changes (including notable bug fixes) are documented in `whats-new.rst`
 - [ ] New functions/methods are listed in `api.rst`
-
-
-<sub>
-<h3>
-Overriding CI behaviors
-</h3>
-By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a <tt>[test-upstream]</tt> tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a <tt>[skip-ci]</tt> tag to the first line of the commit message
-</sub>
2 changes: 0 additions & 2 deletions .github/workflows/ci-additional.yaml
@@ -121,8 +121,6 @@ jobs:
   doctest:
     name: Doctests
     runs-on: "ubuntu-latest"
-    needs: detect-ci-trigger
-    if: needs.detect-ci-trigger.outputs.triggered == 'false'
     defaults:
       run:
         shell: bash -l {0}
9 changes: 7 additions & 2 deletions .pre-commit-config.yaml
@@ -17,15 +17,20 @@ repos:
     hooks:
       - id: black
   - repo: https://github.com/keewis/blackdoc
-    rev: v0.3.2
+    rev: v0.3.3
     hooks:
       - id: blackdoc
   - repo: https://gitlab.com/pycqa/flake8
     rev: 3.8.4
     hooks:
       - id: flake8
+  # - repo: https://github.com/Carreau/velin
+  #   rev: 0.0.8
+  #   hooks:
+  #     - id: velin
+  #       args: ["--write", "--compact"]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.790  # Must match ci/requirements/*.yml
+    rev: v0.800
     hooks:
       - id: mypy
         exclude: "properties|asv_bench"
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/repr.py
@@ -0,0 +1,18 @@
+import pandas as pd
+
+import xarray as xr
+
+
+class ReprMultiIndex:
+    def setup(self, key):
+        index = pd.MultiIndex.from_product(
+            [range(10000), range(10000)], names=("level_0", "level_1")
+        )
+        series = pd.Series(range(100000000), index=index)
+        self.da = xr.DataArray(series)
+
+    def time_repr(self):
+        repr(self.da)
+
+    def time_repr_html(self):
+        self.da._repr_html_()
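
This new benchmark follows airspeed velocity's conventions: ``setup`` builds the object once, and asv then times each ``time_*`` method. For orientation, a rough sketch of the operations being timed (sizes shrunk here so it runs quickly; the benchmark itself uses a 10000 x 10000 product index):

    import pandas as pd

    import xarray as xr

    # Build a DataArray backed by a pandas MultiIndex, as the benchmark does.
    index = pd.MultiIndex.from_product(
        [range(100), range(100)], names=("level_0", "level_1")
    )
    da = xr.DataArray(pd.Series(range(100 * 100), index=index))

    repr(da)  # the plain-text repr that pydata#4846 speeds up
    da._repr_html_()  # the notebook repr exercised by time_repr_html
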
15 changes: 10 additions & 5 deletions asv_bench/benchmarks/unstacking.py
@@ -7,18 +7,23 @@
 
 
 class Unstacking:
     def setup(self):
-        data = np.random.RandomState(0).randn(1, 1000, 500)
-        self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"])
+        data = np.random.RandomState(0).randn(500, 1000)
+        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
+        self.da_missing = self.da_full[:-1]
+        self.df_missing = self.da_missing.to_pandas()
 
     def time_unstack_fast(self):
-        self.ds.unstack("flat_dim")
+        self.da_full.unstack("flat_dim")
 
     def time_unstack_slow(self):
-        self.ds[:, ::-1].unstack("flat_dim")
+        self.da_missing.unstack("flat_dim")
 
+    def time_unstack_pandas_slow(self):
+        self.df_missing.unstack()
+
 
 class UnstackingDask(Unstacking):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds = self.ds.chunk({"flat_dim": 50})
+        self.da_full = self.da_full.chunk({"flat_dim": 50})
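
A note on the new setup: ``stack(flat_dim=[...])`` passes a list containing ``...`` (Ellipsis), which tells ``stack`` to fold all remaining dimensions into the new one, so here it is equivalent to ``stack(flat_dim=["a", "b"])``; ``da_missing`` then drops the last element so that unstacking has to fill a hole, the slow path that pydata#4746 speeds up. A minimal sketch of the Ellipsis form:

    import numpy as np

    import xarray as xr

    da = xr.DataArray(np.arange(6).reshape(2, 3), dims=list("ab"))
    stacked = da.stack(flat_dim=[...])  # same as stack(flat_dim=["a", "b"])
    print(stacked.sizes)  # Frozen({'flat_dim': 6})
    stacked.unstack("flat_dim")  # round-trips back to dims ("a", "b")
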
4 changes: 0 additions & 4 deletions ci/requirements/py37-min-all-deps.yml
@@ -8,7 +8,6 @@ dependencies:
   # When upgrading python, numpy, or pandas, must also change
   # doc/installing.rst and setup.py.
   - python=3.7
-  - black
   - boto3=1.9
   - bottleneck=1.2
   - cartopy=0.17
@@ -18,16 +17,13 @@
   - coveralls
   - dask=2.9
   - distributed=2.9
-  - flake8
   - h5netcdf=0.7
   - h5py=2.9 # Policy allows for 2.10, but it's a conflict-fest
   - hdf5=1.10
   - hypothesis
   - iris=2.2
-  - isort
   - lxml=4.4 # Optional dep of pydap
   - matplotlib-base=3.1
-  - mypy=0.782 # Must match .pre-commit-config.yaml
   - nc-time-axis=1.2
   - netcdf4=1.4
   - numba=0.46
4 changes: 0 additions & 4 deletions ci/requirements/py38-all-but-dask.yml
@@ -4,23 +4,19 @@ channels:
   - nodefaults
 dependencies:
   - python=3.8
-  - black
   - boto3
   - bottleneck
   - cartopy
   - cdms2
   - cfgrib
   - cftime
   - coveralls
-  - flake8
   - h5netcdf
   - h5py=2
   - hdf5
   - hypothesis
-  - isort
   - lxml # Optional dep of pydap
   - matplotlib-base
-  - mypy=0.790 # Must match .pre-commit-config.yaml
   - nc-time-axis
   - netcdf4
   - numba
2 changes: 1 addition & 1 deletion doc/conf.py
@@ -411,7 +411,7 @@
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
     "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
-    "iris": ("https://scitools.org.uk/iris/docs/latest", None),
+    "iris": ("https://scitools-iris.readthedocs.io/en/latest", None),
     "numpy": ("https://numpy.org/doc/stable", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
     "numba": ("https://numba.pydata.org/numba-doc/latest", None),
1 change: 1 addition & 0 deletions doc/contributing.rst
@@ -836,6 +836,7 @@ PR checklist
 - Write new tests if needed. See `"Test-driven development/code writing" <https://xarray.pydata.org/en/stable/contributing.html#test-driven-development-code-writing>`_.
 - Test the code using `Pytest <http://doc.pytest.org/en/latest/>`_. Running all tests (type ``pytest`` in the root directory) takes a while, so feel free to only run the tests you think are needed based on your PR (example: ``pytest xarray/tests/test_dataarray.py``). CI will catch any failing tests.
+- By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a ``[test-upstream]`` tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a ``[skip-ci]`` tag to the first line of the commit message.
 - **Properly format your code** and verify that it passes the formatting guidelines set by `Black <https://black.readthedocs.io/en/stable/>`_ and `Flake8 <http://flake8.pycqa.org/en/latest/>`_. See `"Code formatting" <https://xarray.pydata.org/en/stable/contributing.html#code-formatting>`_. You can use `pre-commit <https://pre-commit.com/>`_ to run these automatically on each commit.
2 changes: 1 addition & 1 deletion doc/faq.rst
@@ -166,7 +166,7 @@ different approaches to handling metadata: Iris strictly interprets
 `CF conventions`_. Iris particularly shines at mapping, thanks to its
 integration with Cartopy_.
 
-.. _Iris: http://scitools.org.uk/iris/
+.. _Iris: https://scitools-iris.readthedocs.io/en/stable/
 .. _Cartopy: http://scitools.org.uk/cartopy/docs/latest/
 
 `UV-CDAT`__ is another Python library that implements in-memory netCDF-like
1 change: 1 addition & 0 deletions doc/related-projects.rst
@@ -15,6 +15,7 @@ Geosciences
 - `aospy <https://aospy.readthedocs.io>`_: Automated analysis and management of gridded climate data.
 - `climpred <https://climpred.readthedocs.io>`_: Analysis of ensemble forecast models for climate prediction.
 - `geocube <https://corteva.github.io/geocube>`_: Tool to convert geopandas vector data into rasterized xarray data.
+- `GeoWombat <https://github.com/jgrss/geowombat>`_: Utilities for analysis of remotely sensed and gridded raster data at scale (easily tame Landsat, Sentinel, Quickbird, and PlanetScope).
 - `infinite-diff <https://github.com/spencerahill/infinite-diff>`_: xarray-based finite-differencing, focused on gridded climate/meteorology data
 - `marc_analysis <https://github.com/darothen/marc_analysis>`_: Analysis package for CESM/MARC experiments and output.
 - `MetPy <https://unidata.github.io/MetPy/dev/index.html>`_: A collection of tools in Python for reading, visualizing, and performing calculations with weather data.
38 changes: 36 additions & 2 deletions doc/whats-new.rst
@@ -17,7 +17,7 @@ What's New
.. _whats-new.0.16.3:

-v0.16.3 (unreleased)
+v0.17.0 (unreleased)
 --------------------

Breaking changes
@@ -42,11 +42,39 @@
 - remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:`4725`).
   By `Aureliana Barghini <https://github.com/aurghs>`_.

+Deprecations
+~~~~~~~~~~~~
+
+- ``dim`` argument to :py:meth:`DataArray.integrate` is being deprecated in
+  favour of a ``coord`` argument, for consistency with :py:meth:`Dataset.integrate`.
+  For now using ``dim`` issues a ``FutureWarning``. It will be removed in
+  version 0.19.0 (:pull:`3993`).
+  By `Tom Nicholas <https://github.com/TomNicholas>`_.
+
+
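To illustrate the deprecation above, a sketch of the old and new spellings (toy data; the warning behavior is as described in the entry):

    import numpy as np

    import xarray as xr

    da = xr.DataArray(np.arange(4.0), dims="x", coords={"x": [0.0, 0.1, 1.1, 1.2]})

    da.integrate("x")  # positional usage is unchanged
    da.integrate(coord="x")  # new keyword, consistent with Dataset.integrate
    da.integrate(dim="x")  # still works for now, but issues a FutureWarning
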
New Features
~~~~~~~~~~~~
+- Xarray now leverages updates as of cftime version 1.4.1, which enable exact I/O
+  roundtripping of ``cftime.datetime`` objects (:pull:`4758`).
+  By `Spencer Clark <https://github.com/spencerkclark>`_.
+- :py:meth:`~xarray.cftime_range` and :py:meth:`DataArray.resample` now support
+  millisecond (``"L"`` or ``"ms"``) and microsecond (``"U"`` or ``"us"``) frequencies
+  for ``cftime.datetime`` coordinates (:issue:`4097`, :pull:`4758`).
+  By `Spencer Clark <https://github.com/spencerkclark>`_.
+- Significantly higher ``unstack`` performance on numpy-backed arrays which
+  contain missing values; 8x faster in our benchmark, and 2x faster than pandas
+  (:pull:`4746`).
+  By `Maximilian Roos <https://github.com/max-sixty>`_.

 - Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
   By `Deepak Cherian <https://github.com/dcherian>`_.
- add ``"drop_conflicts"`` to the strategies supported by the ``combine_attrs`` kwarg
(:issue:`4749`, :pull:`4827`).
By `Justus Magin <https://github.com/keewis>`_.
By `Deepak Cherian <https://github.com/dcherian>`_.
 - :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims
   in the form of kwargs as well as a dict, like most similar methods.
   By `Maximilian Roos <https://github.com/max-sixty>`_.
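
For the cftime entries above, a sketch of the newly supported sub-second frequencies (requires cftime >= 1.4.1 as noted; the calendar choice is illustrative):

    import xarray as xr

    # Millisecond ("L"/"ms") and microsecond ("U"/"us") frequencies now work
    # with cftime.datetime coordinates.
    times = xr.cftime_range("2000-01-01", periods=4, freq="500L", calendar="noleap")
    print(times)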

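Likewise, a sketch of the two smaller additions listed above, ``combine_attrs="drop_conflicts"`` and kwarg-style ``swap_dims`` (toy data):

    import xarray as xr

    a = xr.Dataset(attrs={"units": "m", "source": "model"})
    b = xr.Dataset(attrs={"units": "km", "source": "model"})
    merged = xr.merge([a, b], combine_attrs="drop_conflicts")
    print(merged.attrs)  # {'source': 'model'}; the conflicting 'units' is dropped

    ds = xr.Dataset({"y": ("x", [1, 2, 3])}, coords={"z": ("x", list("abc"))})
    ds.swap_dims({"x": "z"})  # dict form, as before
    ds.swap_dims(x="z")  # new: kwargs form
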
Bug fixes
~~~~~~~~~
@@ -83,6 +111,10 @@ Bug fixes
 - Convert to IndexVariable or Variable during renaming as appropriate. (:issue:`4107`, :issue:`4417`, :pull:`4108`)
   By `Deepak Cherian <https://github.com/dcherian>`_.
 - Add :py:meth:`Dataset.drop_isel` and :py:meth:`DataArray.drop_isel` (:issue:`4658`, :pull:`4819`). By `Daniel Mesejo <https://github.com/mesejo>`_.
+- Ensure that :py:meth:`Dataset.interp` raises ``ValueError`` when interpolating outside coordinate range and ``bounds_error=True`` (:issue:`4854`, :pull:`4855`).
+  By `Leif Denby <https://github.com/leifdenby>`_.
+- Fix time encoding bug associated with using cftime versions greater than
+  1.4.0 with xarray (:issue:`4870`, :pull:`4871`). By `Spencer Clark <https://github.com/spencerkclark>`_.

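Two of the bug-fix entries above are easiest to see in code. A sketch (toy data; requires scipy for the interpolation, and exact error messages may differ):

    import numpy as np

    import xarray as xr

    ds = xr.Dataset({"y": ("x", np.arange(4.0))}, coords={"x": np.arange(4.0)})

    # drop_isel: positional-index counterpart to drop_sel (pydata#4819).
    ds.drop_isel(x=[0, 1])  # drops the first two points along x

    # interp: bounds_error=True used to be silently ignored for 1D
    # interpolation; it now raises as requested (pydata#4855).
    try:
        ds.interp(x=[3.5], kwargs={"bounds_error": True})
    except ValueError as err:
        print("out-of-range interpolation rejected:", err)
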
Documentation
~~~~~~~~~~~~~
@@ -113,6 +145,8 @@ Internal Changes
   in ipython (:issue:`4741`, :pull:`4742`). By `Richard Kleijn <https://github.com/rhkleijn>`_.
 - Added the ``set_close`` method to ``Dataset`` and ``DataArray`` for backends to specify how to voluntarily release
   all resources. (:pull:`4809`). By `Alessandro Amici <https://github.com/alexamici>`_.
+- Ensure warnings cannot be turned into exceptions in :py:func:`testing.assert_equal` and
+  the other ``assert_*`` functions (:pull:`4864`). By `Mathias Hauser <https://github.com/mathause>`_.

.. _whats-new.0.16.2:

Expand All @@ -129,7 +163,7 @@ Deprecations

 - :py:attr:`~core.accessor_dt.DatetimeAccessor.weekofyear` and :py:attr:`~core.accessor_dt.DatetimeAccessor.week`
   have been deprecated. Use ``DataArray.dt.isocalendar().week``
-  instead (:pull:`4534`). By `Mathias Hauser <https://github.com/mathause>`_,
+  instead (:pull:`4534`). By `Mathias Hauser <https://github.com/mathause>`_.
   `Maximilian Roos <https://github.com/max-sixty>`_, and `Spencer Clark <https://github.com/spencerkclark>`_.
 - :py:attr:`DataArray.rolling` and :py:attr:`Dataset.rolling` no longer support passing ``keep_attrs``
   via its constructor. Pass ``keep_attrs`` via the applied function, i.e. use
2 changes: 2 additions & 0 deletions setup.cfg
@@ -164,6 +164,8 @@ force_to_top = true
 default_section = THIRDPARTY
 known_first_party = xarray
 
+[mypy]
+
 # Most of the numerical computing stack doesn't have type annotations yet.
 [mypy-affine.*]
 ignore_missing_imports = True
106 changes: 51 additions & 55 deletions xarray/backends/cfgrib_.py
@@ -12,7 +12,7 @@
     BackendEntrypoint,
 )
 from .locks import SerializableLock, ensure_lock
-from .store import open_backend_dataset_store
+from .store import StoreBackendEntrypoint
 
 try:
     import cfgrib
@@ -86,62 +86,58 @@ def get_encoding(self):
         return encoding
 
 
-def guess_can_open_cfgrib(store_spec):
-    try:
-        _, ext = os.path.splitext(store_spec)
-    except TypeError:
-        return False
-    return ext in {".grib", ".grib2", ".grb", ".grb2"}
-
-
-def open_backend_dataset_cfgrib(
-    filename_or_obj,
-    *,
-    mask_and_scale=True,
-    decode_times=None,
-    concat_characters=None,
-    decode_coords=None,
-    drop_variables=None,
-    use_cftime=None,
-    decode_timedelta=None,
-    lock=None,
-    indexpath="{path}.{short_hash}.idx",
-    filter_by_keys={},
-    read_keys=[],
-    encode_cf=("parameter", "time", "geography", "vertical"),
-    squeeze=True,
-    time_dims=("time", "step"),
-):
-
-    store = CfGribDataStore(
-        filename_or_obj,
-        indexpath=indexpath,
-        filter_by_keys=filter_by_keys,
-        read_keys=read_keys,
-        encode_cf=encode_cf,
-        squeeze=squeeze,
-        time_dims=time_dims,
-        lock=lock,
-    )
-
-    with close_on_error(store):
-        ds = open_backend_dataset_store(
-            store,
-            mask_and_scale=mask_and_scale,
-            decode_times=decode_times,
-            concat_characters=concat_characters,
-            decode_coords=decode_coords,
-            drop_variables=drop_variables,
-            use_cftime=use_cftime,
-            decode_timedelta=decode_timedelta,
-        )
-    return ds
-
-
-cfgrib_backend = BackendEntrypoint(
-    open_dataset=open_backend_dataset_cfgrib, guess_can_open=guess_can_open_cfgrib
-)
+class CfgribfBackendEntrypoint(BackendEntrypoint):
+    def guess_can_open(self, store_spec):
+        try:
+            _, ext = os.path.splitext(store_spec)
+        except TypeError:
+            return False
+        return ext in {".grib", ".grib2", ".grb", ".grb2"}
+
+    def open_dataset(
+        self,
+        filename_or_obj,
+        *,
+        mask_and_scale=True,
+        decode_times=None,
+        concat_characters=None,
+        decode_coords=None,
+        drop_variables=None,
+        use_cftime=None,
+        decode_timedelta=None,
+        lock=None,
+        indexpath="{path}.{short_hash}.idx",
+        filter_by_keys={},
+        read_keys=[],
+        encode_cf=("parameter", "time", "geography", "vertical"),
+        squeeze=True,
+        time_dims=("time", "step"),
+    ):
+
+        store = CfGribDataStore(
+            filename_or_obj,
+            indexpath=indexpath,
+            filter_by_keys=filter_by_keys,
+            read_keys=read_keys,
+            encode_cf=encode_cf,
+            squeeze=squeeze,
+            time_dims=time_dims,
+            lock=lock,
+        )
+
+        store_entrypoint = StoreBackendEntrypoint()
+        with close_on_error(store):
+            ds = store_entrypoint.open_dataset(
+                store,
+                mask_and_scale=mask_and_scale,
+                decode_times=decode_times,
+                concat_characters=concat_characters,
+                decode_coords=decode_coords,
+                drop_variables=drop_variables,
+                use_cftime=use_cftime,
+                decode_timedelta=decode_timedelta,
+            )
+        return ds
 
 
 if has_cfgrib:
-    BACKEND_ENTRYPOINTS["cfgrib"] = cfgrib_backend
+    BACKEND_ENTRYPOINTS["cfgrib"] = CfgribfBackendEntrypoint
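
The cfgrib change above is one piece of pydata#4836, which moves backends from module-level functions registered on a ``BackendEntrypoint`` instance to subclasses of ``BackendEntrypoint``. The interface is still marked WIP in that PR, so the following is an illustrative sketch only; apart from ``BackendEntrypoint``, ``BACKEND_ENTRYPOINTS``, and the two method names visible in this diff, every name here (including the import location and the ".myfmt" format) is hypothetical:

    from xarray.backends.common import BACKEND_ENTRYPOINTS, BackendEntrypoint


    class MyFormatBackendEntrypoint(BackendEntrypoint):
        def guess_can_open(self, store_spec):
            # Claim only paths with a made-up ".myfmt" extension.
            try:
                return str(store_spec).endswith(".myfmt")
            except TypeError:
                return False

        def open_dataset(self, filename_or_obj, *, drop_variables=None):
            # A real backend builds and returns an xarray.Dataset here,
            # typically by wrapping a data store as cfgrib does above.
            raise NotImplementedError


    BACKEND_ENTRYPOINTS["myfmt"] = MyFormatBackendEntrypoint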