diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 1a2b1d11747..527bdcdede2 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -9,17 +9,40 @@ .. autosummary:: :toctree: generated/ + Coordinates.from_pandas_multiindex + Coordinates.get + Coordinates.items + Coordinates.keys + Coordinates.values + Coordinates.dims + Coordinates.dtypes + Coordinates.variables + Coordinates.xindexes + Coordinates.indexes + Coordinates.to_dataset + Coordinates.to_index + Coordinates.update + Coordinates.merge + Coordinates.copy + Coordinates.equals + Coordinates.identical + core.coordinates.DatasetCoordinates.get core.coordinates.DatasetCoordinates.items core.coordinates.DatasetCoordinates.keys - core.coordinates.DatasetCoordinates.merge - core.coordinates.DatasetCoordinates.to_dataset - core.coordinates.DatasetCoordinates.to_index - core.coordinates.DatasetCoordinates.update core.coordinates.DatasetCoordinates.values core.coordinates.DatasetCoordinates.dims - core.coordinates.DatasetCoordinates.indexes + core.coordinates.DatasetCoordinates.dtypes core.coordinates.DatasetCoordinates.variables + core.coordinates.DatasetCoordinates.xindexes + core.coordinates.DatasetCoordinates.indexes + core.coordinates.DatasetCoordinates.to_dataset + core.coordinates.DatasetCoordinates.to_index + core.coordinates.DatasetCoordinates.update + core.coordinates.DatasetCoordinates.merge + core.coordinates.DataArrayCoordinates.copy + core.coordinates.DatasetCoordinates.equals + core.coordinates.DatasetCoordinates.identical core.rolling.DatasetCoarsen.boundary core.rolling.DatasetCoarsen.coord_func @@ -47,14 +70,19 @@ core.coordinates.DataArrayCoordinates.get core.coordinates.DataArrayCoordinates.items core.coordinates.DataArrayCoordinates.keys - core.coordinates.DataArrayCoordinates.merge - core.coordinates.DataArrayCoordinates.to_dataset - core.coordinates.DataArrayCoordinates.to_index - core.coordinates.DataArrayCoordinates.update core.coordinates.DataArrayCoordinates.values core.coordinates.DataArrayCoordinates.dims - core.coordinates.DataArrayCoordinates.indexes + core.coordinates.DataArrayCoordinates.dtypes core.coordinates.DataArrayCoordinates.variables + core.coordinates.DataArrayCoordinates.xindexes + core.coordinates.DataArrayCoordinates.indexes + core.coordinates.DataArrayCoordinates.to_dataset + core.coordinates.DataArrayCoordinates.to_index + core.coordinates.DataArrayCoordinates.update + core.coordinates.DataArrayCoordinates.merge + core.coordinates.DataArrayCoordinates.copy + core.coordinates.DataArrayCoordinates.equals + core.coordinates.DataArrayCoordinates.identical core.rolling.DataArrayCoarsen.boundary core.rolling.DataArrayCoarsen.coord_func diff --git a/doc/api.rst b/doc/api.rst index 9bac1c40af8..0cf07f91df8 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1085,6 +1085,7 @@ Advanced API .. autosummary:: :toctree: generated/ + Coordinates Dataset.variables DataArray.variable Variable diff --git a/doc/user-guide/terminology.rst b/doc/user-guide/terminology.rst index 24e6ab69927..b313eff653f 100644 --- a/doc/user-guide/terminology.rst +++ b/doc/user-guide/terminology.rst @@ -54,23 +54,22 @@ complete examples, please consult the relevant documentation.* Coordinate An array that labels a dimension or set of dimensions of another ``DataArray``. In the usual one-dimensional case, the coordinate array's - values can loosely be thought of as tick labels along a dimension. There - are two types of coordinate arrays: *dimension coordinates* and - *non-dimension coordinates* (see below). A coordinate named ``x`` can be - retrieved from ``arr.coords[x]``. A ``DataArray`` can have more - coordinates than dimensions because a single dimension can be labeled by - multiple coordinate arrays. However, only one coordinate array can be a - assigned as a particular dimension's dimension coordinate array. As a + values can loosely be thought of as tick labels along a dimension. We + distinguish :term:`Dimension coordinate` vs. :term:`Non-dimension + coordinate` and :term:`Indexed coordinate` vs. :term:`Non-indexed + coordinate`. A coordinate named ``x`` can be retrieved from + ``arr.coords[x]``. A ``DataArray`` can have more coordinates than + dimensions because a single dimension can be labeled by multiple + coordinate arrays. However, only one coordinate array can be a assigned + as a particular dimension's dimension coordinate array. As a consequence, ``len(arr.dims) <= len(arr.coords)`` in general. Dimension coordinate A one-dimensional coordinate array assigned to ``arr`` with both a name - and dimension name in ``arr.dims``. Dimension coordinates are used for - label-based indexing and alignment, like the index found on a - :py:class:`pandas.DataFrame` or :py:class:`pandas.Series`. In fact, - dimension coordinates use :py:class:`pandas.Index` objects under the - hood for efficient computation. Dimension coordinates are marked by - ``*`` when printing a ``DataArray`` or ``Dataset``. + and dimension name in ``arr.dims``. Usually (but not always), a + dimension coordinate is also an :term:`Indexed coordinate` so that it can + be used for label-based indexing and alignment, like the index found on + a :py:class:`pandas.DataFrame` or :py:class:`pandas.Series`. Non-dimension coordinate A coordinate array assigned to ``arr`` with a name in ``arr.coords`` but @@ -79,20 +78,40 @@ complete examples, please consult the relevant documentation.* example, multidimensional coordinates are often used in geoscience datasets when :doc:`the data's physical coordinates (such as latitude and longitude) differ from their logical coordinates - <../examples/multidimensional-coords>`. However, non-dimension coordinates - are not indexed, and any operation on non-dimension coordinates that - leverages indexing will fail. Printing ``arr.coords`` will print all of - ``arr``'s coordinate names, with the corresponding dimension(s) in - parentheses. For example, ``coord_name (dim_name) 1 2 3 ...``. + <../examples/multidimensional-coords>`. Printing ``arr.coords`` will + print all of ``arr``'s coordinate names, with the corresponding + dimension(s) in parentheses. For example, ``coord_name (dim_name) 1 2 3 + ...``. + + Indexed coordinate + A coordinate which has an associated :term:`Index`. Generally this means + that the coordinate labels can be used for indexing (selection) and/or + alignment. An indexed coordinate may have one or more arbitrary + dimensions although in most cases it is also a :term:`Dimension + coordinate`. It may or may not be grouped with other indexed coordinates + depending on whether they share the same index. Indexed coordinates are + marked by ``*`` when printing a ``DataArray`` or ``Dataset``. + + Non-indexed coordinate + A coordinate which has no associated :term:`Index`. It may still + represent fixed labels along one or more dimensions but it cannot be + used for label-based indexing and alignment. Index - An *index* is a data structure optimized for efficient selecting and - slicing of an associated array. Xarray creates indexes for dimension - coordinates so that operations along dimensions are fast, while - non-dimension coordinates are not indexed. Under the hood, indexes are - implemented as :py:class:`pandas.Index` objects. The index associated - with dimension name ``x`` can be retrieved by ``arr.indexes[x]``. By - construction, ``len(arr.dims) == len(arr.indexes)`` + An *index* is a data structure optimized for efficient data selection + and alignment within a discrete or continuous space that is defined by + coordinate labels (unless it is a functional index). By default, Xarray + creates a :py:class:`~xarray.indexes.PandasIndex` object (i.e., a + :py:class:`pandas.Index` wrapper) for each :term:`Dimension coordinate`. + For more advanced use cases (e.g., staggered or irregular grids, + geospatial indexes), Xarray also accepts any instance of a specialized + :py:class:`~xarray.indexes.Index` subclass that is associated to one or + more arbitrary coordinates. The index associated with the coordinate + ``x`` can be retrieved by ``arr.xindexes[x]`` (or ``arr.indexes["x"]`` + if the index is convertible to a :py:class:`pandas.Index` object). If + two coordinates ``x`` and ``y`` share the same index, + ``arr.xindexes[x]`` and ``arr.xindexes[y]`` both return the same + :py:class:`~xarray.indexes.Index` object. name The names of dimensions, coordinates, DataArray objects and data diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9e1dc20b0db..c7f891d52ae 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,20 @@ v2023.07.1 (unreleased) New Features ~~~~~~~~~~~~ +- :py:class:`Coordinates` can now be constructed independently of any Dataset or + DataArray (it is also returned by the :py:attr:`Dataset.coords` and + :py:attr:`DataArray.coords` properties). ``Coordinates`` objects are useful for + passing both coordinate variables and indexes to new Dataset / DataArray objects, + e.g., via their constructor or via :py:meth:`Dataset.assign_coords`. We may also + wrap coordinate variables in a ``Coordinates`` object in order to skip + the automatic creation of (pandas) indexes for dimension coordinates. + The :py:class:`Coordinates.from_pandas_multiindex` constructor may be used to + create coordinates directly from a :py:class:`pandas.MultiIndex` object (it is + preferred over passing it directly as coordinate data, which may be deprecated soon). + Like Dataset and DataArray objects, ``Coordinates`` objects may now be used in + :py:func:`align` and :py:func:`merge`. + (:issue:`6392`, :pull:`7368`). + By `BenoƮt Bovy `_. - Visually group together coordinates with the same indexes in the index section of the text repr (:pull:`7225`). By `Justus Magin `_. - Allow creating Xarray objects where a multidimensional variable shares its name diff --git a/xarray/__init__.py b/xarray/__init__.py index 87b897cf1ea..830bc254a71 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -26,6 +26,7 @@ where, ) from xarray.core.concat import concat +from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.extensions import ( @@ -37,7 +38,7 @@ from xarray.core.merge import Context, MergeError, merge from xarray.core.options import get_options, set_options from xarray.core.parallel import map_blocks -from xarray.core.variable import Coordinate, IndexVariable, Variable, as_variable +from xarray.core.variable import IndexVariable, Variable, as_variable from xarray.util.print_versions import show_versions try: @@ -100,6 +101,7 @@ "CFTimeIndex", "Context", "Coordinate", + "Coordinates", "DataArray", "Dataset", "Index", diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index edebccc2534..39ff878b56d 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -5,13 +5,12 @@ from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping from contextlib import suppress -from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar, cast +from typing import TYPE_CHECKING, Any, Callable, Generic, cast import numpy as np import pandas as pd from xarray.core import dtypes -from xarray.core.common import DataWithCoords from xarray.core.indexes import ( Index, Indexes, @@ -20,15 +19,14 @@ indexes_all_equal, safe_cast_to_index, ) +from xarray.core.types import T_Alignable from xarray.core.utils import is_dict_like, is_full_slice from xarray.core.variable import Variable, as_compatible_data, calculate_dimensions if TYPE_CHECKING: from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset - from xarray.core.types import JoinOptions, T_DataArray, T_Dataset, T_DataWithCoords - -DataAlignable = TypeVar("DataAlignable", bound=DataWithCoords) + from xarray.core.types import JoinOptions, T_DataArray, T_Dataset def reindex_variables( @@ -92,7 +90,7 @@ def reindex_variables( NormalizedIndexVars = dict[MatchingIndexKey, dict[Hashable, Variable]] -class Aligner(Generic[DataAlignable]): +class Aligner(Generic[T_Alignable]): """Implements all the complex logic for the re-indexing and alignment of Xarray objects. @@ -105,8 +103,8 @@ class Aligner(Generic[DataAlignable]): """ - objects: tuple[DataAlignable, ...] - results: tuple[DataAlignable, ...] + objects: tuple[T_Alignable, ...] + results: tuple[T_Alignable, ...] objects_matching_indexes: tuple[dict[MatchingIndexKey, Index], ...] join: str exclude_dims: frozenset[Hashable] @@ -127,7 +125,7 @@ class Aligner(Generic[DataAlignable]): def __init__( self, - objects: Iterable[DataAlignable], + objects: Iterable[T_Alignable], join: str = "inner", indexes: Mapping[Any, Any] | None = None, exclude_dims: Iterable = frozenset(), @@ -510,7 +508,7 @@ def _get_dim_pos_indexers( def _get_indexes_and_vars( self, - obj: DataAlignable, + obj: T_Alignable, matching_indexes: dict[MatchingIndexKey, Index], ) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]: new_indexes = {} @@ -533,13 +531,13 @@ def _get_indexes_and_vars( def _reindex_one( self, - obj: DataAlignable, + obj: T_Alignable, matching_indexes: dict[MatchingIndexKey, Index], - ) -> DataAlignable: + ) -> T_Alignable: new_indexes, new_variables = self._get_indexes_and_vars(obj, matching_indexes) dim_pos_indexers = self._get_dim_pos_indexers(matching_indexes) - new_obj = obj._reindex_callback( + return obj._reindex_callback( self, dim_pos_indexers, new_variables, @@ -548,8 +546,6 @@ def _reindex_one( self.exclude_dims, self.exclude_vars, ) - new_obj.encoding = obj.encoding - return new_obj def reindex_all(self) -> None: self.results = tuple( @@ -581,13 +577,13 @@ def align(self) -> None: def align( - *objects: DataAlignable, + *objects: T_Alignable, join: JoinOptions = "inner", copy: bool = True, indexes=None, exclude=frozenset(), fill_value=dtypes.NA, -) -> tuple[DataAlignable, ...]: +) -> tuple[T_Alignable, ...]: """ Given any number of Dataset and/or DataArray objects, returns new objects with aligned indexes and dimension sizes. @@ -801,6 +797,7 @@ def deep_align( This function is not public API. """ + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset @@ -808,7 +805,7 @@ def deep_align( indexes = {} def is_alignable(obj): - return isinstance(obj, (DataArray, Dataset)) + return isinstance(obj, (Coordinates, DataArray, Dataset)) positions = [] keys = [] @@ -866,7 +863,7 @@ def is_alignable(obj): def reindex( - obj: DataAlignable, + obj: T_Alignable, indexers: Mapping[Any, Any], method: str | None = None, tolerance: int | float | Iterable[int | float] | None = None, @@ -874,7 +871,7 @@ def reindex( fill_value: Any = dtypes.NA, sparse: bool = False, exclude_vars: Iterable[Hashable] = frozenset(), -) -> DataAlignable: +) -> T_Alignable: """Re-index either a Dataset or a DataArray. Not public API. @@ -905,13 +902,13 @@ def reindex( def reindex_like( - obj: DataAlignable, + obj: T_Alignable, other: Dataset | DataArray, method: str | None = None, tolerance: int | float | Iterable[int | float] | None = None, copy: bool = True, fill_value: Any = dtypes.NA, -) -> DataAlignable: +) -> T_Alignable: """Re-index either a Dataset or a DataArray like another Dataset/DataArray. Not public API. @@ -953,8 +950,8 @@ def _get_broadcast_dims_map_common_coords(args, exclude): def _broadcast_helper( - arg: T_DataWithCoords, exclude, dims_map, common_coords -) -> T_DataWithCoords: + arg: T_Alignable, exclude, dims_map, common_coords +) -> T_Alignable: from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset @@ -984,16 +981,16 @@ def _broadcast_dataset(ds: T_Dataset) -> T_Dataset: # remove casts once https://github.com/python/mypy/issues/12800 is resolved if isinstance(arg, DataArray): - return cast("T_DataWithCoords", _broadcast_array(arg)) + return cast(T_Alignable, _broadcast_array(arg)) elif isinstance(arg, Dataset): - return cast("T_DataWithCoords", _broadcast_dataset(arg)) + return cast(T_Alignable, _broadcast_dataset(arg)) else: raise ValueError("all input must be Dataset or DataArray objects") # TODO: this typing is too restrictive since it cannot deal with mixed # DataArray and Dataset types...? Is this a problem? -def broadcast(*args: T_DataWithCoords, exclude=None) -> tuple[T_DataWithCoords, ...]: +def broadcast(*args: T_Alignable, exclude=None) -> tuple[T_Alignable, ...]: """Explicitly broadcast any number of DataArray or Dataset objects against one another. diff --git a/xarray/core/common.py b/xarray/core/common.py index d54c259ae2c..16c5140161d 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -14,7 +14,6 @@ from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.parallelcompat import get_chunked_array_type, guess_chunkmanager -from xarray.core.pdcompat import _convert_base_to_offset from xarray.core.pycompat import is_chunked_array from xarray.core.utils import ( Frozen, @@ -607,9 +606,17 @@ def assign_coords( Dataset.swap_dims Dataset.set_coords """ + from xarray.core.coordinates import Coordinates + coords_combined = either_dict_or_kwargs(coords, coords_kwargs, "assign_coords") data = self.copy(deep=False) - results: dict[Hashable, Any] = self._calc_assign_results(coords_combined) + + results: Coordinates | dict[Hashable, Any] + if isinstance(coords, Coordinates): + results = coords + else: + results = self._calc_assign_results(coords_combined) + data.coords.update(results) return data @@ -950,6 +957,7 @@ def _resample( from xarray.core.dataarray import DataArray from xarray.core.groupby import ResolvedTimeResampleGrouper, TimeResampleGrouper + from xarray.core.pdcompat import _convert_base_to_offset from xarray.core.resample import RESAMPLE_DIM if keep_attrs is not None: diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 32809a54ddd..9ae1024b374 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -3,49 +3,53 @@ import warnings from collections.abc import Hashable, Iterator, Mapping, Sequence from contextlib import contextmanager -from typing import TYPE_CHECKING, Any +from typing import ( + TYPE_CHECKING, + Any, + Generic, + cast, +) import numpy as np import pandas as pd from xarray.core import formatting +from xarray.core.alignment import Aligner from xarray.core.indexes import ( Index, Indexes, PandasMultiIndex, assert_no_index_corrupted, + create_default_index_implicit, ) from xarray.core.merge import merge_coordinates_without_align, merge_coords +from xarray.core.types import Self, T_DataArray from xarray.core.utils import Frozen, ReprObject -from xarray.core.variable import Variable, calculate_dimensions +from xarray.core.variable import Variable, as_variable, calculate_dimensions if TYPE_CHECKING: from xarray.core.common import DataWithCoords from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset - from xarray.core.types import T_DataArray # Used as the key corresponding to a DataArray's variable when converting # arbitrary DataArray objects to datasets _THIS_ARRAY = ReprObject("") -class Coordinates(Mapping[Hashable, "T_DataArray"]): +class AbstractCoordinates(Mapping[Hashable, "T_DataArray"]): _data: DataWithCoords __slots__ = ("_data",) def __getitem__(self, key: Hashable) -> T_DataArray: raise NotImplementedError() - def __setitem__(self, key: Hashable, value: Any) -> None: - self.update({key: value}) - @property def _names(self) -> set[Hashable]: raise NotImplementedError() @property - def dims(self) -> Mapping[Hashable, int] | tuple[Hashable, ...]: + def dims(self) -> Frozen[Hashable, int] | tuple[Hashable, ...]: raise NotImplementedError() @property @@ -54,10 +58,22 @@ def dtypes(self) -> Frozen[Hashable, np.dtype]: @property def indexes(self) -> Indexes[pd.Index]: + """Mapping of pandas.Index objects used for label based indexing. + + Raises an error if this Coordinates object has indexes that cannot + be coerced to pandas.Index objects. + + See Also + -------- + Coordinates.xindexes + """ return self._data.indexes @property def xindexes(self) -> Indexes[Index]: + """Mapping of :py:class:`~xarray.indexes.Index` objects + used for label based indexing. + """ return self._data.xindexes @property @@ -163,13 +179,209 @@ def to_index(self, ordered_dims: Sequence[Hashable] | None = None) -> pd.Index: return pd.MultiIndex(level_list, code_list, names=names) - def update(self, other: Mapping[Any, Any]) -> None: - other_vars = getattr(other, "variables", other) - self._maybe_drop_multiindex_coords(set(other_vars)) - coords, indexes = merge_coords( - [self.variables, other_vars], priority_arg=1, indexes=self.xindexes + +class Coordinates(AbstractCoordinates): + """Dictionary like container for Xarray coordinates (variables + indexes). + + This collection is a mapping of coordinate names to + :py:class:`~xarray.DataArray` objects. + + It can be passed directly to the :py:class:`~xarray.Dataset` and + :py:class:`~xarray.DataArray` constructors via their `coords` argument. This + will add both the coordinates variables and their index. + + Coordinates are either: + + - returned via the :py:attr:`Dataset.coords` and :py:attr:`DataArray.coords` + properties. + - built from index objects (e.g., :py:meth:`Coordinates.from_pandas_multiindex`). + - built directly from coordinate data and index objects (beware that no consistency + check is done on those inputs). + + In the latter case, no default (pandas) index is created. + + Parameters + ---------- + coords: dict-like + Mapping where keys are coordinate names and values are objects that + can be converted into a :py:class:`~xarray.Variable` object + (see :py:func:`~xarray.as_variable`). + indexes: dict-like + Mapping of where keys are coordinate names and values are + :py:class:`~xarray.indexes.Index` objects. + + """ + + _data: DataWithCoords + + __slots__ = ("_data",) + + def __init__( + self, + coords: Mapping[Any, Any] | None = None, + indexes: Mapping[Any, Index] | None = None, + ) -> None: + # When coordinates are constructed directly, an internal Dataset is + # created so that it is compatible with the DatasetCoordinates and + # DataArrayCoordinates classes serving as a proxy for the data. + # TODO: refactor DataArray / Dataset so that Coordinates store the data. + from xarray.core.dataset import Dataset + + if coords is None: + variables = {} + elif isinstance(coords, Coordinates): + variables = {k: v.copy() for k, v in coords.variables.items()} + else: + variables = {k: as_variable(v) for k, v in coords.items()} + + if indexes is None: + indexes = {} + else: + indexes = dict(indexes) + + no_coord_index = set(indexes) - set(variables) + if no_coord_index: + raise ValueError( + f"no coordinate variables found for these indexes: {no_coord_index}" + ) + + for k, idx in indexes.items(): + if not isinstance(idx, Index): + raise TypeError(f"'{k}' is not an `xarray.indexes.Index` object") + + # maybe convert to base variable + for k, v in variables.items(): + if k not in indexes: + variables[k] = v.to_base_variable() + + self._data = Dataset._construct_direct( + coord_names=set(variables), variables=variables, indexes=indexes ) - self._update_coords(coords, indexes) + + @classmethod + def _construct_direct( + cls, + coords: dict[Any, Variable], + indexes: dict[Any, Index], + dims: dict[Any, int] | None = None, + ) -> Self: + from xarray.core.dataset import Dataset + + obj = object.__new__(cls) + obj._data = Dataset._construct_direct( + coord_names=set(coords), + variables=coords, + indexes=indexes, + dims=dims, + ) + return obj + + @classmethod + def from_pandas_multiindex(cls, midx: pd.MultiIndex, dim: str) -> Self: + """Wrap a pandas multi-index as Xarray coordinates (dimension + levels). + + The returned coordinates can be directly assigned to a + :py:class:`~xarray.Dataset` or :py:class:`~xarray.DataArray` via the + ``coords`` argument of their constructor. + + Parameters + ---------- + midx : :py:class:`pandas.MultiIndex` + Pandas multi-index object. + dim : str + Dimension name. + + Returns + ------- + coords : Coordinates + A collection of Xarray indexed coordinates created from the multi-index. + + """ + xr_idx = PandasMultiIndex(midx, dim) + + variables = xr_idx.create_variables() + indexes = {k: xr_idx for k in variables} + + return cls(coords=variables, indexes=indexes) + + @property + def _names(self) -> set[Hashable]: + return self._data._coord_names + + @property + def dims(self) -> Frozen[Hashable, int] | tuple[Hashable, ...]: + """Mapping from dimension names to lengths or tuple of dimension names.""" + return self._data.dims + + @property + def sizes(self) -> Frozen[Hashable, int]: + """Mapping from dimension names to lengths.""" + return self._data.sizes + + @property + def dtypes(self) -> Frozen[Hashable, np.dtype]: + """Mapping from coordinate names to dtypes. + + Cannot be modified directly. + + See Also + -------- + Dataset.dtypes + """ + return Frozen({n: v.dtype for n, v in self._data.variables.items()}) + + @property + def variables(self) -> Mapping[Hashable, Variable]: + """Low level interface to Coordinates contents as dict of Variable objects. + + This dictionary is frozen to prevent mutation. + """ + return self._data.variables + + def to_dataset(self) -> Dataset: + """Convert these coordinates into a new Dataset.""" + names = [name for name in self._data._variables if name in self._names] + return self._data._copy_listed(names) + + def __getitem__(self, key: Hashable) -> DataArray: + return self._data[key] + + def __delitem__(self, key: Hashable) -> None: + # redirect to DatasetCoordinates.__delitem__ + del self._data.coords[key] + + def equals(self, other: Coordinates) -> bool: + """Two Coordinates objects are equal if they have matching variables, + all of which are equal. + + See Also + -------- + Coordinates.identical + """ + if not isinstance(other, Coordinates): + return False + return self.to_dataset().equals(other.to_dataset()) + + def identical(self, other: Coordinates) -> bool: + """Like equals, but also checks all variable attributes. + + See Also + -------- + Coordinates.equals + """ + if not isinstance(other, Coordinates): + return False + return self.to_dataset().identical(other.to_dataset()) + + def _update_coords( + self, coords: dict[Hashable, Variable], indexes: Mapping[Any, Index] + ) -> None: + # redirect to DatasetCoordinates._update_coords + self._data.coords._update_coords(coords, indexes) + + def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None: + # redirect to DatasetCoordinates._maybe_drop_multiindex_coords + self._data.coords._maybe_drop_multiindex_coords(coords) def _merge_raw(self, other, reflexive): """For use with binary arithmetic.""" @@ -200,7 +412,7 @@ def _merge_inplace(self, other): yield self._update_coords(variables, indexes) - def merge(self, other: Coordinates | None) -> Dataset: + def merge(self, other: Mapping[Any, Any] | None) -> Dataset: """Merge two sets of coordinates to create a new Dataset The method implements the logic used for joining coordinates in the @@ -214,8 +426,9 @@ def merge(self, other: Coordinates | None) -> Dataset: Parameters ---------- - other : DatasetCoordinates or DataArrayCoordinates - The coordinates from another dataset or data array. + other : dict-like, optional + A :py:class:`Coordinates` object or any mapping that can be turned + into coordinates. Returns ------- @@ -236,13 +449,92 @@ def merge(self, other: Coordinates | None) -> Dataset: variables=coords, coord_names=coord_names, indexes=indexes ) + def __setitem__(self, key: Hashable, value: Any) -> None: + self.update({key: value}) + + def update(self, other: Mapping[Any, Any]) -> None: + """Update this Coordinates variables with other coordinate variables.""" + other_obj: Coordinates | Mapping[Hashable, Variable] + + if isinstance(other, Coordinates): + # special case: default indexes won't be created + other_obj = other + else: + other_obj = getattr(other, "variables", other) + + self._maybe_drop_multiindex_coords(set(other_obj)) + + coords, indexes = merge_coords( + [self.variables, other_obj], + priority_arg=1, + indexes=self.xindexes, + ) + + self._update_coords(coords, indexes) + + def _overwrite_indexes( + self, + indexes: Mapping[Any, Index], + variables: Mapping[Any, Variable] | None = None, + ) -> Self: + results = self.to_dataset()._overwrite_indexes(indexes, variables) + + # TODO: remove cast once we get rid of DatasetCoordinates + # and DataArrayCoordinates (i.e., Dataset and DataArray encapsulate Coordinates) + return cast(Self, results.coords) + + def _reindex_callback( + self, + aligner: Aligner, + dim_pos_indexers: dict[Hashable, Any], + variables: dict[Hashable, Variable], + indexes: dict[Hashable, Index], + fill_value: Any, + exclude_dims: frozenset[Hashable], + exclude_vars: frozenset[Hashable], + ) -> Self: + """Callback called from ``Aligner`` to create a new reindexed Coordinate.""" + aligned = self.to_dataset()._reindex_callback( + aligner, + dim_pos_indexers, + variables, + indexes, + fill_value, + exclude_dims, + exclude_vars, + ) + + # TODO: remove cast once we get rid of DatasetCoordinates + # and DataArrayCoordinates (i.e., Dataset and DataArray encapsulate Coordinates) + return cast(Self, aligned.coords) + + def _ipython_key_completions_(self): + """Provide method for the key-autocompletions in IPython.""" + return self._data._ipython_key_completions_() + + def copy( + self, + deep: bool = False, + memo: dict[int, Any] | None = None, + ) -> Coordinates: + """Return a copy of this Coordinates object.""" + # do not copy indexes (may corrupt multi-coordinate indexes) + # TODO: disable variables deepcopy? it may also be problematic when they + # encapsulate index objects like pd.Index + variables = { + k: v._copy(deep=deep, memo=memo) for k, v in self.variables.items() + } + return Coordinates._construct_direct( + coords=variables, indexes=dict(self.xindexes), dims=dict(self.sizes) + ) + class DatasetCoordinates(Coordinates): - """Dictionary like container for Dataset coordinates. + """Dictionary like container for Dataset coordinates (variables + indexes). - Essentially an immutable dictionary with keys given by the array's - dimensions and the values given by the corresponding xarray.Coordinate - objects. + This collection can be passed directly to the :py:class:`~xarray.Dataset` + and :py:class:`~xarray.DataArray` constructors via their `coords` argument. + This will add both the coordinates variables and their index. """ _data: Dataset @@ -257,7 +549,7 @@ def _names(self) -> set[Hashable]: return self._data._coord_names @property - def dims(self) -> Mapping[Hashable, int]: + def dims(self) -> Frozen[Hashable, int]: return self._data.dims @property @@ -343,11 +635,12 @@ def _ipython_key_completions_(self): ] -class DataArrayCoordinates(Coordinates["T_DataArray"]): - """Dictionary like container for DataArray coordinates. +class DataArrayCoordinates(Coordinates, Generic[T_DataArray]): + """Dictionary like container for DataArray coordinates (variables + indexes). - Essentially a dict with keys given by the array's - dimensions and the values given by corresponding DataArray objects. + This collection can be passed directly to the :py:class:`~xarray.Dataset` + and :py:class:`~xarray.DataArray` constructors via their `coords` argument. + This will add both the coordinates variables and their index. """ _data: T_DataArray @@ -477,3 +770,77 @@ def assert_coordinate_consistent( f"dimension coordinate {k!r} conflicts between " f"indexed and indexing objects:\n{obj[k]}\nvs.\n{coords[k]}" ) + + +def create_coords_with_default_indexes( + coords: Mapping[Any, Any], data_vars: Mapping[Any, Variable] | None = None +) -> Coordinates: + """Maybe create default indexes from a mapping of coordinates.""" + + # Note: data_vars are needed here only because a pd.MultiIndex object + # can be promoted as coordinates. + # TODO: It won't be relevant anymore when this behavior will be dropped + # in favor of the more explicit ``Coordinates.from_pandas_multiindex()``. + + from xarray.core.dataarray import DataArray + + all_variables = dict(coords) + if data_vars is not None: + all_variables.update(data_vars) + + indexes: dict[Hashable, Index] = {} + variables: dict[Hashable, Variable] = {} + + maybe_index_vars: dict[Hashable, Variable] = {} + mindex_data_vars: list[Hashable] = [] + + for k, v in all_variables.items(): + if k in coords: + maybe_index_vars[k] = v + elif isinstance(v, pd.MultiIndex): + # TODO: eventually stop promoting multi-index passed via data variables + mindex_data_vars.append(k) + maybe_index_vars[k] = v + + if mindex_data_vars: + warnings.warn( + f"passing one or more `pandas.MultiIndex` via data variable(s) {mindex_data_vars} " + "will no longer create indexed coordinates in the future. " + "If you want to keep this behavior, pass it as coordinates instead.", + FutureWarning, + ) + + maybe_index_vars = { + k: v + for k, v in all_variables.items() + if k in coords or isinstance(v, pd.MultiIndex) + } + + dataarray_coords: list[DataArrayCoordinates] = [] + + for name, obj in maybe_index_vars.items(): + if isinstance(obj, DataArray): + dataarray_coords.append(obj.coords) + + variable = as_variable(obj, name=name) + + if variable.dims == (name,): + idx, idx_vars = create_default_index_implicit(variable, all_variables) + indexes.update({k: idx for k in idx_vars}) + variables.update(idx_vars) + all_variables.update(idx_vars) + else: + variables[name] = variable + + new_coords = Coordinates._construct_direct(coords=variables, indexes=indexes) + + # extract and merge coordinates and indexes from input DataArrays + if dataarray_coords: + prioritized = {k: (v, indexes.get(k, None)) for k, v in variables.items()} + variables, indexes = merge_coordinates_without_align( + dataarray_coords + [new_coords], + prioritized=prioritized, + ) + new_coords = Coordinates._construct_direct(coords=variables, indexes=indexes) + + return new_coords diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index bbaf79e23ba..b29c62aba80 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -23,7 +23,12 @@ from xarray.core.arithmetic import DataArrayArithmetic from xarray.core.common import AbstractArray, DataWithCoords, get_chunksizes from xarray.core.computation import unify_chunks -from xarray.core.coordinates import DataArrayCoordinates, assert_coordinate_consistent +from xarray.core.coordinates import ( + Coordinates, + DataArrayCoordinates, + assert_coordinate_consistent, + create_coords_with_default_indexes, +) from xarray.core.dataset import Dataset from xarray.core.formatting import format_item from xarray.core.indexes import ( @@ -34,7 +39,7 @@ isel_indexes, ) from xarray.core.indexing import is_fancy_indexer, map_index_queries -from xarray.core.merge import PANDAS_TYPES, MergeError, _create_indexes_from_coords +from xarray.core.merge import PANDAS_TYPES, MergeError from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.utils import ( Default, @@ -104,9 +109,35 @@ T_XarrayOther = TypeVar("T_XarrayOther", bound=Union["DataArray", Dataset]) +def _check_coords_dims(shape, coords, dims): + sizes = dict(zip(dims, shape)) + for k, v in coords.items(): + if any(d not in dims for d in v.dims): + raise ValueError( + f"coordinate {k} has dimensions {v.dims}, but these " + "are not a subset of the DataArray " + f"dimensions {dims}" + ) + + for d, s in zip(v.dims, v.shape): + if s != sizes[d]: + raise ValueError( + f"conflicting sizes for dimension {d!r}: " + f"length {sizes[d]} on the data but length {s} on " + f"coordinate {k!r}" + ) + + if k in sizes and v.shape != (sizes[k],): + raise ValueError( + f"coordinate {k!r} is a DataArray dimension, but " + f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} " + "matching the dimension size" + ) + + def _infer_coords_and_dims( shape, coords, dims -) -> tuple[dict[Hashable, Variable], tuple[Hashable, ...]]: +) -> tuple[Mapping[Hashable, Any], tuple[Hashable, ...]]: """All the logic for creating a new DataArray""" if ( @@ -144,40 +175,22 @@ def _infer_coords_and_dims( if not isinstance(d, str): raise TypeError(f"dimension {d} is not a string") - new_coords: dict[Hashable, Variable] = {} - - if utils.is_dict_like(coords): - for k, v in coords.items(): - new_coords[k] = as_variable(v, name=k) - elif coords is not None: - for dim, coord in zip(dims, coords): - var = as_variable(coord, name=dim) - var.dims = (dim,) - new_coords[dim] = var.to_index_variable() + new_coords: Mapping[Hashable, Any] - sizes = dict(zip(dims, shape)) - for k, v in new_coords.items(): - if any(d not in dims for d in v.dims): - raise ValueError( - f"coordinate {k} has dimensions {v.dims}, but these " - "are not a subset of the DataArray " - f"dimensions {dims}" - ) - - for d, s in zip(v.dims, v.shape): - if s != sizes[d]: - raise ValueError( - f"conflicting sizes for dimension {d!r}: " - f"length {sizes[d]} on the data but length {s} on " - f"coordinate {k!r}" - ) + if isinstance(coords, Coordinates): + new_coords = coords + else: + new_coords = {} + if utils.is_dict_like(coords): + for k, v in coords.items(): + new_coords[k] = as_variable(v, name=k) + elif coords is not None: + for dim, coord in zip(dims, coords): + var = as_variable(coord, name=dim) + var.dims = (dim,) + new_coords[dim] = var.to_index_variable() - if k in sizes and v.shape != (sizes[k],): - raise ValueError( - f"coordinate {k!r} is a DataArray dimension, but " - f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} " - "matching the dimension size" - ) + _check_coords_dims(shape, new_coords, dims) return new_coords, dims @@ -266,7 +279,7 @@ class DataArray( or pandas object, attempts are made to use this array's metadata to fill in other unspecified arguments. A view of the array's data is used instead of a copy if possible. - coords : sequence or dict of array_like, optional + coords : sequence or dict of array_like or :py:class:`~xarray.Coordinates`, optional Coordinates (tick labels) to use for indexing along each dimension. The following notations are accepted: @@ -286,6 +299,10 @@ class DataArray( - mapping {coord name: (dimension name, array-like)} - mapping {coord name: (tuple of dimension names, array-like)} + Alternatively, a :py:class:`~xarray.Coordinates` object may be used in + order to explicitly pass indexes (e.g., a multi-index or any custom + Xarray index) or to bypass the creation of a default index for any + :term:`Dimension coordinate` included in that object. dims : Hashable or sequence of Hashable, optional Name(s) of the data dimension(s). Must be either a Hashable (only for 1D data) or a sequence of Hashables with length equal @@ -297,6 +314,11 @@ class DataArray( attrs : dict_like or None, optional Attributes to assign to the new instance. By default, an empty attribute dictionary is initialized. + indexes : py:class:`~xarray.Indexes` or dict-like, optional + For internal use only. For passing indexes objects to the + new DataArray, use the ``coords`` argument instead with a + :py:class:`~xarray.Coordinate` object (both coordinate variables + and indexes will be extracted from the latter). Examples -------- @@ -386,7 +408,7 @@ def __init__( name: Hashable | None = None, attrs: Mapping | None = None, # internal parameters - indexes: dict[Hashable, Index] | None = None, + indexes: Mapping[Any, Index] | None = None, fastpath: bool = False, ) -> None: if fastpath: @@ -395,10 +417,11 @@ def __init__( assert attrs is None assert indexes is not None else: - # TODO: (benbovy - explicit indexes) remove - # once it becomes part of the public interface if indexes is not None: - raise ValueError("Providing explicit indexes is not supported yet") + raise ValueError( + "Explicitly passing indexes via the `indexes` argument is not supported " + "when `fastpath=False`. Use the `coords` argument instead." + ) # try to fill in arguments from data if they weren't supplied if coords is None: @@ -422,17 +445,18 @@ def __init__( data = as_compatible_data(data) coords, dims = _infer_coords_and_dims(data.shape, coords, dims) variable = Variable(dims, data, attrs, fastpath=True) - indexes, coords = _create_indexes_from_coords(coords) + + if not isinstance(coords, Coordinates): + coords = create_coords_with_default_indexes(coords) + indexes = dict(coords.xindexes) + coords = {k: v.copy() for k, v in coords.variables.items()} # These fully describe a DataArray self._variable = variable assert isinstance(coords, dict) self._coords = coords self._name = name - - # TODO(shoyer): document this argument, once it becomes part of the - # public interface. - self._indexes = indexes + self._indexes = indexes # type: ignore[assignment] self._close = None @@ -500,7 +524,7 @@ def _replace_maybe_drop_dims( def _overwrite_indexes( self: T_DataArray, indexes: Mapping[Any, Index], - coords: Mapping[Any, Variable] | None = None, + variables: Mapping[Any, Variable] | None = None, drop_coords: list[Hashable] | None = None, rename_dims: Mapping[Any, Any] | None = None, ) -> T_DataArray: @@ -508,8 +532,8 @@ def _overwrite_indexes( if not indexes: return self - if coords is None: - coords = {} + if variables is None: + variables = {} if drop_coords is None: drop_coords = [] @@ -518,7 +542,7 @@ def _overwrite_indexes( new_indexes = dict(self._indexes) for name in indexes: - new_coords[name] = coords[name] + new_coords[name] = variables[name] new_indexes[name] = indexes[name] for name in drop_coords: @@ -906,12 +930,20 @@ def indexes(self) -> Indexes: @property def xindexes(self) -> Indexes: - """Mapping of xarray Index objects used for label based indexing.""" + """Mapping of :py:class:`~xarray.indexes.Index` objects + used for label based indexing. + """ return Indexes(self._indexes, {k: self._coords[k] for k in self._indexes}) @property def coords(self) -> DataArrayCoordinates: - """Dictionary-like container of coordinate arrays.""" + """Mapping of :py:class:`~xarray.DataArray` objects corresponding to + coordinate variables. + + See Also + -------- + Coordinates + """ return DataArrayCoordinates(self) @overload @@ -1803,7 +1835,11 @@ def _reindex_callback( exclude_dims, exclude_vars, ) - return self._from_temp_dataset(reindexed) + + da = self._from_temp_dataset(reindexed) + da.encoding = self.encoding + + return da def reindex_like( self: T_DataArray, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 7bd92ea32a0..2ee9c1d17b7 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -50,7 +50,12 @@ get_chunksizes, ) from xarray.core.computation import unify_chunks -from xarray.core.coordinates import DatasetCoordinates, assert_coordinate_consistent +from xarray.core.coordinates import ( + Coordinates, + DatasetCoordinates, + assert_coordinate_consistent, + create_coords_with_default_indexes, +) from xarray.core.daskmanager import DaskManager from xarray.core.duck_array_ops import datetime_to_numeric from xarray.core.indexes import ( @@ -70,7 +75,7 @@ dataset_merge_method, dataset_update_method, merge_coordinates_without_align, - merge_data_and_coords, + merge_core, ) from xarray.core.missing import get_clean_interp_index from xarray.core.options import OPTIONS, _get_keep_attrs @@ -113,7 +118,6 @@ from xarray.backends import AbstractDataStore, ZarrStore from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes - from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.groupby import DatasetGroupBy from xarray.core.merge import CoercibleMapping @@ -400,6 +404,26 @@ def _initialize_feasible(lb, ub): return param_defaults, bounds_defaults +def merge_data_and_coords(data_vars, coords): + """Used in Dataset.__init__.""" + if isinstance(coords, Coordinates): + coords = coords.copy() + else: + coords = create_coords_with_default_indexes(coords, data_vars) + + # exclude coords from alignment (all variables in a Coordinates object should + # already be aligned together) and use coordinates' indexes to align data_vars + return merge_core( + [data_vars, coords], + compat="broadcast_equals", + join="outer", + explicit_coords=tuple(coords), + indexes=coords.xindexes, + priority_arg=1, + skip_align_args=[1], + ) + + class DataVariables(Mapping[Any, "DataArray"]): __slots__ = ("_dataset",) @@ -491,8 +515,11 @@ class Dataset( Dataset implements the mapping interface with keys given by variable names and values given by DataArray objects for each variable name. - One dimensional variables with name equal to their dimension are - index coordinates used for label based indexing. + By default, pandas indexes are created for one dimensional variables with + name equal to their dimension (i.e., :term:`Dimension coordinate`) so those + variables can be readily used as coordinates for label based indexing. When a + :py:class:`~xarray.Coordinates` object is passed to ``coords``, any existing + index(es) built from those coordinates will be added to the Dataset. To load data from a file or file-like object, use the `open_dataset` function. @@ -513,22 +540,21 @@ class Dataset( - mapping {var name: (dimension name, array-like)} - mapping {var name: (tuple of dimension names, array-like)} - mapping {dimension name: array-like} - (it will be automatically moved to coords, see below) + (if array-like is not a scalar it will be automatically moved to coords, + see below) Each dimension must have the same length in all variables in which it appears. - coords : dict-like, optional - Another mapping in similar form as the `data_vars` argument, - except the each item is saved on the dataset as a "coordinate". + coords : :py:class:`~xarray.Coordinates` or dict-like, optional + A :py:class:`~xarray.Coordinates` object or another mapping in + similar form as the `data_vars` argument, except that each item + is saved on the dataset as a "coordinate". These variables have an associated meaning: they describe constant/fixed/independent quantities, unlike the varying/measured/dependent quantities that belong in - `variables`. Coordinates values may be given by 1-dimensional - arrays or scalars, in which case `dims` do not need to be - supplied: 1D arrays will be assumed to give index values along - the dimension with the same name. + `variables`. - The following notations are accepted: + The following notations are accepted for arbitrary mappings: - mapping {coord name: DataArray} - mapping {coord name: Variable} @@ -538,8 +564,16 @@ class Dataset( (the dimension name is implicitly set to be the same as the coord name) - The last notation implies that the coord name is the same as - the dimension name. + The last notation implies either that the coordinate value is a scalar + or that it is a 1-dimensional array and the coord name is the same as + the dimension name (i.e., a :term:`Dimension coordinate`). In the latter + case, the 1-dimensional array will be assumed to give index values + along the dimension with the same name. + + Alternatively, a :py:class:`~xarray.Coordinates` object may be used in + order to explicitly pass indexes (e.g., a multi-index or any custom + Xarray index) or to bypass the creation of a default index for any + :term:`Dimension coordinate` included in that object. attrs : dict-like, optional Global attributes to save on this dataset. @@ -602,6 +636,7 @@ class Dataset( precipitation float64 8.326 Attributes: description: Weather related data. + """ _attrs: dict[Hashable, Any] | None @@ -633,8 +668,6 @@ def __init__( coords: Mapping[Any, Any] | None = None, attrs: Mapping[Any, Any] | None = None, ) -> None: - # TODO(shoyer): expose indexes as a public argument in __init__ - if data_vars is None: data_vars = {} if coords is None: @@ -650,7 +683,7 @@ def __init__( coords = coords._variables variables, coord_names, dims, indexes, _ = merge_data_and_coords( - data_vars, coords, compat="broadcast_equals" + data_vars, coords ) self._attrs = dict(attrs) if attrs is not None else None @@ -1719,13 +1752,19 @@ def indexes(self) -> Indexes[pd.Index]: @property def xindexes(self) -> Indexes[Index]: - """Mapping of xarray Index objects used for label based indexing.""" + """Mapping of :py:class:`~xarray.indexes.Index` objects + used for label based indexing. + """ return Indexes(self._indexes, {k: self._variables[k] for k in self._indexes}) @property def coords(self) -> DatasetCoordinates: - """Dictionary of xarray.DataArray objects corresponding to coordinate - variables + """Mapping of :py:class:`~xarray.DataArray` objects corresponding to + coordinate variables. + + See Also + -------- + Coordinates """ return DatasetCoordinates(self) @@ -3103,7 +3142,7 @@ def broadcast_like( ) def _reindex_callback( - self, + self: T_Dataset, aligner: alignment.Aligner, dim_pos_indexers: dict[Hashable, Any], variables: dict[Hashable, Variable], @@ -3111,7 +3150,7 @@ def _reindex_callback( fill_value: Any, exclude_dims: frozenset[Hashable], exclude_vars: frozenset[Hashable], - ) -> Dataset: + ) -> T_Dataset: """Callback called from ``Aligner`` to create a new reindexed Dataset.""" new_variables = variables.copy() @@ -3159,6 +3198,8 @@ def _reindex_callback( new_variables, new_coord_names, indexes=new_indexes ) + reindexed.encoding = self.encoding + return reindexed def reindex_like( diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 53c2b16c05a..7de290f4e14 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1381,19 +1381,22 @@ def create_default_index_implicit( class Indexes(collections.abc.Mapping, Generic[T_PandasOrXarrayIndex]): - """Immutable proxy for Dataset or DataArrary indexes. + """Immutable proxy for Dataset or DataArray indexes. - Keys are coordinate names and values may correspond to either pandas or - xarray indexes. + It is a mapping where keys are coordinate names and values are either pandas + or xarray indexes. - Also provides some utility methods. + It also contains the indexed coordinate variables and provides some utility + methods. """ + _index_type: type[Index] | type[pd.Index] _indexes: dict[Any, T_PandasOrXarrayIndex] _variables: dict[Any, Variable] __slots__ = ( + "_index_type", "_indexes", "_variables", "_dims", @@ -1404,8 +1407,9 @@ class Indexes(collections.abc.Mapping, Generic[T_PandasOrXarrayIndex]): def __init__( self, - indexes: dict[Any, T_PandasOrXarrayIndex], - variables: dict[Any, Variable], + indexes: Mapping[Any, T_PandasOrXarrayIndex] | None = None, + variables: Mapping[Any, Variable] | None = None, + index_type: type[Index] | type[pd.Index] = Index, ): """Constructor not for public consumption. @@ -1414,11 +1418,33 @@ def __init__( indexes : dict Indexes held by this object. variables : dict - Indexed coordinate variables in this object. + Indexed coordinate variables in this object. Entries must + match those of `indexes`. + index_type : type + The type of all indexes, i.e., either :py:class:`xarray.indexes.Index` + or :py:class:`pandas.Index`. """ - self._indexes = indexes - self._variables = variables + if indexes is None: + indexes = {} + if variables is None: + variables = {} + + unmatched_keys = set(indexes) ^ set(variables) + if unmatched_keys: + raise ValueError( + f"unmatched keys found in indexes and variables: {unmatched_keys}" + ) + + if any(not isinstance(idx, index_type) for idx in indexes.values()): + index_type_str = f"{index_type.__module__}.{index_type.__name__}" + raise TypeError( + f"values of indexes must all be instances of {index_type_str}" + ) + + self._index_type = index_type + self._indexes = dict(**indexes) + self._variables = dict(**variables) self._dims: Mapping[Hashable, int] | None = None self.__coord_name_id: dict[Any, int] | None = None @@ -1566,7 +1592,7 @@ def to_pandas_indexes(self) -> Indexes[pd.Index]: elif isinstance(idx, Index): indexes[k] = idx.to_pandas_index() - return Indexes(indexes, self._variables) + return Indexes(indexes, self._variables, index_type=pd.Index) def copy_indexes( self, deep: bool = True, memo: dict[int, Any] | None = None diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 56e51256ba1..24b6ed0ba43 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -11,7 +11,6 @@ from xarray.core.duck_array_ops import lazy_array_equiv from xarray.core.indexes import ( Index, - Indexes, create_default_index_implicit, filter_indexes_from_coords, indexes_equal, @@ -34,7 +33,7 @@ tuple[DimsLike, ArrayLike, Mapping, Mapping], ] XarrayValue = Union[DataArray, Variable, VariableLike] - DatasetLike = Union[Dataset, Mapping[Any, XarrayValue]] + DatasetLike = Union[Dataset, Coordinates, Mapping[Any, XarrayValue]] CoercibleValue = Union[XarrayValue, pd.Series, pd.DataFrame] CoercibleMapping = Union[Dataset, Mapping[Any, CoercibleValue]] @@ -311,17 +310,22 @@ def collect_variables_and_indexes( ) -> dict[Hashable, list[MergeElement]]: """Collect variables and indexes from list of mappings of xarray objects. - Mappings must either be Dataset objects, or have values of one of the - following types: + Mappings can be Dataset or Coordinates objects, in which case both + variables and indexes are extracted from it. + + It can also have values of one of the following types: - an xarray.Variable - a tuple `(dims, data[, attrs[, encoding]])` that can be converted in an xarray.Variable - or an xarray.DataArray If a mapping of indexes is given, those indexes are assigned to all variables - with a matching key/name. + with a matching key/name. For dimension variables with no matching index, a + default (pandas) index is assigned. DataArray indexes that don't match mapping + keys are also extracted. """ + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset @@ -338,8 +342,8 @@ def append_all(variables, indexes): append(name, variable, indexes.get(name)) for mapping in list_of_mappings: - if isinstance(mapping, Dataset): - append_all(mapping.variables, mapping._indexes) + if isinstance(mapping, (Coordinates, Dataset)): + append_all(mapping.variables, mapping.xindexes) continue for name, variable in mapping.items(): @@ -466,12 +470,13 @@ def coerce_pandas_values(objects: Iterable[CoercibleMapping]) -> list[DatasetLik List of Dataset or dictionary objects. Any inputs or values in the inputs that were pandas objects have been converted into native xarray objects. """ + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset out = [] for obj in objects: - if isinstance(obj, Dataset): + if isinstance(obj, (Dataset, Coordinates)): variables: DatasetLike = obj else: variables = {} @@ -556,62 +561,6 @@ def merge_coords( return variables, out_indexes -def merge_data_and_coords( - data_vars: Mapping[Any, Any], - coords: Mapping[Any, Any], - compat: CompatOptions = "broadcast_equals", - join: JoinOptions = "outer", -) -> _MergeResult: - """Used in Dataset.__init__.""" - indexes, coords = _create_indexes_from_coords(coords, data_vars) - objects = [data_vars, coords] - explicit_coords = coords.keys() - return merge_core( - objects, - compat, - join, - explicit_coords=explicit_coords, - indexes=Indexes(indexes, coords), - ) - - -def _create_indexes_from_coords( - coords: Mapping[Any, Any], data_vars: Mapping[Any, Any] | None = None -) -> tuple[dict, dict]: - """Maybe create default indexes from a mapping of coordinates. - - Return those indexes and updated coordinates. - """ - all_variables = dict(coords) - if data_vars is not None: - all_variables.update(data_vars) - - indexes = {} - updated_coords = {} - - # this is needed for backward compatibility: when a pandas multi-index - # is given as data variable, it is promoted as index / level coordinates - # TODO: depreciate this implicit behavior - index_vars = { - k: v - for k, v in all_variables.items() - if k in coords or isinstance(v, pd.MultiIndex) - } - - for name, obj in index_vars.items(): - variable = as_variable(obj, name=name) - - if variable.dims == (name,): - idx, idx_vars = create_default_index_implicit(variable, all_variables) - indexes.update({k: idx for k in idx_vars}) - updated_coords.update(idx_vars) - all_variables.update(idx_vars) - else: - updated_coords[name] = obj - - return indexes, updated_coords - - def assert_valid_explicit_coords( variables: Mapping[Any, Any], dims: Mapping[Any, int], @@ -702,6 +651,7 @@ def merge_core( explicit_coords: Iterable[Hashable] | None = None, indexes: Mapping[Any, Any] | None = None, fill_value: object = dtypes.NA, + skip_align_args: list[int] | None = None, ) -> _MergeResult: """Core logic for merging labeled objects. @@ -727,6 +677,8 @@ def merge_core( may be cast to pandas.Index objects. fill_value : scalar, optional Value to use for newly missing values + skip_align_args : list of int, optional + Optional arguments in `objects` that are not included in alignment. Returns ------- @@ -748,10 +700,20 @@ def merge_core( _assert_compat_valid(compat) + objects = list(objects) + if skip_align_args is None: + skip_align_args = [] + + skip_align_objs = [(pos, objects.pop(pos)) for pos in skip_align_args] + coerced = coerce_pandas_values(objects) aligned = deep_align( coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value ) + + for pos, obj in skip_align_objs: + aligned.insert(pos, obj) + collected = collect_variables_and_indexes(aligned, indexes=indexes) prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat) variables, out_indexes = merge_collected( @@ -1008,18 +970,23 @@ def merge( combine_nested combine_by_coords """ + + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset dict_like_objects = [] for obj in objects: - if not isinstance(obj, (DataArray, Dataset, dict)): + if not isinstance(obj, (DataArray, Dataset, Coordinates, dict)): raise TypeError( "objects must be an iterable containing only " "Dataset(s), DataArray(s), and dictionaries." ) - obj = obj.to_dataset(promote_attrs=True) if isinstance(obj, DataArray) else obj + if isinstance(obj, DataArray): + obj = obj.to_dataset(promote_attrs=True) + elif isinstance(obj, Coordinates): + obj = obj.to_dataset() dict_like_objects.append(obj) merge_result = merge_core( diff --git a/xarray/core/types.py b/xarray/core/types.py index f3342071107..fec257d9310 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -1,12 +1,14 @@ from __future__ import annotations import datetime -from collections.abc import Hashable, Iterable, Sequence +import sys +from collections.abc import Hashable, Iterable, Iterator, Mapping, Sequence from typing import ( TYPE_CHECKING, Any, Callable, Literal, + Protocol, SupportsIndex, TypeVar, Union, @@ -16,16 +18,30 @@ import pandas as pd from packaging.version import Version +try: + if sys.version_info >= (3, 11): + from typing import Self + else: + from typing_extensions import Self +except ImportError: + if TYPE_CHECKING: + raise + else: + Self: Any = None + if TYPE_CHECKING: from numpy._typing import _SupportsDType from numpy.typing import ArrayLike from xarray.backends.common import BackendEntrypoint + from xarray.core.alignment import Aligner from xarray.core.common import AbstractArray, DataWithCoords + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.groupby import DataArrayGroupBy, GroupBy - from xarray.core.indexes import Index + from xarray.core.indexes import Index, Indexes + from xarray.core.utils import Frozen from xarray.core.variable import Variable try: @@ -43,19 +59,6 @@ except ImportError: ZarrArray = np.ndarray - # TODO: Turn on when https://github.com/python/mypy/issues/11871 is fixed. - # Can be uncommented if using pyright though. - # import sys - - # try: - # if sys.version_info >= (3, 11): - # from typing import Self - # else: - # from typing_extensions import Self - # except ImportError: - # Self: Any = None - Self: Any = None - # Anything that can be coerced to a shape tuple _ShapeLike = Union[SupportsIndex, Sequence[SupportsIndex]] _DTypeLikeNested = Any # TODO: wait for support for recursive types @@ -89,14 +92,66 @@ CFTimeDatetime = Any DatetimeLike = Union[pd.Timestamp, datetime.datetime, np.datetime64, CFTimeDatetime] else: - Self: Any = None DTypeLikeSave: Any = None +class Alignable(Protocol): + """Represents any Xarray type that supports alignment. + + It may be ``Dataset``, ``DataArray`` or ``Coordinates``. This protocol class + is needed since those types do not all have a common base class. + + """ + + @property + def dims(self) -> Frozen[Hashable, int] | tuple[Hashable, ...]: + ... + + @property + def sizes(self) -> Frozen[Hashable, int]: + ... + + @property + def xindexes(self) -> Indexes[Index]: + ... + + def _reindex_callback( + self, + aligner: Aligner, + dim_pos_indexers: dict[Hashable, Any], + variables: dict[Hashable, Variable], + indexes: dict[Hashable, Index], + fill_value: Any, + exclude_dims: frozenset[Hashable], + exclude_vars: frozenset[Hashable], + ) -> Self: + ... + + def _overwrite_indexes( + self, + indexes: Mapping[Any, Index], + variables: Mapping[Any, Variable] | None = None, + ) -> Self: + ... + + def __len__(self) -> int: + ... + + def __iter__(self) -> Iterator[Hashable]: + ... + + def copy( + self, + deep: bool = False, + ) -> Self: + ... + + T_Backend = TypeVar("T_Backend", bound="BackendEntrypoint") T_Dataset = TypeVar("T_Dataset", bound="Dataset") T_DataArray = TypeVar("T_DataArray", bound="DataArray") T_Variable = TypeVar("T_Variable", bound="Variable") +T_Coordinates = TypeVar("T_Coordinates", bound="Coordinates") T_Array = TypeVar("T_Array", bound="AbstractArray") T_Index = TypeVar("T_Index", bound="Index") @@ -105,6 +160,7 @@ # Maybe we rename this to T_Data or something less Fortran-y? T_Xarray = TypeVar("T_Xarray", "DataArray", "Dataset") T_DataWithCoords = TypeVar("T_DataWithCoords", bound="DataWithCoords") +T_Alignable = TypeVar("T_Alignable", bound="Alignable") ScalarOrArray = Union["ArrayLike", np.generic, np.ndarray, "DaskArray"] DsCompatible = Union["Dataset", "DataArray", "Variable", "GroupBy", "ScalarOrArray"] diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9d859c0d8a7..9cee574591d 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -3126,10 +3126,6 @@ def _inplace_binary_op(self, other, f): ) -# for backwards compatibility -Coordinate = utils.alias(IndexVariable, "Coordinate") - - def _unified_dims(variables): # validate dimensions all_dims = {} diff --git a/xarray/testing.py b/xarray/testing.py index 47e7dca81ae..6a8bb04f170 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -8,6 +8,7 @@ import pandas as pd from xarray.core import duck_array_ops, formatting, utils +from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.indexes import Index, PandasIndex, PandasMultiIndex, default_indexes @@ -68,9 +69,9 @@ def assert_equal(a, b): Parameters ---------- - a : xarray.Dataset, xarray.DataArray or xarray.Variable + a : xarray.Dataset, xarray.DataArray, xarray.Variable or xarray.Coordinates The first object to compare. - b : xarray.Dataset, xarray.DataArray or xarray.Variable + b : xarray.Dataset, xarray.DataArray, xarray.Variable or xarray.Coordinates The second object to compare. See Also @@ -79,11 +80,15 @@ def assert_equal(a, b): numpy.testing.assert_array_equal """ __tracebackhide__ = True - assert type(a) == type(b) + assert ( + type(a) == type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) + ) if isinstance(a, (Variable, DataArray)): assert a.equals(b), formatting.diff_array_repr(a, b, "equals") elif isinstance(a, Dataset): assert a.equals(b), formatting.diff_dataset_repr(a, b, "equals") + elif isinstance(a, Coordinates): + assert a.equals(b), formatting.diff_coords_repr(a, b, "equals") else: raise TypeError(f"{type(a)} not supported by assertion comparison") @@ -97,9 +102,9 @@ def assert_identical(a, b): Parameters ---------- - a : xarray.Dataset, xarray.DataArray or xarray.Variable + a : xarray.Dataset, xarray.DataArray, xarray.Variable or xarray.Coordinates The first object to compare. - b : xarray.Dataset, xarray.DataArray or xarray.Variable + b : xarray.Dataset, xarray.DataArray, xarray.Variable or xarray.Coordinates The second object to compare. See Also @@ -107,7 +112,9 @@ def assert_identical(a, b): assert_equal, assert_allclose, Dataset.equals, DataArray.equals """ __tracebackhide__ = True - assert type(a) == type(b) + assert ( + type(a) == type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) + ) if isinstance(a, Variable): assert a.identical(b), formatting.diff_array_repr(a, b, "identical") elif isinstance(a, DataArray): @@ -115,6 +122,8 @@ def assert_identical(a, b): assert a.identical(b), formatting.diff_array_repr(a, b, "identical") elif isinstance(a, (Dataset, Variable)): assert a.identical(b), formatting.diff_dataset_repr(a, b, "identical") + elif isinstance(a, Coordinates): + assert a.identical(b), formatting.diff_coords_repr(a, b, "identical") else: raise TypeError(f"{type(a)} not supported by assertion comparison") @@ -400,6 +409,10 @@ def _assert_internal_invariants( _assert_dataset_invariants( xarray_obj, check_default_indexes=check_default_indexes ) + elif isinstance(xarray_obj, Coordinates): + _assert_dataset_invariants( + xarray_obj.to_dataset(), check_default_indexes=check_default_indexes + ) else: raise TypeError( f"{type(xarray_obj)} is not a supported type for xarray invariant checks" diff --git a/xarray/tests/test_coordinates.py b/xarray/tests/test_coordinates.py new file mode 100644 index 00000000000..bf68a5c1838 --- /dev/null +++ b/xarray/tests/test_coordinates.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +import pandas as pd +import pytest + +from xarray.core.alignment import align +from xarray.core.coordinates import Coordinates +from xarray.core.dataarray import DataArray +from xarray.core.dataset import Dataset +from xarray.core.indexes import PandasIndex, PandasMultiIndex +from xarray.tests import assert_identical, source_ndarray + + +class TestCoordinates: + def test_init_noindex(self) -> None: + coords = Coordinates(coords={"foo": ("x", [0, 1, 2])}) + expected = Dataset(coords={"foo": ("x", [0, 1, 2])}) + assert_identical(coords.to_dataset(), expected) + + def test_init_from_coords(self) -> None: + expected = Dataset(coords={"foo": ("x", [0, 1, 2])}) + coords = Coordinates(coords=expected.coords) + assert_identical(coords.to_dataset(), expected) + + # test variables copied + assert coords.variables["foo"] is not expected.variables["foo"] + + # default index + expected = Dataset(coords={"x": ("x", [0, 1, 2])}) + coords = Coordinates(coords=expected.coords, indexes=expected.xindexes) + assert_identical(coords.to_dataset(), expected) + + def test_init_empty(self) -> None: + coords = Coordinates() + assert len(coords) == 0 + + def test_init_index_error(self) -> None: + idx = PandasIndex([1, 2, 3], "x") + with pytest.raises(ValueError, match="no coordinate variables found"): + Coordinates(indexes={"x": idx}) + + with pytest.raises(TypeError, match=".* is not an `xarray.indexes.Index`"): + Coordinates(coords={"x": ("x", [1, 2, 3])}, indexes={"x": "not_an_xarray_index"}) # type: ignore + + def test_init_dim_sizes_conflict(self) -> None: + with pytest.raises(ValueError): + Coordinates(coords={"foo": ("x", [1, 2]), "bar": ("x", [1, 2, 3, 4])}) + + def test_from_pandas_multiindex(self) -> None: + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) + coords = Coordinates.from_pandas_multiindex(midx, "x") + + assert isinstance(coords.xindexes["x"], PandasMultiIndex) + assert coords.xindexes["x"].index.equals(midx) + assert coords.xindexes["x"].dim == "x" + + expected = PandasMultiIndex(midx, "x").create_variables() + assert list(coords.variables) == list(expected) + for name in ("x", "one", "two"): + assert_identical(expected[name], coords.variables[name]) + + def test_dims(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + assert coords.dims == {"x": 3} + + def test_sizes(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + assert coords.sizes == {"x": 3} + + def test_dtypes(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + assert coords.dtypes == {"x": int} + + def test_getitem(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + assert_identical( + coords["x"], + DataArray([0, 1, 2], coords={"x": [0, 1, 2]}, name="x"), + ) + + def test_delitem(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + del coords["x"] + assert "x" not in coords + + def test_update(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + + coords.update({"y": ("y", [4, 5, 6])}) + assert "y" in coords + assert "y" in coords.xindexes + expected = DataArray([4, 5, 6], coords={"y": [4, 5, 6]}, name="y") + assert_identical(coords["y"], expected) + + def test_equals(self): + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + + assert coords.equals(coords) + assert not coords.equals("no_a_coords") + + def test_identical(self): + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + + assert coords.identical(coords) + assert not coords.identical("no_a_coords") + + def test_copy(self) -> None: + no_index_coords = Coordinates({"foo": ("x", [1, 2, 3])}) + copied = no_index_coords.copy() + assert_identical(no_index_coords, copied) + v0 = no_index_coords.variables["foo"] + v1 = copied.variables["foo"] + assert v0 is not v1 + assert source_ndarray(v0.data) is source_ndarray(v1.data) + + deep_copied = no_index_coords.copy(deep=True) + assert_identical(no_index_coords.to_dataset(), deep_copied.to_dataset()) + v0 = no_index_coords.variables["foo"] + v1 = deep_copied.variables["foo"] + assert v0 is not v1 + assert source_ndarray(v0.data) is not source_ndarray(v1.data) + + def test_align(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + + left = coords + + # test Coordinates._reindex_callback + right = coords.to_dataset().isel(x=[0, 1]).coords + left2, right2 = align(left, right, join="inner") + assert_identical(left2, right2) + + # test Coordinates._overwrite_indexes + right.update({"x": ("x", [4, 5, 6])}) + left2, right2 = align(left, right, join="override") + assert_identical(left2, left) + assert_identical(left2, right2) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index cee5afa56a4..ec2965d9d0f 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -27,6 +27,7 @@ from xarray.convert import from_cdms2 from xarray.core import dtypes from xarray.core.common import full_like +from xarray.core.coordinates import Coordinates from xarray.core.indexes import Index, PandasIndex, filter_indexes_from_coords from xarray.core.types import QueryEngineOptions, QueryParserOptions from xarray.core.utils import is_scalar @@ -486,6 +487,32 @@ def test_constructor_dask_coords(self) -> None: expected = DataArray(data, coords={"x": ecoord, "y": ecoord}, dims=["x", "y"]) assert_equal(actual, expected) + def test_constructor_no_default_index(self) -> None: + # explicitly passing a Coordinates object skips the creation of default index + da = DataArray(range(3), coords=Coordinates({"x": ("x", [1, 2, 3])})) + assert "x" in da.coords + assert "x" not in da.xindexes + + def test_constructor_multiindex(self) -> None: + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) + coords = Coordinates.from_pandas_multiindex(midx, "x") + + da = DataArray(range(4), coords=coords, dims="x") + assert_identical(da.coords, coords) + + def test_constructor_custom_index(self) -> None: + class CustomIndex(Index): + ... + + coords = Coordinates( + coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} + ) + da = DataArray(range(3), coords=coords) + assert isinstance(da.xindexes["x"], CustomIndex) + + # test coordinate variables copied + assert da.coords["x"] is not coords.variables["x"] + def test_equals_and_identical(self) -> None: orig = DataArray(np.arange(5.0), {"a": 42}, dims="x") @@ -1546,6 +1573,24 @@ def test_assign_coords_existing_multiindex(self) -> None: with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): data.assign_coords(x=range(4)) + def test_assign_coords_custom_index(self) -> None: + class CustomIndex(Index): + pass + + coords = Coordinates( + coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} + ) + da = xr.DataArray([0, 1, 2], dims="x") + actual = da.assign_coords(coords) + assert isinstance(actual.xindexes["x"], CustomIndex) + + def test_assign_coords_no_default_index(self) -> None: + coords = Coordinates({"y": ("y", [1, 2, 3])}) + da = DataArray([1, 2, 3], dims="y") + actual = da.assign_coords(coords) + assert_identical(actual.coords, coords, check_default_indexes=False) + assert "y" not in actual.xindexes + def test_coords_alignment(self) -> None: lhs = DataArray([1, 2, 3], [("x", [0, 1, 2])]) rhs = DataArray([2, 3, 4], [("x", [1, 2, 3])]) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index f7f91d0e134..5304c54971a 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -31,7 +31,7 @@ from xarray.coding.cftimeindex import CFTimeIndex from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like -from xarray.core.coordinates import DatasetCoordinates +from xarray.core.coordinates import Coordinates, DatasetCoordinates from xarray.core.indexes import Index, PandasIndex from xarray.core.pycompat import array_type, integer_types from xarray.core.utils import is_scalar @@ -634,6 +634,37 @@ def test_constructor_with_coords(self) -> None: Dataset({}, {"x": mindex, "y": mindex}) Dataset({}, {"x": mindex, "level_1": range(4)}) + def test_constructor_no_default_index(self) -> None: + # explicitly passing a Coordinates object skips the creation of default index + ds = Dataset(coords=Coordinates({"x": ("x", [1, 2, 3])})) + assert "x" in ds + assert "x" not in ds.xindexes + + def test_constructor_multiindex(self) -> None: + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) + coords = Coordinates.from_pandas_multiindex(midx, "x") + + ds = Dataset(coords=coords) + assert_identical(ds, coords.to_dataset()) + + with pytest.warns( + FutureWarning, match=".*`pandas.MultiIndex` via data variable.*" + ): + Dataset(data_vars={"x": midx}) + + def test_constructor_custom_index(self) -> None: + class CustomIndex(Index): + ... + + coords = Coordinates( + coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} + ) + ds = Dataset(coords=coords) + assert isinstance(ds.xindexes["x"], CustomIndex) + + # test coordinate variables copied + assert ds.variables["x"] is not coords.variables["x"] + def test_properties(self) -> None: ds = create_test_data() @@ -4255,6 +4286,25 @@ class CustomIndex(PandasIndex): actual = ds.assign_coords(y=[4, 5, 6]) assert isinstance(actual.xindexes["x"], CustomIndex) + def test_assign_coords_custom_index(self) -> None: + class CustomIndex(Index): + pass + + coords = Coordinates( + coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} + ) + ds = Dataset() + actual = ds.assign_coords(coords) + assert isinstance(actual.xindexes["x"], CustomIndex) + + def test_assign_coords_no_default_index(self) -> None: + coords = Coordinates({"y": ("y", [1, 2, 3])}) + ds = Dataset() + actual = ds.assign_coords(coords) + expected = coords.to_dataset() + assert_identical(expected, actual, check_default_indexes=False) + assert "y" not in actual.xindexes + def test_merge_multiindex_level(self) -> None: data = create_test_multiindex() @@ -6201,6 +6251,13 @@ def test_ipython_key_completion(self) -> None: ds["var3"].coords[item] # should not raise assert sorted(actual) == sorted(expected) + coords = Coordinates(ds.coords) + actual = coords._ipython_key_completions_() + expected = ["time", "dim2", "dim3", "numbers"] + for item in actual: + coords[item] # should not raise + assert sorted(actual) == sorted(expected) + # data_vars actual = ds.data_vars._ipython_key_completions_() expected = ["var1", "var2", "var3", "dim1"] diff --git a/xarray/tests/test_indexes.py b/xarray/tests/test_indexes.py index 27b5cf2119c..ebe9f3fb932 100644 --- a/xarray/tests/test_indexes.py +++ b/xarray/tests/test_indexes.py @@ -582,7 +582,12 @@ def indexes( _, variables = indexes_and_vars - return Indexes(indexes, variables) + if isinstance(x_idx, Index): + index_type = Index + else: + index_type = pd.Index + + return Indexes(indexes, variables, index_type=index_type) def test_interface(self, unique_indexes, indexes) -> None: x_idx = unique_indexes[0] diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index 8957f9c829a..63449708a79 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -235,6 +235,13 @@ def test_merge_dicts_dims(self): expected = xr.Dataset({"x": [12], "y": ("x", [13])}) assert_identical(actual, expected) + def test_merge_coordinates(self): + coords1 = xr.Coordinates({"x": ("x", [0, 1, 2])}) + coords2 = xr.Coordinates({"y": ("y", [3, 4, 5])}) + expected = xr.Dataset(coords={"x": [0, 1, 2], "y": [3, 4, 5]}) + actual = xr.merge([coords1, coords2]) + assert_identical(actual, expected) + def test_merge_error(self): ds = xr.Dataset({"x": 0}) with pytest.raises(xr.MergeError): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 9b70dcb5464..5cf3d7eaad7 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -12,7 +12,7 @@ import pytz from packaging.version import Version -from xarray import Coordinate, DataArray, Dataset, IndexVariable, Variable, set_options +from xarray import DataArray, Dataset, IndexVariable, Variable, set_options from xarray.core import dtypes, duck_array_ops, indexing from xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( @@ -2447,11 +2447,6 @@ def test_concat_str_dtype(self, dtype): assert actual.identical(expected) assert np.issubdtype(actual.dtype, dtype) - def test_coordinate_alias(self): - with pytest.warns(Warning, match="deprecated"): - x = Coordinate("x", [1, 2, 3]) - assert isinstance(x, IndexVariable) - def test_datetime64(self): # GH:1932 Make sure indexing keeps precision t = np.array([1518418799999986560, 1518418799999996560], dtype="datetime64[ns]")