Pass indexes directly to the DataArray and Dataset constructors #7214

Closed · wants to merge 14 commits
133 changes: 95 additions & 38 deletions xarray/core/dataarray.py
@@ -108,6 +108,32 @@
T_XarrayOther = TypeVar("T_XarrayOther", bound=Union["DataArray", Dataset])


def _check_coords_dims(shape, coords, dims):
sizes = dict(zip(dims, shape))
for k, v in coords.items():
if any(d not in dims for d in v.dims):
raise ValueError(
f"coordinate {k} has dimensions {v.dims}, but these "
"are not a subset of the DataArray "
f"dimensions {dims}"
)

for d, s in zip(v.dims, v.shape):
if s != sizes[d]:
raise ValueError(
f"conflicting sizes for dimension {d!r}: "
f"length {sizes[d]} on the data but length {s} on "
f"coordinate {k!r}"
)

if k in sizes and v.shape != (sizes[k],):
raise ValueError(
f"coordinate {k!r} is a DataArray dimension, but "
f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} "
"matching the dimension size"
)
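
A minimal sketch of how the checks in this helper surface through the public constructor (current xarray API; the message text comes from the validation above):

```python
import numpy as np
import xarray as xr

# "x" has length 3 on the data but length 2 on the coordinate, so the
# constructor raises the "conflicting sizes for dimension 'x'" ValueError
# produced by the validation above.
try:
    xr.DataArray(np.zeros(3), coords={"x": [1, 2]}, dims="x")
except ValueError as err:
    print(err)
```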


def _infer_coords_and_dims(
shape, coords, dims
) -> tuple[dict[Hashable, Variable], tuple[Hashable, ...]]:
@@ -159,29 +185,7 @@ def _infer_coords_and_dims
var.dims = (dim,)
new_coords[dim] = var.to_index_variable()

sizes = dict(zip(dims, shape))
for k, v in new_coords.items():
if any(d not in dims for d in v.dims):
raise ValueError(
f"coordinate {k} has dimensions {v.dims}, but these "
"are not a subset of the DataArray "
f"dimensions {dims}"
)

for d, s in zip(v.dims, v.shape):
if s != sizes[d]:
raise ValueError(
f"conflicting sizes for dimension {d!r}: "
f"length {sizes[d]} on the data but length {s} on "
f"coordinate {k!r}"
)

if k in sizes and v.shape != (sizes[k],):
raise ValueError(
f"coordinate {k!r} is a DataArray dimension, but "
f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} "
"matching the dimension size"
)
_check_coords_dims(shape, new_coords, dims)

return new_coords, dims

@@ -301,6 +305,11 @@ class DataArray(
attrs : dict_like or None, optional
Attributes to assign to the new instance. By default, an empty
attribute dictionary is initialized.
indexes : :py:class:`~xarray.Indexes` or dict-like, optional
A collection of :py:class:`~xarray.indexes.Index` objects and
their coordinate variables. If an empty collection is given, the
creation of default (pandas) indexes for dimension coordinates is
skipped.

Examples
--------
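
For illustration, a sketch of how the `indexes` argument described above is intended to behave on this branch (proposed API only; not part of any released xarray version):

```python
import numpy as np
import xarray as xr

# Default behavior: a pandas index is created for dimension coordinate "x".
da = xr.DataArray(np.arange(4), coords={"x": [10, 20, 30, 40]}, dims="x")
assert list(da.xindexes) == ["x"]

# Proposed: an empty `indexes` collection skips default index creation,
# leaving "x" as a plain, un-indexed coordinate.
da_no_index = xr.DataArray(
    np.arange(4), coords={"x": [10, 20, 30, 40]}, dims="x", indexes={}
)
assert list(da_no_index.xindexes) == []
```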
@@ -389,21 +398,18 @@ def __init__(
dims: Hashable | Sequence[Hashable] | None = None,
name: Hashable | None = None,
attrs: Mapping | None = None,
indexes: Mapping[Any, Index] | None = None,
# internal parameters
indexes: dict[Hashable, Index] | None = None,
fastpath: bool = False,
) -> None:
if fastpath:
variable = data
assert dims is None
assert attrs is None
assert indexes is not None
assert isinstance(indexes, dict)
da_indexes = indexes
da_coords = coords
else:
# TODO: (benbovy - explicit indexes) remove
# once it becomes part of the public interface
if indexes is not None:
raise ValueError("Providing explicit indexes is not supported yet")

# try to fill in arguments from data if they weren't supplied
if coords is None:

@@ -423,21 +429,50 @@
if attrs is None and not isinstance(data, PANDAS_TYPES):
attrs = getattr(data, "attrs", None)

if indexes is None:
create_default_indexes = True
indexes = Indexes()
elif len(indexes) == 0:
create_default_indexes = False
indexes = Indexes()
else:
create_default_indexes = True
if not isinstance(indexes, Indexes):
raise TypeError(
"non-empty indexes must be an instance of `Indexes`"
)
elif indexes._index_type != Index:
raise TypeError("indexes must only contain Xarray `Index` objects")

data = _check_data_shape(data, coords, dims)
data = as_compatible_data(data)
coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
da_coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
variable = Variable(dims, data, attrs, fastpath=True)
indexes, coords = _create_indexes_from_coords(coords)

if create_default_indexes:
da_indexes, da_coords = _create_indexes_from_coords(da_coords)
else:
da_indexes = {}

both_indexes_and_coords = set(indexes) & set(da_coords)
if both_indexes_and_coords:
raise ValueError(
f"{both_indexes_and_coords} are found in both indexes and coords"
)

_check_coords_dims(data.shape, indexes.variables, dims)

da_coords.update(
{k: v.copy(deep=False) for k, v in indexes.variables.items()}
)
da_indexes.update(indexes)

# These fully describe a DataArray
self._variable = variable
assert isinstance(coords, dict)
self._coords = coords
assert isinstance(da_coords, dict)
self._coords = da_coords
self._name = name

# TODO(shoyer): document this argument, once it becomes part of the
# public interface.
self._indexes = indexes
self._indexes = da_indexes # type: ignore[assignment]

self._close = None

@@ -3667,6 +3702,28 @@ def reduce(
var = self.variable.reduce(func, dim, axis, keep_attrs, keepdims, **kwargs)
return self._replace_maybe_drop_dims(var)

def assign_indexes(self, indexes: Indexes[Index]):
"""Assign new indexes to this dataarray.

Returns a new dataarray with all the original data in addition to the new
indexes (and their corresponding coordinates).

Parameters
----------
indexes : :py:class:`~xarray.Indexes`
A collection of :py:class:`~xarray.indexes.Index` objects
to assign (including their coordinate variables).

Returns
-------
assigned : DataArray
A new dataarray with the new indexes and coordinates in addition to
the existing data.
"""
# TODO: check that indexes.dims is a subset of self.dims
ds = self._to_temp_dataset().assign_indexes(indexes)
return self._from_temp_dataset(ds)
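
A usage sketch for the method above, assuming this branch; `xindexes` (existing API) supplies the `Indexes` collection to copy from:

```python
import numpy as np
import xarray as xr

# An indexed array and an un-indexed one (using the proposed indexes={}).
da = xr.DataArray(np.arange(4), coords={"x": [10, 20, 30, 40]}, dims="x")
da_bare = xr.DataArray(np.arange(4), dims="x", indexes={})

# Copy the "x" index and its coordinate variable onto the bare array.
da_indexed = da_bare.assign_indexes(da.xindexes)
assert list(da_indexed.xindexes) == ["x"]
```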

def to_pandas(self) -> DataArray | pd.Series | pd.DataFrame:
"""Convert this array into a pandas object with the same shape.

75 changes: 68 additions & 7 deletions xarray/core/dataset.py
@@ -452,8 +452,10 @@ class Dataset(
Dataset implements the mapping interface with keys given by variable
names and values given by DataArray objects for each variable name.

One dimensional variables with name equal to their dimension are
index coordinates used for label based indexing.
By default, pandas indexes are created for one-dimensional variables
with a name equal to their dimension, so those variables can be used as
coordinates for label-based indexing. Xarray-compatible indexes may also
be provided via the `indexes` argument.

To load data from a file or file-like object, use the `open_dataset`
function.
@@ -504,6 +506,11 @@ class Dataset(

attrs : dict-like, optional
Global attributes to save on this dataset.
indexes : :py:class:`~xarray.Indexes` or dict-like, optional
A collection of :py:class:`~xarray.indexes.Index` objects and
their coordinate variables. If an empty collection is given, the
creation of default (pandas) indexes for dimension coordinates is
skipped.

Examples
--------
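
A sketch mirroring the DataArray example above: skipping default index creation on a Dataset via the proposed argument (branch-only behavior):

```python
import xarray as xr

ds = xr.Dataset(
    {"temperature": ("x", [10.0, 11.0, 12.0])},
    coords={"x": [1, 2, 3]},
    indexes={},  # proposed: no default pandas index is built for "x"
)
assert list(ds.xindexes) == []
```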
@@ -563,6 +570,7 @@ class Dataset(
precipitation float64 8.326
Attributes:
description: Weather related data.

"""

_attrs: dict[Hashable, Any] | None
@@ -593,14 +601,26 @@ def __init__(
data_vars: Mapping[Any, Any] | None = None,
coords: Mapping[Any, Any] | None = None,
attrs: Mapping[Any, Any] | None = None,
indexes: Mapping[Any, Index] | None = None,
) -> None:
# TODO(shoyer): expose indexes as a public argument in __init__

if data_vars is None:
data_vars = {}
if coords is None:
coords = {}

if indexes is None:
create_default_indexes = True
indexes = Indexes()
elif len(indexes) == 0:
create_default_indexes = False
indexes = Indexes()
else:
create_default_indexes = True
if not isinstance(indexes, Indexes):
raise TypeError("non-empty indexes must be an instance of `Indexes`")
elif indexes._index_type != Index:
raise TypeError("indexes must only contain Xarray `Index` objects")
Comment on lines +614 to +622

Member: I don't like the special case for size-0 indexes. These sorts of special cases that violate type stability can lead to very hard-to-debug bugs.

Instead, how about never creating default pandas indexes if indexes is provided? Maybe there could also be a special method (e.g., assign_default_indexes()) for manually adding in pandas indexes later.

Member (Author): Yes, I like that!

I also like assign_default_indexes(), or maybe assign_indexes(indexes=None) if we don't want to add another method.

Member (Author): One concern I have is when a user wants to explicitly provide a pandas multi-index or a custom single-coordinate index for a dimension and still wants the default pandas indexes for the other dimension coordinates.

In this case, calling assign_default_indexes() or assign_indexes(indexes=None) later will likely overwrite the explicitly provided indexes?

After removing the pandas multi-index dimension coordinate this won't be an issue anymore, but the issue remains for any custom 1-d dimension coordinate index.

Should we add an overwrite=True keyword argument to assign_(default_)indexes?

Member: assign_default_indexes() should probably include an optional list of dimension names for which to assign default indexes.

By default, we might make it only add indexes for dimensions that don't already have an index.
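
For concreteness, a purely hypothetical sketch of the assign_default_indexes() method floated in this thread; nothing in the PR implements it, and the name, parameters, and semantics below are only what the discussion suggests:

```python
from collections.abc import Hashable, Iterable

def assign_default_indexes(
    self,
    dims: Iterable[Hashable] | None = None,
    overwrite: bool = False,
):
    """Hypothetical: create default (pandas) indexes for the given
    dimension coordinates, or for all of them if ``dims`` is None.
    Dimensions that already have an index are skipped unless
    ``overwrite=True``."""
    ...
```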


both_data_and_coords = set(data_vars) & set(coords)
if both_data_and_coords:
raise ValueError(
@@ -610,17 +630,34 @@
if isinstance(coords, Dataset):
coords = coords.variables

variables, coord_names, dims, indexes, _ = merge_data_and_coords(
data_vars, coords, compat="broadcast_equals"
variables, coord_names, dims, ds_indexes, _ = merge_data_and_coords(
data_vars,
coords,
compat="broadcast_equals",
create_default_indexes=create_default_indexes,
)

both_indexes_and_coords = set(indexes) & coord_names
if both_indexes_and_coords:
raise ValueError(
f"{both_indexes_and_coords} are found in both indexes and coords"
)
Comment on lines +640 to +644

Member: This check surprises me. As a user, I would expect that I can write something like xarray.Dataset(data_vars=ds.data_vars, coords=ds.coords, attrs=ds.attrs, indexes=ds.xindexes) to make a new object.

Member (Author) @benbovy, Oct 27, 2022: Good point. Would it be reasonable for the coordinates provided via indexes=... (if we use it) to override the coordinates provided via coords=...?

Member: > Would it be reasonable for the coordinates provided via indexes=... to override the coordinates provided via coords=...?

Can you give an example of how this would happen? Do indexes already provide enough information to create a coordinate (this would slightly surprise me)?

I think an error in these cases, or creating a Dataset with inconsistent coordinates/indexes (if it is hard to check), would be fine. I would not silently override coordinates.
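
Concretely, with the check as written above (`set(indexes) & coord_names`), the round trip the first comment describes would indeed raise, since every indexed coordinate appears in both `ds.coords` and `ds.xindexes`:

```python
# Raises ValueError on this branch: e.g. "x" is found in both
# indexes and coords.
ds2 = xr.Dataset(
    data_vars=ds.data_vars,
    coords=ds.coords,
    attrs=ds.attrs,
    indexes=ds.xindexes,
)
```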


variables.update({k: v.copy(deep=False) for k, v in indexes.variables.items()})
coord_names.update(indexes.variables)
ds_indexes.update(indexes)

# re-calculate dimensions if indexes are given explicitly
if indexes:
dims = calculate_dimensions(variables)

self._attrs = dict(attrs) if attrs is not None else None
self._close = None
self._encoding = None
self._variables = variables
self._coord_names = coord_names
self._dims = dims
self._indexes = indexes
self._indexes = ds_indexes

@classmethod
def load_store(cls: type[T_Dataset], store, decoder=None) -> T_Dataset:
@@ -6080,6 +6117,30 @@ def assign(
data.update(results)
return data

def assign_indexes(self, indexes: Indexes[Index]):
"""Assign new indexes to this dataset.

Returns a new dataset with all the original data in addition to the new
indexes (and their corresponding coordinates).

Parameters
----------
indexes : :py:class:`~xarray.Indexes`
A collection of :py:class:`~xarray.indexes.Index` objects
to assign (including their coordinate variables).

Returns
-------
assigned : Dataset
A new dataset with the new indexes and coordinates in addition to
the existing data.
"""
ds_indexes = Dataset(indexes=indexes)
dropped = self.drop_vars(indexes, errors="ignore")
return dropped.merge(
ds_indexes, compat="minimal", join="override", combine_attrs="no_conflicts"
)
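
A usage sketch, assuming this branch; `other` is a hypothetical dataset with a matching "x" dimension that already carries an "x" index:

```python
import xarray as xr

ds_bare = xr.Dataset({"v": ("x", [1.0, 2.0, 3.0])}, indexes={})

# Any existing variables with the same names as the index coordinates are
# dropped first, then the merge with join="override" brings in the new
# indexes and their coordinates.
ds_indexed = ds_bare.assign_indexes(other.xindexes)
```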

def to_array(
self, dim: Hashable = "variable", name: Hashable | None = None
) -> DataArray: