Pass indexes directly to the DataArray and Dataset constructors #7214

Closed · wants to merge 14 commits
133 changes: 95 additions & 38 deletions xarray/core/dataarray.py
@@ -108,6 +108,32 @@
T_XarrayOther = TypeVar("T_XarrayOther", bound=Union["DataArray", Dataset])


def _check_coords_dims(shape, coords, dims):
sizes = dict(zip(dims, shape))
for k, v in coords.items():
if any(d not in dims for d in v.dims):
raise ValueError(
f"coordinate {k} has dimensions {v.dims}, but these "
"are not a subset of the DataArray "
f"dimensions {dims}"
)

for d, s in zip(v.dims, v.shape):
if s != sizes[d]:
raise ValueError(
f"conflicting sizes for dimension {d!r}: "
f"length {sizes[d]} on the data but length {s} on "
f"coordinate {k!r}"
)

if k in sizes and v.shape != (sizes[k],):
raise ValueError(
f"coordinate {k!r} is a DataArray dimension, but "
f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} "
"matching the dimension size"
)
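
A minimal sketch of how the checks in this helper surface through the public constructor (current xarray API; the message text comes from the validation above):

```python
import numpy as np
import xarray as xr

# "x" has length 3 on the data but length 2 on the coordinate, so the
# constructor raises the "conflicting sizes for dimension 'x'" ValueError
# produced by the validation above.
try:
    xr.DataArray(np.zeros(3), coords={"x": [1, 2]}, dims="x")
except ValueError as err:
    print(err)
```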


def _infer_coords_and_dims(
shape, coords, dims
) -> tuple[dict[Hashable, Variable], tuple[Hashable, ...]]:
@@ -159,29 +185,7 @@ def _infer_coords_and_dims
var.dims = (dim,)
new_coords[dim] = var.to_index_variable()

sizes = dict(zip(dims, shape))
for k, v in new_coords.items():
if any(d not in dims for d in v.dims):
raise ValueError(
f"coordinate {k} has dimensions {v.dims}, but these "
"are not a subset of the DataArray "
f"dimensions {dims}"
)

for d, s in zip(v.dims, v.shape):
if s != sizes[d]:
raise ValueError(
f"conflicting sizes for dimension {d!r}: "
f"length {sizes[d]} on the data but length {s} on "
f"coordinate {k!r}"
)

if k in sizes and v.shape != (sizes[k],):
raise ValueError(
f"coordinate {k!r} is a DataArray dimension, but "
f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} "
"matching the dimension size"
)
_check_coords_dims(shape, new_coords, dims)

return new_coords, dims

@@ -301,6 +305,11 @@ class DataArray(
attrs : dict_like or None, optional
Attributes to assign to the new instance. By default, an empty
attribute dictionary is initialized.
indexes : :py:class:`~xarray.Indexes` or dict-like, optional
A collection of :py:class:`~xarray.indexes.Index` objects and
their coordinate variables. If an empty collection is given, the
creation of default (pandas) indexes for dimension coordinates is
skipped.

Examples
--------
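
For illustration, a sketch of how the `indexes` argument described above is intended to behave on this branch (proposed API only; not part of any released xarray version):

```python
import numpy as np
import xarray as xr

# Default behavior: a pandas index is created for dimension coordinate "x".
da = xr.DataArray(np.arange(4), coords={"x": [10, 20, 30, 40]}, dims="x")
assert list(da.xindexes) == ["x"]

# Proposed: an empty `indexes` collection skips default index creation,
# leaving "x" as a plain, un-indexed coordinate.
da_no_index = xr.DataArray(
    np.arange(4), coords={"x": [10, 20, 30, 40]}, dims="x", indexes={}
)
assert list(da_no_index.xindexes) == []
```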
@@ -389,21 +398,18 @@ def __init__(
dims: Hashable | Sequence[Hashable] | None = None,
name: Hashable | None = None,
attrs: Mapping | None = None,
indexes: Mapping[Any, Index] | None = None,
# internal parameters
indexes: dict[Hashable, Index] | None = None,
fastpath: bool = False,
) -> None:
if fastpath:
variable = data
assert dims is None
assert attrs is None
assert indexes is not None
assert isinstance(indexes, dict)
da_indexes = indexes
da_coords = coords
else:
# TODO: (benbovy - explicit indexes) remove
# once it becomes part of the public interface
if indexes is not None:
raise ValueError("Providing explicit indexes is not supported yet")

# try to fill in arguments from data if they weren't supplied
if coords is None:

@@ -423,21 +429,50 @@
if attrs is None and not isinstance(data, PANDAS_TYPES):
attrs = getattr(data, "attrs", None)

if indexes is None:
create_default_indexes = True
indexes = Indexes()
elif len(indexes) == 0:
create_default_indexes = False
indexes = Indexes()
else:
create_default_indexes = True
if not isinstance(indexes, Indexes):
raise TypeError(
"non-empty indexes must be an instance of `Indexes`"
)
elif indexes._index_type != Index:
raise TypeError("indexes must only contain Xarray `Index` objects")

data = _check_data_shape(data, coords, dims)
data = as_compatible_data(data)
coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
da_coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
variable = Variable(dims, data, attrs, fastpath=True)
indexes, coords = _create_indexes_from_coords(coords)

if create_default_indexes:
da_indexes, da_coords = _create_indexes_from_coords(da_coords)
else:
da_indexes = {}

both_indexes_and_coords = set(indexes) & set(da_coords)
if both_indexes_and_coords:
raise ValueError(
f"{both_indexes_and_coords} are found in both indexes and coords"
)

_check_coords_dims(data.shape, indexes.variables, dims)

da_coords.update(
{k: v.copy(deep=False) for k, v in indexes.variables.items()}
)
da_indexes.update(indexes)

# These fully describe a DataArray
self._variable = variable
assert isinstance(coords, dict)
self._coords = coords
assert isinstance(da_coords, dict)
self._coords = da_coords
self._name = name

# TODO(shoyer): document this argument, once it becomes part of the
# public interface.
self._indexes = indexes
self._indexes = da_indexes # type: ignore[assignment]

self._close = None

@@ -3667,6 +3702,28 @@ def reduce(
var = self.variable.reduce(func, dim, axis, keep_attrs, keepdims, **kwargs)
return self._replace_maybe_drop_dims(var)

def assign_indexes(self, indexes: Indexes[Index]):
"""Assign new indexes to this dataarray.

Returns a new dataarray with all the original data in addition to the new
indexes (and their corresponding coordinates).

Parameters
----------
indexes : :py:class:`~xarray.Indexes`
A collection of :py:class:`~xarray.indexes.Index` objects
to assign (including their coordinate variables).

Returns
-------
assigned : DataArray
A new dataarray with the new indexes and coordinates in addition to
the existing data.
"""
# TODO: check that indexes.dims is a subset of self.dims
ds = self._to_temp_dataset().assign_indexes(indexes)
return self._from_temp_dataset(ds)
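
A usage sketch for the method above, assuming this branch; `xindexes` (existing API) supplies the `Indexes` collection to copy from:

```python
import numpy as np
import xarray as xr

# An indexed array and an un-indexed one (using the proposed indexes={}).
da = xr.DataArray(np.arange(4), coords={"x": [10, 20, 30, 40]}, dims="x")
da_bare = xr.DataArray(np.arange(4), dims="x", indexes={})

# Copy the "x" index and its coordinate variable onto the bare array.
da_indexed = da_bare.assign_indexes(da.xindexes)
assert list(da_indexed.xindexes) == ["x"]
```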

def to_pandas(self) -> DataArray | pd.Series | pd.DataFrame:
"""Convert this array into a pandas object with the same shape.

75 changes: 68 additions & 7 deletions xarray/core/dataset.py
@@ -452,8 +452,10 @@ class Dataset(
Dataset implements the mapping interface with keys given by variable
names and values given by DataArray objects for each variable name.

One dimensional variables with name equal to their dimension are
index coordinates used for label based indexing.
By default, pandas indexes are created for one-dimensional variables
with a name equal to their dimension, so those variables can be used as
coordinates for label-based indexing. Xarray-compatible indexes may also
be provided via the `indexes` argument.

To load data from a file or file-like object, use the `open_dataset`
function.
@@ -504,6 +506,11 @@ class Dataset(

attrs : dict-like, optional
Global attributes to save on this dataset.
indexes : :py:class:`~xarray.Indexes` or dict-like, optional
A collection of :py:class:`~xarray.indexes.Index` objects and
their coordinate variables. If an empty collection is given, the
creation of default (pandas) indexes for dimension coordinates is
skipped.

Examples
--------
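
A sketch mirroring the DataArray example above: skipping default index creation on a Dataset via the proposed argument (branch-only behavior):

```python
import xarray as xr

ds = xr.Dataset(
    {"temperature": ("x", [10.0, 11.0, 12.0])},
    coords={"x": [1, 2, 3]},
    indexes={},  # proposed: no default pandas index is built for "x"
)
assert list(ds.xindexes) == []
```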
@@ -563,6 +570,7 @@ class Dataset(
precipitation float64 8.326
Attributes:
description: Weather related data.

"""

_attrs: dict[Hashable, Any] | None
@@ -593,14 +601,26 @@ def __init__(
data_vars: Mapping[Any, Any] | None = None,
coords: Mapping[Any, Any] | None = None,
attrs: Mapping[Any, Any] | None = None,
indexes: Mapping[Any, Index] | None = None,
) -> None:
# TODO(shoyer): expose indexes as a public argument in __init__

if data_vars is None:
data_vars = {}
if coords is None:
coords = {}

if indexes is None:
create_default_indexes = True
indexes = Indexes()
elif len(indexes) == 0:
create_default_indexes = False
indexes = Indexes()
else:
create_default_indexes = True
if not isinstance(indexes, Indexes):
raise TypeError("non-empty indexes must be an instance of `Indexes`")
elif indexes._index_type != Index:
raise TypeError("indexes must only contain Xarray `Index` objects")
Comment on lines +614 to +622

Member: I don't like the special case for size-0 indexes. These sorts of special cases that violate type stability can lead to very hard-to-debug bugs.

Instead, how about never creating default pandas indexes if indexes is provided? Maybe there could also be a special method (e.g., assign_default_indexes()) for manually adding in pandas indexes later.

Member (Author): Yes, I like that!

I also like assign_default_indexes(), or maybe assign_indexes(indexes=None) if we don't want to add another method.

Member (Author): One concern I have is when a user wants to explicitly provide a pandas multi-index or a custom single-coordinate index for a dimension and still wants the default pandas indexes for the other dimension coordinates.

In this case, calling assign_default_indexes() or assign_indexes(indexes=None) later will likely overwrite the explicitly provided indexes?

After removing the pandas multi-index dimension coordinate this won't be an issue anymore, but the issue remains for any custom 1-d dimension coordinate index.

Should we add an overwrite=True keyword argument to assign_(default_)indexes?

Member: assign_default_indexes() should probably include an optional list of dimension names for which to assign default indexes.

By default, we might make it only add indexes for dimensions that don't already have an index.
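
For concreteness, a purely hypothetical sketch of the assign_default_indexes() method floated in this thread; nothing in the PR implements it, and the name, parameters, and semantics below are only what the discussion suggests:

```python
from collections.abc import Hashable, Iterable

def assign_default_indexes(
    self,
    dims: Iterable[Hashable] | None = None,
    overwrite: bool = False,
):
    """Hypothetical: create default (pandas) indexes for the given
    dimension coordinates, or for all of them if ``dims`` is None.
    Dimensions that already have an index are skipped unless
    ``overwrite=True``."""
    ...
```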


both_data_and_coords = set(data_vars) & set(coords)
if both_data_and_coords:
raise ValueError(
@@ -610,17 +630,34 @@
if isinstance(coords, Dataset):
coords = coords.variables

variables, coord_names, dims, indexes, _ = merge_data_and_coords(
data_vars, coords, compat="broadcast_equals"
variables, coord_names, dims, ds_indexes, _ = merge_data_and_coords(
data_vars,
coords,
compat="broadcast_equals",
create_default_indexes=create_default_indexes,
)

both_indexes_and_coords = set(indexes) & coord_names
if both_indexes_and_coords:
raise ValueError(
f"{both_indexes_and_coords} are found in both indexes and coords"
)
Comment on lines +640 to +644

Member: This check surprises me. As a user, I would expect that I can write something like xarray.Dataset(data_vars=ds.data_vars, coords=ds.coords, attrs=ds.attrs, indexes=ds.xindexes) to make a new object.

Member (Author) @benbovy, Oct 27, 2022: Good point. Would it be reasonable for the coordinates provided via indexes=... (if we use it) to override the coordinates provided via coords=...?

Member: > Would it be reasonable for the coordinates provided via indexes=... to override the coordinates provided via coords=...?

Can you give an example of how this would happen? Do indexes already provide enough information to create a coordinate (this would slightly surprise me)?

I think an error in these cases, or creating a Dataset with inconsistent coordinates/indexes (if it is hard to check), would be fine. I would not silently override coordinates.
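
Concretely, with the check as written above (`set(indexes) & coord_names`), the round trip the first comment describes would indeed raise, since every indexed coordinate appears in both `ds.coords` and `ds.xindexes`:

```python
# Raises ValueError on this branch: e.g. "x" is found in both
# indexes and coords.
ds2 = xr.Dataset(
    data_vars=ds.data_vars,
    coords=ds.coords,
    attrs=ds.attrs,
    indexes=ds.xindexes,
)
```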


variables.update({k: v.copy(deep=False) for k, v in indexes.variables.items()})
coord_names.update(indexes.variables)
ds_indexes.update(indexes)

# re-calculate dimensions if indexes are given explicitly
if indexes:
dims = calculate_dimensions(variables)

self._attrs = dict(attrs) if attrs is not None else None
self._close = None
self._encoding = None
self._variables = variables
self._coord_names = coord_names
self._dims = dims
self._indexes = indexes
self._indexes = ds_indexes

@classmethod
def load_store(cls: type[T_Dataset], store, decoder=None) -> T_Dataset:
@@ -6080,6 +6117,30 @@ def assign(
data.update(results)
return data

def assign_indexes(self, indexes: Indexes[Index]):
"""Assign new indexes to this dataset.

Returns a new dataset with all the original data in addition to the new
indexes (and their corresponding coordinates).

Parameters
----------
indexes : :py:class:`~xarray.Indexes`
A collection of :py:class:`~xarray.indexes.Index` objects
to assign (including their coordinate variables).

Returns
-------
assigned : Dataset
A new dataset with the new indexes and coordinates in addition to
the existing data.
"""
ds_indexes = Dataset(indexes=indexes)
dropped = self.drop_vars(indexes, errors="ignore")
return dropped.merge(
ds_indexes, compat="minimal", join="override", combine_attrs="no_conflicts"
)
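
A usage sketch, assuming this branch; `other` is a hypothetical dataset with a matching "x" dimension that already carries an "x" index:

```python
import xarray as xr

ds_bare = xr.Dataset({"v": ("x", [1.0, 2.0, 3.0])}, indexes={})

# Any existing variables with the same names as the index coordinates are
# dropped first, then the merge with join="override" brings in the new
# indexes and their coordinates.
ds_indexed = ds_bare.assign_indexes(other.xindexes)
```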

def to_array(
self, dim: Hashable = "variable", name: Hashable | None = None
) -> DataArray: