-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
(fix): extension array indexers #9671
base: main
Are you sure you want to change the base?
Changes from all commits
7b5f323
8784f33
b45ab23
39086ef
df49a40
adb8ca3
266b1ed
6d5f13b
5d68bfe
07bba69
6e7f0bb
b4a49bb
2e1ff4f
d5a7da0
821b68d
d066edf
ed22da1
8bf23f4
c3a2b39
3c44aed
48be73a
7ac9983
d86ad04
b5d0795
60324f0
c2bc4df
84569bc
90e390d
7c32bd0
795ecf6
8eca6e9
fb91812
a06f2b1
14027e8
a47a96f
6f2861a
1f07500
798b444
f487599
20d6c9d
7391948
308091c
5f40b4e
2a65d8d
43f7d61
59934b9
a01f9f3
8b91128
0e351ca
07a8e9c
2be5739
b9d0a8e
08afc3b
edc55e1
bffe919
150b982
7113ceb
7f47f0b
512808d
63c83f4
0efbbeb
2910250
57d8d72
5333240
6f35c81
d0c17a4
b2b6bb1
5dbc8a7
be0d3e0
f95408a
609e15c
ec7f165
1f1cf1c
14b1a88
05627dd
e7cbf3a
fc87e04
9ae645e
6e3ca57
277d1c6
81a9d94
be8642f
f3f62e5
c07df41
ae49850
e8f5aa8
9653a01
a405f03
503b313
c8ab8f3
66e5b06
f9fde3a
8a3e834
f5822fd
66c0b9f
ba51274
3ba3e3f
1ab43eb
45ba9d3
091a90d
ab3c9ed
53fe43a
a16a890
4283f8a
0ba848d
6cb8702
5de8d0d
fc985d9
799b750
a2d8e69
d6fe956
bd6a5d1
6557ef9
759fb72
2118191
262295a
941c4b5
6e41425
adebafa
6cd81e5
1ae9a22
1cec644
60aed87
797dc85
225c5b3
33a1563
4efe8b0
21a0ec6
48dea20
1145f4b
a9990cf
5fa630f
43c85d1
a4702d6
9bd292a
25b797e
3bd8cf4
8b9c85a
2555d89
3b2d861
66e181c
e380968
305938c
b32b02c
a2c46b1
fa2c4b6
c65c9af
21dffc1
8eeeb78
7ad2183
9e4cab6
5964a9e
308391d
0e886d6
80dc10b
d494fe0
ef6f722
4ea5241
151e9cd
8ecda4e
2bbf0ff
0308672
b043020
7182ce2
470235e
e619a4c
700e78d
4525ea1
0b93dbd
b38cd7e
a2d1c96
c4b2af3
45a0d56
3ef79cd
ac719e8
5292569
ecd603b
f6716dc
0556376
ffc1828
eaf3c73
1e6ba18
85a340b
9d77885
db69b63
b120917
f7cda22
4d70fd1
666d279
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,10 +10,11 @@ | |
from dataclasses import dataclass, field | ||
from datetime import timedelta | ||
from html import escape | ||
from typing import TYPE_CHECKING, Any, overload | ||
from typing import TYPE_CHECKING, Any, cast, overload | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from numpy.typing import DTypeLike | ||
from packaging.version import Version | ||
|
||
from xarray.core import duck_array_ops | ||
|
@@ -33,8 +34,6 @@ | |
from xarray.namedarray.pycompat import array_type, integer_types, is_chunked_array | ||
|
||
if TYPE_CHECKING: | ||
from numpy.typing import DTypeLike | ||
|
||
from xarray.core.indexes import Index | ||
from xarray.core.types import Self | ||
from xarray.core.variable import Variable | ||
|
@@ -1707,27 +1706,44 @@ class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin): | |
__slots__ = ("_dtype", "array") | ||
|
||
array: pd.Index | ||
_dtype: np.dtype | ||
_dtype: np.dtype | pd.api.extensions.ExtensionDtype | ||
|
||
def __init__(self, array: pd.Index, dtype: DTypeLike = None): | ||
def __init__( | ||
self, | ||
array: pd.Index, | ||
dtype: DTypeLike | pd.api.extensions.ExtensionDtype | None = None, | ||
): | ||
from xarray.core.indexes import safe_cast_to_index | ||
|
||
self.array = safe_cast_to_index(array) | ||
|
||
if dtype is None: | ||
self._dtype = get_valid_numpy_dtype(array) | ||
if pd.api.types.is_extension_array_dtype(array.dtype): | ||
cast(pd.api.extensions.ExtensionDtype, array.dtype) | ||
self._dtype = array.dtype | ||
else: | ||
self._dtype = get_valid_numpy_dtype(array) | ||
elif pd.api.types.is_extension_array_dtype(dtype): | ||
self._dtype = cast(pd.api.extensions.ExtensionDtype, dtype) | ||
else: | ||
self._dtype = np.dtype(dtype) | ||
self._dtype = np.dtype(cast(DTypeLike, dtype)) | ||
|
||
@property | ||
def dtype(self) -> np.dtype: | ||
def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override] | ||
return self._dtype | ||
|
||
def __array__( | ||
self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None | ||
self, | ||
dtype: np.typing.DTypeLike | pd.api.extensions.ExtensionDtype | None = None, | ||
/, | ||
*, | ||
copy: bool | None = None, | ||
) -> np.ndarray: | ||
if dtype is None: | ||
dtype = self.dtype | ||
if pd.api.types.is_extension_array_dtype(dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here. Why is this needed? |
||
dtype = get_valid_numpy_dtype(self.array) | ||
dtype = cast(np.dtype, dtype) | ||
array = self.array | ||
if isinstance(array, pd.PeriodIndex): | ||
with suppress(AttributeError): | ||
|
@@ -1746,7 +1762,7 @@ def get_duck_array(self) -> np.ndarray: | |
def shape(self) -> _Shape: | ||
return (len(self.array),) | ||
|
||
def _convert_scalar(self, item): | ||
def _convert_scalar(self, item) -> np.ndarray: | ||
if item is pd.NaT: | ||
# work around the impossibility of casting NaT with asarray | ||
# note: it probably would be better in general to return | ||
|
@@ -1762,7 +1778,10 @@ def _convert_scalar(self, item): | |
# numpy fails to convert pd.Timestamp to np.datetime64[ns] | ||
item = np.asarray(item.to_datetime64()) | ||
elif self.dtype != object: | ||
item = np.asarray(item, dtype=self.dtype) | ||
dtype = self.dtype | ||
if pd.api.types.is_extension_array_dtype(dtype): | ||
dtype = get_valid_numpy_dtype(self.array) | ||
item = np.asarray(item, dtype=cast(np.dtype, dtype)) | ||
|
||
# as for numpy.ndarray indexing, we always want the result to be | ||
# a NumPy array. | ||
|
@@ -1877,23 +1896,30 @@ class PandasMultiIndexingAdapter(PandasIndexingAdapter): | |
__slots__ = ("_dtype", "adapter", "array", "level") | ||
|
||
array: pd.MultiIndex | ||
_dtype: np.dtype | ||
_dtype: np.dtype | pd.api.extensions.ExtensionDtype | ||
level: str | None | ||
|
||
def __init__( | ||
self, | ||
array: pd.MultiIndex, | ||
dtype: DTypeLike = None, | ||
dtype: DTypeLike | pd.api.extensions.ExtensionDtype | None = None, | ||
level: str | None = None, | ||
): | ||
super().__init__(array, dtype) | ||
self.level = level | ||
|
||
def __array__( | ||
self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None | ||
self, | ||
dtype: DTypeLike | pd.api.extensions.ExtensionDtype | None = None, | ||
/, | ||
*, | ||
copy: bool | None = None, | ||
) -> np.ndarray: | ||
if dtype is None: | ||
dtype = self.dtype | ||
if pd.api.types.is_extension_array_dtype(dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this needed? Why would someone call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is for internal usage, otherwise I wouldn't have added it. I can delete the line and then see what happens, and then comment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dcherian This class is basically an internal adapter so anything that asks for its data in numpy form will call this. Things like |
||
dtype = get_valid_numpy_dtype(self.array) | ||
dtype = cast(np.dtype, dtype) | ||
if self.level is not None: | ||
return np.asarray( | ||
self.array.get_level_values(self.level).values, dtype=dtype | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
) | ||
|
||
import numpy as np | ||
import pandas as pd | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NamedArray is supposed to not have a dependency on pandas. |
||
|
||
# TODO: get rid of this after migrating this class to array API | ||
from xarray.core import dtypes, formatting, formatting_html | ||
|
@@ -261,7 +262,7 @@ | |
attrs: _AttrsLike = None, | ||
): | ||
self._data = data | ||
self._dims = self._parse_dimensions(dims) | ||
Check warning on line 265 in xarray/namedarray/core.py GitHub Actions / ubuntu-latest py3.10 bare-minimum
Check warning on line 265 in xarray/namedarray/core.py GitHub Actions / ubuntu-latest py3.10 bare-minimum
|
||
self._attrs = dict(attrs) if attrs else None | ||
|
||
def __init_subclass__(cls, **kwargs: Any) -> None: | ||
|
@@ -834,7 +835,10 @@ | |
if chunkmanager.is_chunked_array(data_old): | ||
data_chunked = chunkmanager.rechunk(data_old, chunks) # type: ignore[arg-type] | ||
else: | ||
if not isinstance(data_old, ExplicitlyIndexed): | ||
if pd.api.types.is_extension_array_dtype(data_old.dtype): | ||
# One of PandasExtensionArray or PandasIndexingAdapter? | ||
ndata = data_old.array.to_numpy() | ||
Comment on lines
+838
to
+840
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I haven't used extension arrays myself that much, why can't a simple |
||
elif not isinstance(data_old, ExplicitlyIndexed): | ||
ndata = data_old | ||
else: | ||
# Unambiguously handle array storage backends (like NetCDF4 and h5py) | ||
|
@@ -845,7 +849,7 @@ | |
# Using OuterIndexer is a pragmatic choice: dask does not yet handle | ||
# different indexing types in an explicit way: | ||
# https://github.com/dask/dask/issues/2883 | ||
ndata = ImplicitToExplicitIndexingAdapter(data_old, OuterIndexer) # type: ignore[assignment] | ||
ndata = ImplicitToExplicitIndexingAdapter(data_old, OuterIndexer) | ||
|
||
if is_dict_like(chunks): | ||
chunks = tuple(chunks.get(n, s) for n, s in enumerate(ndata.shape)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is amazing, it enables IntervalIndex indexing now.
cc @benbovy