1"""Accessors for arrow-backed data."""
2
3from __future__ import annotations
4
5from abc import (
6 ABCMeta,
7 abstractmethod,
8)
9from typing import (
10 TYPE_CHECKING,
11 cast,
12)
13
14from pandas.compat import (
15 pa_version_under10p1,
16 pa_version_under11p0,
17)
18
19from pandas.core.dtypes.common import is_list_like
20
21if not pa_version_under10p1:
22 import pyarrow as pa
23 import pyarrow.compute as pc
24
25 from pandas.core.dtypes.dtypes import ArrowDtype
26
27if TYPE_CHECKING:
28 from collections.abc import Iterator
29
30 from pandas import (
31 DataFrame,
32 Series,
33 )
34
35
36class ArrowAccessor(metaclass=ABCMeta):
37 @abstractmethod
38 def __init__(self, data, validation_msg: str) -> None:
39 self._data = data
40 self._validation_msg = validation_msg
41 self._validate(data)
42
43 @abstractmethod
44 def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
45 pass
46
47 def _validate(self, data):
48 dtype = data.dtype
49 if not isinstance(dtype, ArrowDtype):
            # Raise AttributeError so that inspect can handle non-Arrow-backed Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

        if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
            # Raise AttributeError so that inspect can handle invalid Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

    @property
    def _pa_array(self):
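        # The pyarrow.ChunkedArray backing the Series values.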
        return self._data.array._pa_array


class ListAccessor(ArrowAccessor):
    """
    Accessor object for list data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow list data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg="Can only use the '.list' accessor with "
            "'list[pyarrow]' dtype, not {dtype}.",
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
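        # Accept all three Arrow list layouts: variable-size, fixed-size,
        # and large (64-bit offset) lists.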
        return (
            pa.types.is_list(pyarrow_dtype)
            or pa.types.is_fixed_size_list(pyarrow_dtype)
            or pa.types.is_large_list(pyarrow_dtype)
        )

    def len(self) -> Series:
        """
        Return the length of each list in the Series.

        Returns
        -------
        pandas.Series
            The length of each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        from pandas import Series

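        # pc.list_value_length emits one count per list; the resulting dtype is
        # taken from the pyarrow result (int32 in the docstring example) rather
        # than hard-coded here.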
        value_lengths = pc.list_value_length(self._pa_array)
        return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))

    def __getitem__(self, key: int | slice) -> Series:
        """
        Index or slice lists in the Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        pandas.Series
            The element or slice at the requested position within each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        if isinstance(key, int):
            # TODO: Support negative key but pyarrow does not allow
            # element index to be an array.
            # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
            element = pc.list_element(self._pa_array, key)
            return Series(element, dtype=ArrowDtype(element.type))
        elif isinstance(key, slice):
            if pa_version_under11p0:
                raise NotImplementedError(
                    f"List slice not supported by pyarrow {pa.__version__}."
                )

            # TODO: Support negative start/stop/step, ideally this would be added
            # upstream in pyarrow.
            start, stop, step = key.start, key.stop, key.step
            if start is None:
                # TODO: When adding negative step support,
                # this should be set to the last element of the array
                # when step is negative.
                start = 0
            if step is None:
                step = 1
            sliced = pc.list_slice(self._pa_array, start, stop, step)
            return Series(sliced, dtype=ArrowDtype(sliced.type))
        else:
            raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

    def __iter__(self) -> Iterator:
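        # Because __getitem__ is defined, Python's legacy iteration protocol would
        # otherwise make the accessor iterable; raise explicitly so callers iterate
        # over the Series itself instead.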
        raise TypeError(f"'{type(self).__name__}' object is not iterable")

    def flatten(self) -> Series:
        """
        Flatten list values.

        Returns
        -------
        pandas.Series
            The data from all lists in the Series flattened.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.flatten()
        0    1
        1    2
        2    3
        3    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

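        # pc.list_flatten concatenates the elements of every list into a single
        # flat array; the result is re-wrapped as an Arrow-backed Series.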
        flattened = pc.list_flatten(self._pa_array)
        return Series(flattened, dtype=ArrowDtype(flattened.type))


class StructAccessor(ArrowAccessor):
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg=(
                "Can only use the '.struct' accessor with 'struct[pyarrow]' "
                "dtype, not {dtype}."
            ),
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return pa.types.is_struct(pyarrow_dtype)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

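        # Iterating a pyarrow StructType yields its child fields; wrap each field's
        # type in ArrowDtype and use the field names as the index.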
        pa_type = self._data.dtype.pyarrow_dtype
        types = [ArrowDtype(struct.type) for struct in pa_type]
        names = [struct.name for struct in pa_type]
        return Series(types, index=Index(names))

    def field(
        self,
        name_or_index: list[str]
        | list[bytes]
        | list[int]
        | pc.Expression
        | bytes
        | str
        | int,
    ) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | bytes | int | pyarrow.compute.Expression | list
            Name or index of the child field to extract.

            For list-like inputs, this will index into a nested
            struct.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Notes
        -----
        The name of the resulting Series will be set using the following
        rules:

        - For string, bytes, or integer `name_or_index` (or a list of these, for
          a nested selection), the Series name is set to the selected
          field's name.
        - For a :class:`pyarrow.compute.Expression`, this is set to
          the string form of the expression.
        - For list-like `name_or_index`, the name will be set to the
          name of the final field selected.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]

        Or an expression

        >>> import pyarrow.compute as pc
        >>> s.struct.field(pc.field("project"))
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        For nested struct types, you can pass a list of values to index
        multiple levels:

        >>> version_type = pa.struct([
        ...     ("major", pa.int64()),
        ...     ("minor", pa.int64()),
        ... ])
        >>> s = pd.Series(
        ...     [
        ...         {"version": {"major": 1, "minor": 5}, "project": "pandas"},
        ...         {"version": {"major": 2, "minor": 1}, "project": "pandas"},
        ...         {"version": {"major": 1, "minor": 26}, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", version_type), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.field(["version", "minor"])
        0     5
        1     1
        2    26
        Name: minor, dtype: int64[pyarrow]
        >>> s.struct.field([0, 0])
        0    1
        1    2
        2    1
        Name: major, dtype: int64[pyarrow]
        """
        from pandas import Series

        def get_name(
            level_name_or_index: list[str]
            | list[bytes]
            | list[int]
            | pc.Expression
            | bytes
            | str
            | int,
            data: pa.ChunkedArray,
        ):
            if isinstance(level_name_or_index, int):
                name = data.type.field(level_name_or_index).name
            elif isinstance(level_name_or_index, (str, bytes)):
                name = level_name_or_index
            elif isinstance(level_name_or_index, pc.Expression):
                name = str(level_name_or_index)
            elif is_list_like(level_name_or_index):
                # For nested input like [2, 1, 2],
                # iteratively get the struct and field name. The last
                # one is used as the name of the resulting Series.
                level_name_or_index = list(reversed(level_name_or_index))
                selected = data
                while level_name_or_index:
                    # we need the cast, otherwise mypy complains about
                    # getting ints, bytes, or str here, which isn't possible.
                    level_name_or_index = cast(list, level_name_or_index)
                    name_or_index = level_name_or_index.pop()
                    name = get_name(name_or_index, selected)
                    selected = selected.type.field(selected.type.get_field_index(name))
                    name = selected.name
            else:
                raise ValueError(
                    "name_or_index must be an int, str, bytes, "
                    "pyarrow.compute.Expression, or list of those"
                )
            return name

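        # Resolve the resulting Series name first, then let pyarrow do the actual
        # extraction: pc.struct_field is passed name_or_index unchanged, so it
        # handles every input form documented above, including nested lists.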
        pa_arr = self._data.array._pa_array
        name = get_name(name_or_index, pa_arr)
        field_arr = pc.struct_field(pa_arr, name_or_index)

        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._data.index,
            name=name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

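        # Extract every child field as a Series and concatenate them column-wise;
        # column order and names follow the struct's field order.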
        pa_type = self._pa_array.type
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )