1"""Accessors for arrow-backed data."""
2
3from __future__ import annotations
4
5from abc import (
6 ABCMeta,
7 abstractmethod,
8)
9from typing import (
10 TYPE_CHECKING,
11 cast,
12)
13
14from pandas.compat import (
15 pa_version_under10p1,
16 pa_version_under11p0,
17)
18
19from pandas.core.dtypes.common import is_list_like
20
21if not pa_version_under10p1:
22 import pyarrow as pa
23 import pyarrow.compute as pc
24
25 from pandas.core.dtypes.dtypes import ArrowDtype
26
27if TYPE_CHECKING:
28 from collections.abc import Iterator
29
30 from pandas import (
31 DataFrame,
32 Series,
33 )
34
35
36class ArrowAccessor(metaclass=ABCMeta):
37 @abstractmethod
38 def __init__(self, data, validation_msg: str) -> None:
39 self._data = data
40 self._validation_msg = validation_msg
41 self._validate(data)
42
43 @abstractmethod
44 def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
45 pass
46
47 def _validate(self, data):
48 dtype = data.dtype
49 if not isinstance(dtype, ArrowDtype):
            # Raise AttributeError so that inspect can handle non-Arrow-backed Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

        if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
            # Raise AttributeError so that inspect can handle invalid Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

    @property
    def _pa_array(self):
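        # The pyarrow.ChunkedArray backing the Series values.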
        return self._data.array._pa_array


class ListAccessor(ArrowAccessor):
    """
    Accessor object for list data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow list data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg="Can only use the '.list' accessor with "
            "'list[pyarrow]' dtype, not {dtype}.",
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
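        # Accept all three Arrow list layouts: variable-size, fixed-size,
        # and large (64-bit offset) lists.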
        return (
            pa.types.is_list(pyarrow_dtype)
            or pa.types.is_fixed_size_list(pyarrow_dtype)
            or pa.types.is_large_list(pyarrow_dtype)
        )

    def len(self) -> Series:
        """
        Return the length of each list in the Series.

        Returns
        -------
        pandas.Series
            The length of each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        from pandas import Series

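        # pc.list_value_length emits one count per list; the resulting dtype is
        # taken from the pyarrow result (int32 in the docstring example) rather
        # than hard-coded here.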
        value_lengths = pc.list_value_length(self._pa_array)
        return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))

    def __getitem__(self, key: int | slice) -> Series:
        """
        Index or slice lists in the Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        pandas.Series
            The element or slice at the requested position within each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        if isinstance(key, int):
            # TODO: Support negative key but pyarrow does not allow
            # element index to be an array.
            # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
            element = pc.list_element(self._pa_array, key)
            return Series(element, dtype=ArrowDtype(element.type))
        elif isinstance(key, slice):
            if pa_version_under11p0:
                raise NotImplementedError(
                    f"List slice not supported by pyarrow {pa.__version__}."
                )

            # TODO: Support negative start/stop/step, ideally this would be added
            # upstream in pyarrow.
            start, stop, step = key.start, key.stop, key.step
            if start is None:
                # TODO: When adding negative step support,
                # this should be set to the last element of the array
                # when step is negative.
                start = 0
            if step is None:
                step = 1
            sliced = pc.list_slice(self._pa_array, start, stop, step)
            return Series(sliced, dtype=ArrowDtype(sliced.type))
        else:
            raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

    def __iter__(self) -> Iterator:
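        # Because __getitem__ is defined, Python's legacy iteration protocol would
        # otherwise make the accessor iterable; raise explicitly so callers iterate
        # over the Series itself instead.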
        raise TypeError(f"'{type(self).__name__}' object is not iterable")

    def flatten(self) -> Series:
        """
        Flatten list values.

        Returns
        -------
        pandas.Series
            The data from all lists in the Series flattened.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.flatten()
        0    1
        1    2
        2    3
        3    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

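        # pc.list_flatten concatenates the elements of every list into a single
        # flat array; the result is re-wrapped as an Arrow-backed Series.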
        flattened = pc.list_flatten(self._pa_array)
        return Series(flattened, dtype=ArrowDtype(flattened.type))


class StructAccessor(ArrowAccessor):
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg=(
                "Can only use the '.struct' accessor with 'struct[pyarrow]' "
                "dtype, not {dtype}."
            ),
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return pa.types.is_struct(pyarrow_dtype)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

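        # Iterating a pyarrow StructType yields its child fields; wrap each field's
        # type in ArrowDtype and use the field names as the index.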
        pa_type = self._data.dtype.pyarrow_dtype
        types = [ArrowDtype(struct.type) for struct in pa_type]
        names = [struct.name for struct in pa_type]
        return Series(types, index=Index(names))

    def field(
        self,
        name_or_index: list[str]
        | list[bytes]
        | list[int]
        | pc.Expression
        | bytes
        | str
        | int,
    ) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | bytes | int | pyarrow.compute.Expression | list
            Name or index of the child field to extract.

            For list-like inputs, this will index into a nested
            struct.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Notes
        -----
        The name of the resulting Series will be set using the following
        rules:

        - For string, bytes, or integer `name_or_index` (or a list of these, for
          a nested selection), the Series name is set to the selected
          field's name.
        - For a :class:`pyarrow.compute.Expression`, this is set to
          the string form of the expression.
        - For list-like `name_or_index`, the name will be set to the
          name of the final field selected.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]

        Or an expression

        >>> import pyarrow.compute as pc
        >>> s.struct.field(pc.field("project"))
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        For nested struct types, you can pass a list of values to index
        multiple levels:

        >>> version_type = pa.struct([
        ...     ("major", pa.int64()),
        ...     ("minor", pa.int64()),
        ... ])
        >>> s = pd.Series(
        ...     [
        ...         {"version": {"major": 1, "minor": 5}, "project": "pandas"},
        ...         {"version": {"major": 2, "minor": 1}, "project": "pandas"},
        ...         {"version": {"major": 1, "minor": 26}, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", version_type), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.field(["version", "minor"])
        0     5
        1     1
        2    26
        Name: minor, dtype: int64[pyarrow]
        >>> s.struct.field([0, 0])
        0    1
        1    2
        2    1
        Name: major, dtype: int64[pyarrow]
        """
        from pandas import Series

        def get_name(
            level_name_or_index: list[str]
            | list[bytes]
            | list[int]
            | pc.Expression
            | bytes
            | str
            | int,
            data: pa.ChunkedArray,
        ):
            if isinstance(level_name_or_index, int):
                name = data.type.field(level_name_or_index).name
            elif isinstance(level_name_or_index, (str, bytes)):
                name = level_name_or_index
            elif isinstance(level_name_or_index, pc.Expression):
                name = str(level_name_or_index)
            elif is_list_like(level_name_or_index):
                # For nested input like [2, 1, 2],
                # iteratively get the struct and field name. The last
                # one is used as the name of the resulting Series.
                level_name_or_index = list(reversed(level_name_or_index))
                selected = data
                while level_name_or_index:
                    # we need the cast, otherwise mypy complains about
                    # getting ints, bytes, or str here, which isn't possible.
                    level_name_or_index = cast(list, level_name_or_index)
                    name_or_index = level_name_or_index.pop()
                    name = get_name(name_or_index, selected)
                    selected = selected.type.field(selected.type.get_field_index(name))
                    name = selected.name
            else:
                raise ValueError(
                    "name_or_index must be an int, str, bytes, "
                    "pyarrow.compute.Expression, or list of those"
                )
            return name

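        # Resolve the resulting Series name first, then let pyarrow do the actual
        # extraction: pc.struct_field is passed name_or_index unchanged, so it
        # handles every input form documented above, including nested lists.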
        pa_arr = self._data.array._pa_array
        name = get_name(name_or_index, pa_arr)
        field_arr = pc.struct_field(pa_arr, name_or_index)

        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._data.index,
            name=name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

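        # Extract every child field as a Series and concatenate them column-wise;
        # column order and names follow the struct's field order.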
        pa_type = self._pa_array.type
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )