Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/arrow/accessors.py: 32%


101 statements  

1"""Accessors for arrow-backed data.""" 

2 

3from __future__ import annotations 

4 

5from abc import ( 

6 ABCMeta, 

7 abstractmethod, 

8) 

9from typing import ( 

10 TYPE_CHECKING, 

11 cast, 

12) 

13 

14from pandas.compat import ( 

15 pa_version_under10p1, 

16 pa_version_under11p0, 

17) 

18 

19from pandas.core.dtypes.common import is_list_like 

20 

21if not pa_version_under10p1: 

22 import pyarrow as pa 

23 import pyarrow.compute as pc 

24 

25 from pandas.core.dtypes.dtypes import ArrowDtype 

26 

27if TYPE_CHECKING: 

28 from collections.abc import Iterator 

29 

30 from pandas import ( 

31 DataFrame, 

32 Series, 

33 ) 

34 

35 

36class ArrowAccessor(metaclass=ABCMeta): 

37 @abstractmethod 

38 def __init__(self, data, validation_msg: str) -> None: 

39 self._data = data 

40 self._validation_msg = validation_msg 

41 self._validate(data) 

42 

43 @abstractmethod 

44 def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: 

45 pass 

46 

47 def _validate(self, data): 

48 dtype = data.dtype 

49 if not isinstance(dtype, ArrowDtype): 

50 # Raise AttributeError so that inspect can handle non-struct Series. 

51 raise AttributeError(self._validation_msg.format(dtype=dtype)) 

52 

53 if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype): 

54 # Raise AttributeError so that inspect can handle invalid Series. 

55 raise AttributeError(self._validation_msg.format(dtype=dtype)) 

56 

57 @property 

58 def _pa_array(self): 

59 return self._data.array._pa_array 
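# --- Added usage sketch (not part of the pandas source) ----------------------
# _validate raises AttributeError rather than TypeError, so attribute probing
# with getattr()/hasattr() degrades gracefully when a Series is not backed by
# the expected Arrow type. Illustrative doctest-style output, assuming a pandas
# build where these accessors are registered:
#
# >>> import pandas as pd
# >>> pd.Series([1, 2, 3]).list
# Traceback (most recent call last):
#     ...
# AttributeError: Can only use the '.list' accessor with 'list[pyarrow]' dtype, not int64.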

class ListAccessor(ArrowAccessor):
    """
    Accessor object for list data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow list data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg="Can only use the '.list' accessor with "
            "'list[pyarrow]' dtype, not {dtype}.",
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return (
            pa.types.is_list(pyarrow_dtype)
            or pa.types.is_fixed_size_list(pyarrow_dtype)
            or pa.types.is_large_list(pyarrow_dtype)
        )

    def len(self) -> Series:
        """
        Return the length of each list in the Series.

        Returns
        -------
        pandas.Series
            The length of each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        from pandas import Series

        value_lengths = pc.list_value_length(self._pa_array)
        return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
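    # --- Added note (not part of the pandas source) ---------------------------
    # The result dtype simply mirrors pyarrow.compute.list_value_length: to the
    # best of my knowledge it returns int32 for list/fixed_size_list inputs and
    # int64 for large_list inputs, which is why the docstring example above
    # shows int32[pyarrow].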

    def __getitem__(self, key: int | slice) -> Series:
        """
        Index or slice lists in the Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        pandas.Series
            The element or slice at the requested index from each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        if isinstance(key, int):
            # TODO: Support negative key but pyarrow does not allow
            # element index to be an array.
            # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
            element = pc.list_element(self._pa_array, key)
            return Series(element, dtype=ArrowDtype(element.type))
        elif isinstance(key, slice):
            if pa_version_under11p0:
                raise NotImplementedError(
                    f"List slice not supported by pyarrow {pa.__version__}."
                )

            # TODO: Support negative start/stop/step, ideally this would be added
            # upstream in pyarrow.
            start, stop, step = key.start, key.stop, key.step
            if start is None:
                # TODO: When adding negative step support
                # this should be set to the last element of the array
                # when step is negative.
                start = 0
            if step is None:
                step = 1
            sliced = pc.list_slice(self._pa_array, start, stop, step)
            return Series(sliced, dtype=ArrowDtype(sliced.type))
        else:
            raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
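    # --- Added usage sketch (not part of the pandas source) -------------------
    # Slices delegate to pyarrow.compute.list_slice and therefore require
    # pyarrow >= 11.0; negative start/stop/step are not yet supported.
    # Illustrative output for the Series used in the docstring above
    # (exact repr may vary across pandas/pyarrow versions):
    #
    # >>> s.list[:1]
    # 0    [1]
    # 1    [3]
    # dtype: list<item: int64>[pyarrow]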

    def __iter__(self) -> Iterator:
        raise TypeError(f"'{type(self).__name__}' object is not iterable")

    def flatten(self) -> Series:
        """
        Flatten list values.

        Returns
        -------
        pandas.Series
            The data from all lists in the series flattened.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.flatten()
        0    1
        1    2
        2    3
        3    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        flattened = pc.list_flatten(self._pa_array)
        return Series(flattened, dtype=ArrowDtype(flattened.type))
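    # --- Added note (not part of the pandas source) ---------------------------
    # flatten() changes the length of the data, so the result is built with a
    # fresh default RangeIndex rather than reusing the original Series index
    # (compare the 4-row output with the 2-row input in the docstring above).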

class StructAccessor(ArrowAccessor):
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg=(
                "Can only use the '.struct' accessor with 'struct[pyarrow]' "
                "dtype, not {dtype}."
            ),
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return pa.types.is_struct(pyarrow_dtype)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        pa_type = self._data.dtype.pyarrow_dtype
        types = [ArrowDtype(struct.type) for struct in pa_type]
        names = [struct.name for struct in pa_type]
        return Series(types, index=Index(names))

    def field(
        self,
        name_or_index: list[str]
        | list[bytes]
        | list[int]
        | pc.Expression
        | bytes
        | str
        | int,
    ) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | bytes | int | expression | list
            Name or index of the child field to extract.

            For list-like inputs, this will index into a nested
            struct.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Notes
        -----
        The name of the resulting Series will be set using the following
        rules:

        - For string, bytes, or integer `name_or_index` (or a list of these, for
          a nested selection), the Series name is set to the selected
          field's name.
        - For a :class:`pyarrow.compute.Expression`, this is set to
          the string form of the expression.
        - For list-like `name_or_index`, the name will be set to the
          name of the final field selected.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]

        Or an expression

        >>> import pyarrow.compute as pc
        >>> s.struct.field(pc.field("project"))
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        For nested struct types, you can pass a list of values to index
        multiple levels:

        >>> version_type = pa.struct([
        ...     ("major", pa.int64()),
        ...     ("minor", pa.int64()),
        ... ])
        >>> s = pd.Series(
        ...     [
        ...         {"version": {"major": 1, "minor": 5}, "project": "pandas"},
        ...         {"version": {"major": 2, "minor": 1}, "project": "pandas"},
        ...         {"version": {"major": 1, "minor": 26}, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", version_type), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.field(["version", "minor"])
        0     5
        1     1
        2    26
        Name: minor, dtype: int64[pyarrow]
        >>> s.struct.field([0, 0])
        0    1
        1    2
        2    1
        Name: major, dtype: int64[pyarrow]
        """
        from pandas import Series

        def get_name(
            level_name_or_index: list[str]
            | list[bytes]
            | list[int]
            | pc.Expression
            | bytes
            | str
            | int,
            data: pa.ChunkedArray,
        ):
            if isinstance(level_name_or_index, int):
                name = data.type.field(level_name_or_index).name
            elif isinstance(level_name_or_index, (str, bytes)):
                name = level_name_or_index
            elif isinstance(level_name_or_index, pc.Expression):
                name = str(level_name_or_index)
            elif is_list_like(level_name_or_index):
                # For nested input like [2, 1, 2], iteratively walk the struct
                # types and field names. The last field name found is used as
                # the name of the resulting Series.
                level_name_or_index = list(reversed(level_name_or_index))
                selected = data
                while level_name_or_index:
                    # we need the cast, otherwise mypy complains about
                    # getting ints, bytes, or str here, which isn't possible.
                    level_name_or_index = cast(list, level_name_or_index)
                    name_or_index = level_name_or_index.pop()
                    name = get_name(name_or_index, selected)
                    selected = selected.type.field(selected.type.get_field_index(name))
                    name = selected.name
            else:
                raise ValueError(
                    "name_or_index must be an int, str, bytes, "
                    "pyarrow.compute.Expression, or list of those"
                )
            return name

        pa_arr = self._data.array._pa_array
        name = get_name(name_or_index, pa_arr)
        field_arr = pc.struct_field(pa_arr, name_or_index)

        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._data.index,
            name=name,
        )
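    # --- Added note (not part of the pandas source) ---------------------------
    # The (possibly nested) selection itself is delegated wholesale to
    # pyarrow.compute.struct_field; the recursive get_name() helper above only
    # walks the struct type to derive a sensible name for the returned Series.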

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        pa_type = self._pa_array.type
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )
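    # --- Added usage sketch (not part of the pandas source) -------------------
    # explode() is just the column-wise concatenation of field(i) for every
    # child field, so for the docstring Series above it is equivalent to:
    #
    # >>> pd.concat(
    # ...     [s.struct.field("version"), s.struct.field("project")],
    # ...     axis="columns",
    # ... )
    # which produces the same DataFrame as s.struct.explode().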