
from __future__ import annotations

import ctypes
import re
from typing import Any

import numpy as np

from pandas.compat._optional import import_optional_dependency
from pandas.errors import SettingWithCopyError

import pandas as pd
from pandas.core.interchange.dataframe_protocol import (
    Buffer,
    Column,
    ColumnNullType,
    DataFrame as DataFrameXchg,
    DtypeKind,
)
from pandas.core.interchange.utils import (
    ArrowCTypes,
    Endianness,
)

_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
    DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
    DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
    DtypeKind.BOOL: {1: bool, 8: bool},
}
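# Illustrative note (hedged, not in the original module): the nested lookup
# above maps a protocol (kind, bit_width) pair to a NumPy type, e.g.
# _NP_DTYPES[DtypeKind.INT][64] is np.int64; pairs absent from the mapping
# surface as the NotImplementedError raised in ``buffer_to_ndarray``.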



def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
    """
    Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. `__dataframe__` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame

    Examples
    --------
    >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
    >>> interchange_object.column_names()
    Index(['A', 'B'], dtype='object')
    >>> df_pandas = (pd.api.interchange.from_dataframe
    ...     (interchange_object.select_columns_by_name(['A'])))
    >>> df_pandas
       A
    0  1
    1  2

    These methods (``column_names``, ``select_columns_by_name``) should work
    for any dataframe library which implements the interchange protocol.
    """
    if isinstance(df, pd.DataFrame):
        return df

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    return _from_dataframe(
        df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
    )



def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True):
    """
    Build a ``pd.DataFrame`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. `__dataframe__` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    pandas_dfs = []
    for chunk in df.get_chunks():
        pandas_df = protocol_df_chunk_to_pandas(chunk)
        pandas_dfs.append(pandas_df)

    if not allow_copy and len(pandas_dfs) > 1:
        raise RuntimeError(
            "To join chunks a copy is required which is forbidden by allow_copy=False"
        )
    if not pandas_dfs:
        pandas_df = protocol_df_chunk_to_pandas(df)
    elif len(pandas_dfs) == 1:
        pandas_df = pandas_dfs[0]
    else:
        pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)

    index_obj = df.metadata.get("pandas.index", None)
    if index_obj is not None:
        pandas_df.index = index_obj

    return pandas_df



def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
    """
    Convert interchange protocol chunk to ``pd.DataFrame``.

    Parameters
    ----------
    df : DataFrameXchg

    Returns
    -------
    pd.DataFrame
    """
    # We need a dict of columns here, with each column being a NumPy array (at
    # least for now, deal with non-NumPy dtypes later).
    columns: dict[str, Any] = {}
    buffers = []  # hold on to buffers, keeps memory alive
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")
        col = df.get_column_by_name(name)
        dtype = col.dtype[0]
        if dtype in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
        ):
            columns[name], buf = primitive_column_to_ndarray(col)
        elif dtype == DtypeKind.CATEGORICAL:
            columns[name], buf = categorical_column_to_series(col)
        elif dtype == DtypeKind.STRING:
            columns[name], buf = string_column_to_ndarray(col)
        elif dtype == DtypeKind.DATETIME:
            columns[name], buf = datetime_column_to_ndarray(col)
        else:
            raise NotImplementedError(f"Data type {dtype} not handled yet")

        buffers.append(buf)

    pandas_df = pd.DataFrame(columns)
    pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
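    # The buffers stashed on ``attrs`` above keep the foreign memory alive for
    # the lifetime of the returned DataFrame: the ndarrays built from the
    # protocol buffers do not own their data (see ``buffer_to_ndarray``'s Notes).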

    return pandas_df


def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding one of the primitive dtypes to a NumPy array.

    A primitive type is one of: int, uint, float, bool.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    data_buff, data_dtype = buffers["data"]
    data = buffer_to_ndarray(
        data_buff, data_dtype, offset=col.offset, length=col.size()
    )

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers


def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
    """
    Convert a column holding categorical data to a pandas Series.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of pd.Series holding the data and the memory owner object
        that keeps the memory alive.
    """
    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError("Non-dictionary categoricals not supported yet")

    cat_column = categorical["categories"]
    if hasattr(cat_column, "_col"):
        # Item "Column" of "Optional[Column]" has no attribute "_col"
        # Item "None" of "Optional[Column]" has no attribute "_col"
        categories = np.array(cat_column._col)  # type: ignore[union-attr]
    else:
        raise NotImplementedError(
            "Interchanging categorical columns isn't supported yet, and our "
            "fallback of using the `col._col` attribute (a ndarray) failed."
        )
    buffers = col.get_buffers()

    codes_buff, codes_dtype = buffers["data"]
    codes = buffer_to_ndarray(
        codes_buff, codes_dtype, offset=col.offset, length=col.size()
    )

    # Doing modulo in order to not get ``IndexError`` for
    # out-of-bounds sentinel values in `codes`
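    # Hedged, illustrative values (not from the original source): with
    # categories ["a", "b"] and codes [0, -1, 1], ``codes % 2`` yields
    # [0, 1, 1], keeping the lookup below in bounds; the sentinel position
    # is subsequently masked out by ``set_nulls``.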

    if len(categories) > 0:
        values = categories[codes % len(categories)]
    else:
        values = codes

    cat = pd.Categorical(
        values, categories=categories, ordered=categorical["is_ordered"]
    )
    data = pd.Series(cat)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers


def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding string data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    null_kind, sentinel_val = col.describe_null

    if null_kind not in (
        ColumnNullType.NON_NULLABLE,
        ColumnNullType.USE_BITMASK,
        ColumnNullType.USE_BYTEMASK,
    ):
        raise NotImplementedError(
            f"{null_kind} null kind is not yet supported for string columns."
        )

    buffers = col.get_buffers()

    assert buffers["offsets"], "String buffers must contain offsets"
    # Retrieve the data buffer containing the UTF-8 code units
    data_buff, _ = buffers["data"]
    # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
    assert col.dtype[2] in (
        ArrowCTypes.STRING,
        ArrowCTypes.LARGE_STRING,
    )  # format_str == utf-8
    # Convert the buffers to NumPy arrays. In order to go from STRING to
    # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
    data_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        Endianness.NATIVE,
    )
    # Specify zero offset as we don't want to chunk the string data
    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)

    # Retrieve the offsets buffer containing the index offsets demarcating
    # the beginning and the ending of each string
    offset_buff, offset_dtype = buffers["offsets"]
    # The offsets buffer contains start-stop positions of strings in the data
    # buffer, so it holds one more element than the number of strings; pass
    # `col.size() + 1` here to request a properly sized offsets buffer
    offsets = buffer_to_ndarray(
        offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
    )
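    # Hedged, illustrative values (not from the original source): for
    # data = b"helloworld" and offsets = [0, 5, 10], the loop below slices
    # data[0:5] -> "hello" and data[5:10] -> "world".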


    null_pos = None
    if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        validity = buffers["validity"]
        if validity is not None:
            valid_buff, valid_dtype = validity
            null_pos = buffer_to_ndarray(
                valid_buff, valid_dtype, offset=col.offset, length=col.size()
            )
            if sentinel_val == 0:
                null_pos = ~null_pos

    # Assemble the strings from the code units
    str_list: list[None | float | str] = [None] * col.size()
    for i in range(col.size()):
        # Check for missing values
        if null_pos is not None and null_pos[i]:
            str_list[i] = np.nan
            continue

        # Extract a range of code units
        units = data[offsets[i] : offsets[i + 1]]

        # Convert the list of code units to bytes
        str_bytes = bytes(units)

        # Create the string
        string = str_bytes.decode(encoding="utf-8")

        # Add to our list of strings
        str_list[i] = string

    # Convert the string list to a NumPy array
    return np.asarray(str_list, dtype="object"), buffers


def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
    """Parse datetime `format_str` to interpret the `data`."""
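    # Hedged examples of the format strings matched below (values illustrative):
    #   "tsu:"           -> datetime64[us], timezone-naive
    #   "tsn:Asia/Tokyo" -> nanosecond timestamps localized to UTC, then
    #                       converted to Asia/Tokyo
    #   "tdD"            -> days since the epoch, widened to datetime64[s]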

    # timestamp 'ts{unit}:tz'
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta:
        unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
        if unit != "s":
            # the format string describes only a first letter of the unit, so
            # add one extra letter to convert the unit to numpy-style:
            # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
            unit += "s"
        data = data.astype(f"datetime64[{unit}]")
        if tz != "":
            data = pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(tz)
        return data

    # date 'td{Days/Ms}'
    date_meta = re.match(r"td([Dm])", format_str)
    if date_meta:
        unit = date_meta.group(1)
        if unit == "D":
            # NumPy doesn't support DAY unit, so converting days to seconds
            # (converting to uint64 to avoid overflow)
            data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
        elif unit == "m":
            data = data.astype("datetime64[ms]")
        else:
            raise NotImplementedError(f"Date unit is not supported: {unit}")
        return data

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")


def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any]:
    """
    Convert a column holding DateTime data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    _, col_bit_width, format_str, _ = col.dtype
    dbuf, _ = buffers["data"]
    # Consider the dtype being a signed integer count of units passed since
    # 1970-01-01 (the Unix epoch)
    data = buffer_to_ndarray(
        dbuf,
        (
            DtypeKind.INT,
            col_bit_width,
            getattr(ArrowCTypes, f"INT{col_bit_width}"),
            Endianness.NATIVE,
        ),
        offset=col.offset,
        length=col.size(),
    )

    data = parse_datetime_format_str(format_str, data)  # type: ignore[assignment]
    data = set_nulls(data, col, buffers["validity"])
    return data, buffers



def buffer_to_ndarray(
    buffer: Buffer,
    dtype: tuple[DtypeKind, int, str, str],
    *,
    length: int,
    offset: int = 0,
) -> np.ndarray:
    """
    Build a NumPy array from the passed buffer.

    Parameters
    ----------
    buffer : Buffer
        Buffer to build a NumPy array from.
    dtype : tuple
        Data type of the buffer conforming protocol dtypes format.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    length : int
        Number of elements to read from the buffer; for a bit-mask buffer
        this is the number of bits.

    Returns
    -------
    np.ndarray

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function is
    responsible for keeping the memory owner object alive as long as
    the returned NumPy array is being used.
    """
    kind, bit_width, _, _ = dtype

    column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
    if column_dtype is None:
        raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")

    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
    # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
    # it since https://github.com/numpy/numpy/pull/19083
    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)

    if bit_width == 1:
        assert length is not None, "`length` must be specified for a bit-mask buffer."
        pa = import_optional_dependency("pyarrow")
        arr = pa.BooleanArray.from_buffers(
            pa.bool_(),
            length,
            [None, pa.foreign_buffer(buffer.ptr, length)],
            offset=offset,
        )
        return np.asarray(arr)
    else:
        data_pointer = ctypes.cast(
            buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
        )
        if length > 0:
            return np.ctypeslib.as_array(data_pointer, shape=(length,))
        return np.array([], dtype=ctypes_type)
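# A hedged, illustrative call (the variable name ``buf`` is an assumption):
# for a protocol buffer backing eight float64 values,
#     buffer_to_ndarray(
#         buf,
#         (DtypeKind.FLOAT, 64, ArrowCTypes.FLOAT64, Endianness.NATIVE),
#         offset=0,
#         length=8,
#     )
# returns a zero-copy view; the caller must keep ``buf`` alive while the
# view is in use.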



def set_nulls(
    data: np.ndarray | pd.Series,
    col: Column,
    validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
    allow_modify_inplace: bool = True,
):
    """
    Set null values for the data according to the column null kind.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Data to set nulls in.
    col : Column
        Column object that describes the `data`.
    validity : tuple(Buffer, dtype) or None
        The ``"validity"`` entry of ``col.get_buffers()``. We do not call
        ``col.get_buffers()`` here so as not to take ownership of the memory
        of the buffer objects.
    allow_modify_inplace : bool, default: True
        Whether to modify the `data` inplace when zero-copy is possible (True) or always
        modify a copy of the `data` (False).

    Returns
    -------
    np.ndarray or pd.Series
        Data with the nulls being set.
    """
    if validity is None:
        return data
    null_kind, sentinel_val = col.describe_null
    null_pos = None

    if null_kind == ColumnNullType.USE_SENTINEL:
        null_pos = pd.Series(data) == sentinel_val
    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert validity, "Expected to have a validity buffer for the mask"
        valid_buff, valid_dtype = validity
        null_pos = buffer_to_ndarray(
            valid_buff, valid_dtype, offset=col.offset, length=col.size()
        )
        if sentinel_val == 0:
            null_pos = ~null_pos
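        # Hedged illustration (values are assumptions): with USE_BITMASK and
        # sentinel_val == 0 the bits mark *valid* entries, so a validity mask
        # of [True, False, True] is inverted above to flag the middle element
        # as null.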

    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
        pass
    else:
        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")

    if null_pos is not None and np.any(null_pos):
        if not allow_modify_inplace:
            data = data.copy()
        try:
            data[null_pos] = None
        except TypeError:
            # TypeError happens if the `data` dtype appears to be non-nullable
            # in numpy notation (bool, int, uint). If this happens,
            # cast the `data` to nullable float dtype.
            data = data.astype(float)
            data[null_pos] = None
        except SettingWithCopyError:
            # `SettingWithCopyError` may happen for datetime-like with missing values.
            data = data.copy()
            data[null_pos] = None

    return data