from __future__ import annotations

import ctypes
import re
from typing import Any

import numpy as np

from pandas.compat._optional import import_optional_dependency
from pandas.errors import SettingWithCopyError

import pandas as pd
from pandas.core.interchange.dataframe_protocol import (
    Buffer,
    Column,
    ColumnNullType,
    DataFrame as DataFrameXchg,
    DtypeKind,
)
from pandas.core.interchange.utils import (
    ArrowCTypes,
    Endianness,
)

_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
    DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
    DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
    DtypeKind.BOOL: {1: bool, 8: bool},
}
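# Maps a protocol (kind, bit-width) pair to a NumPy dtype, e.g.
# _NP_DTYPES[DtypeKind.INT][64] is np.int64. Bit-width-1 entries denote
# Arrow-style bit-masks, which ``buffer_to_ndarray`` below unpacks via
# pyarrow rather than viewing directly.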


def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
    """
    Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. it has a
        ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if False, a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame

    Examples
    --------
    >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
    >>> interchange_object.column_names()
    Index(['A', 'B'], dtype='object')
    >>> df_pandas = (pd.api.interchange.from_dataframe
    ...              (interchange_object.select_columns_by_name(['A'])))
    >>> df_pandas
       A
    0  1
    1  2

    These methods (``column_names``, ``select_columns_by_name``) should work
    for any dataframe library which implements the interchange protocol.
    """
    if isinstance(df, pd.DataFrame):
        return df

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    return _from_dataframe(
        df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
    )


def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True):
    """
    Build a ``pd.DataFrame`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. it has a
        ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if False, a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    pandas_dfs = []
    for chunk in df.get_chunks():
        pandas_df = protocol_df_chunk_to_pandas(chunk)
        pandas_dfs.append(pandas_df)

    if not allow_copy and len(pandas_dfs) > 1:
        raise RuntimeError(
            "To join chunks a copy is required, which is forbidden by allow_copy=False"
        )
    if not pandas_dfs:
        pandas_df = protocol_df_chunk_to_pandas(df)
    elif len(pandas_dfs) == 1:
        pandas_df = pandas_dfs[0]
    else:
        pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)

    index_obj = df.metadata.get("pandas.index", None)
    if index_obj is not None:
        pandas_df.index = index_obj

    return pandas_df


def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
    """
    Convert interchange protocol chunk to ``pd.DataFrame``.

    Parameters
    ----------
    df : DataFrameXchg

    Returns
    -------
    pd.DataFrame
    """
    # We need a dict of columns here, with each column being a NumPy array (at
    # least for now; we'll deal with non-NumPy dtypes later).
    columns: dict[str, Any] = {}
    buffers = []  # hold on to buffers, keeps memory alive
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")
        col = df.get_column_by_name(name)
        dtype = col.dtype[0]
        if dtype in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
        ):
            columns[name], buf = primitive_column_to_ndarray(col)
        elif dtype == DtypeKind.CATEGORICAL:
            columns[name], buf = categorical_column_to_series(col)
        elif dtype == DtypeKind.STRING:
            columns[name], buf = string_column_to_ndarray(col)
        elif dtype == DtypeKind.DATETIME:
            columns[name], buf = datetime_column_to_ndarray(col)
        else:
            raise NotImplementedError(f"Data type {dtype} not handled yet")

        buffers.append(buf)

    pandas_df = pd.DataFrame(columns)
    # Stash the buffer owners on the DataFrame so the (possibly zero-copy)
    # column data stays alive for the lifetime of the result.
    pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
    return pandas_df


def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding one of the primitive dtypes to a NumPy array.

    A primitive type is one of: int, uint, float, bool.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    data_buff, data_dtype = buffers["data"]
    data = buffer_to_ndarray(
        data_buff, data_dtype, offset=col.offset, length=col.size()
    )

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers


def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
    """
    Convert a column holding categorical data to a pandas Series.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of pd.Series holding the data and the memory owner object
        that keeps the memory alive.
    """
    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError("Non-dictionary categoricals not supported yet")

    cat_column = categorical["categories"]
    if hasattr(cat_column, "_col"):
        # Item "Column" of "Optional[Column]" has no attribute "_col"
        # Item "None" of "Optional[Column]" has no attribute "_col"
        categories = np.array(cat_column._col)  # type: ignore[union-attr]
    else:
        raise NotImplementedError(
            "Interchanging categorical columns isn't supported yet, and our "
            "fallback of using the `cat_column._col` attribute (an ndarray) failed."
        )
    buffers = col.get_buffers()

    codes_buff, codes_dtype = buffers["data"]
    codes = buffer_to_ndarray(
        codes_buff, codes_dtype, offset=col.offset, length=col.size()
    )

    # Take the modulo here to avoid an ``IndexError`` for out-of-bounds
    # sentinel values in `codes`
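    # For example (hypothetical values): with categories == ['a', 'b'] and
    # codes == [0, 1, -1] (-1 being a null sentinel), codes % 2 == [0, 1, 1]
    # indexes safely into `categories`; the arbitrary category picked for the
    # sentinel slot is replaced afterwards by ``set_nulls``.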
    if len(categories) > 0:
        values = categories[codes % len(categories)]
    else:
        values = codes

    cat = pd.Categorical(
        values, categories=categories, ordered=categorical["is_ordered"]
    )
    data = pd.Series(cat)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers


def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding string data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    null_kind, sentinel_val = col.describe_null

    if null_kind not in (
        ColumnNullType.NON_NULLABLE,
        ColumnNullType.USE_BITMASK,
        ColumnNullType.USE_BYTEMASK,
    ):
        raise NotImplementedError(
            f"{null_kind} null kind is not yet supported for string columns."
        )

    buffers = col.get_buffers()

    assert buffers["offsets"], "String buffers must contain offsets"
    # Retrieve the data buffer containing the UTF-8 code units
    data_buff, _ = buffers["data"]
    # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
    assert col.dtype[2] in (
        ArrowCTypes.STRING,
        ArrowCTypes.LARGE_STRING,
    )  # format_str == utf-8
    # Convert the buffers to NumPy arrays. In order to go from STRING to
    # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
    data_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        Endianness.NATIVE,
    )
    # Specify zero offset as we don't want to chunk the string data
    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)

    # Retrieve the offsets buffer containing the index offsets demarcating
    # the beginning and the end of each string
    offset_buff, offset_dtype = buffers["offsets"]
    # The offsets buffer contains start-stop positions of strings in the data
    # buffer, so it holds one more element than the column has rows; pass
    # `col.size() + 1` as the buffer length
    offsets = buffer_to_ndarray(
        offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
    )
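    # For illustration (hypothetical values): with `data` holding the UTF-8
    # bytes b"padfast" and offsets == [0, 3, 7], the encoded strings are
    # bytes(data[0:3]) == b"pad" and bytes(data[3:7]) == b"fast", which is
    # exactly what the assembly loop below computes.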

    null_pos = None
    if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        validity = buffers["validity"]
        if validity is not None:
            valid_buff, valid_dtype = validity
            null_pos = buffer_to_ndarray(
                valid_buff, valid_dtype, offset=col.offset, length=col.size()
            )
            if sentinel_val == 0:
                # A sentinel of 0 means 0 marks nulls, i.e. the mask flags
                # *valid* entries, so invert it to get the null positions
                null_pos = ~null_pos

    # Assemble the strings from the code units
    str_list: list[None | float | str] = [None] * col.size()
    for i in range(col.size()):
        # Check for missing values
        if null_pos is not None and null_pos[i]:
            str_list[i] = np.nan
            continue

        # Extract a range of code units
        units = data[offsets[i] : offsets[i + 1]]

        # Convert the code units to a bytes object
        str_bytes = bytes(units)

        # Create the string
        string = str_bytes.decode(encoding="utf-8")

        # Add to our list of strings
        str_list[i] = string

    # Convert the string list to a NumPy array
    return np.asarray(str_list, dtype="object"), buffers


def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
    """Parse datetime `format_str` to interpret the `data`."""
    # timestamp 'ts{unit}:tz'
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta:
        unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
        if unit != "s":
            # the format string gives only the first letter of the unit, so
            # add one extra letter to convert the unit to numpy-style:
            # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
            unit += "s"
        data = data.astype(f"datetime64[{unit}]")
        if tz != "":
            data = pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(tz)
        return data

    # date 'td{Days/Ms}'
    date_meta = re.match(r"td([Dm])", format_str)
    if date_meta:
        unit = date_meta.group(1)
        if unit == "D":
            # NumPy doesn't support a DAY unit, so convert days to seconds
            # (converting to uint64 to avoid overflow)
            data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
        elif unit == "m":
            data = data.astype("datetime64[ms]")
        else:
            raise NotImplementedError(f"Date unit is not supported: {unit}")
        return data

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")


def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any]:
    """
    Convert a column holding DateTime data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    _, col_bit_width, format_str, _ = col.dtype
    dbuf, _ = buffers["data"]
    # Interpret the data buffer as integers counting the number of units
    # passed since the UNIX epoch (1970-01-01)
    data = buffer_to_ndarray(
        dbuf,
        (
            DtypeKind.INT,
            col_bit_width,
            getattr(ArrowCTypes, f"INT{col_bit_width}"),
            Endianness.NATIVE,
        ),
        offset=col.offset,
        length=col.size(),
    )

    data = parse_datetime_format_str(format_str, data)  # type: ignore[assignment]
    data = set_nulls(data, col, buffers["validity"])
    return data, buffers


def buffer_to_ndarray(
    buffer: Buffer,
    dtype: tuple[DtypeKind, int, str, str],
    *,
    length: int,
    offset: int = 0,
) -> np.ndarray:
    """
    Build a NumPy array from the passed buffer.

    Parameters
    ----------
    buffer : Buffer
        Buffer to build a NumPy array from.
    dtype : tuple
        Data type of the buffer conforming to the protocol dtypes format.
    length : int
        The number of elements (or, for a bit-mask buffer, bits) to read
        from the buffer.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.

    Returns
    -------
    np.ndarray

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function is
    responsible for keeping the memory owner object alive as long as
    the returned NumPy array is being used.
    """
    kind, bit_width, _, _ = dtype

    column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
    if column_dtype is None:
        raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")

    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
    # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
    # it since https://github.com/numpy/numpy/pull/19083
    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)

    if bit_width == 1:
        assert length is not None, "`length` must be specified for a bit-mask buffer."
        pa = import_optional_dependency("pyarrow")
        arr = pa.BooleanArray.from_buffers(
            pa.bool_(),
            length,
            [None, pa.foreign_buffer(buffer.ptr, length)],
            offset=offset,
        )
        return np.asarray(arr)
    else:
        data_pointer = ctypes.cast(
            buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
        )
        if length > 0:
            return np.ctypeslib.as_array(data_pointer, shape=(length,))
        return np.array([], dtype=ctypes_type)
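
# A minimal usage sketch (with a hypothetical protocol buffer `buf` wrapping
# eight contiguous int64 values); the resulting array is a zero-copy view:
#
#     dtype = (DtypeKind.INT, 64, ArrowCTypes.INT64, Endianness.NATIVE)
#     arr = buffer_to_ndarray(buf, dtype, offset=0, length=8)
#
# `arr` must not outlive the owner of `buf` (see Notes in the docstring above).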


def set_nulls(
    data: np.ndarray | pd.Series,
    col: Column,
    validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
    allow_modify_inplace: bool = True,
):
    """
    Set null values for the data according to the column null kind.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Data to set nulls in.
    col : Column
        Column object that describes the `data`.
    validity : tuple(Buffer, dtype) or None
        The ``"validity"`` buffer of ``col.get_buffers()``. We do not call
        ``col.get_buffers()`` here to avoid taking ownership of the memory of
        the buffer objects.
    allow_modify_inplace : bool, default: True
        Whether to modify the `data` inplace when zero-copy is possible (True) or always
        modify a copy of the `data` (False).

    Returns
    -------
    np.ndarray or pd.Series
        Data with the nulls being set.
    """
    if validity is None:
        return data
    null_kind, sentinel_val = col.describe_null
    null_pos = None

    if null_kind == ColumnNullType.USE_SENTINEL:
        null_pos = pd.Series(data) == sentinel_val
    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert validity, "Expected to have a validity buffer for the mask"
        valid_buff, valid_dtype = validity
        null_pos = buffer_to_ndarray(
            valid_buff, valid_dtype, offset=col.offset, length=col.size()
        )
        if sentinel_val == 0:
            # A sentinel of 0 means the mask flags *valid* entries, so invert
            # it to get the null positions
            null_pos = ~null_pos
    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
        pass
    else:
        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")

    if null_pos is not None and np.any(null_pos):
        if not allow_modify_inplace:
            data = data.copy()
        try:
            data[null_pos] = None
        except TypeError:
            # TypeError happens if the `data` dtype appears to be non-nullable
            # in numpy notation (bool, int, uint). If this happens, cast `data`
            # to a float dtype so missing values can be represented as NaN.
            data = data.astype(float)
            data[null_pos] = None
        except SettingWithCopyError:
            # `SettingWithCopyError` may happen for datetime-like with missing values.
            data = data.copy()
            data[null_pos] = None

    return data