from __future__ import annotations

import ctypes
import re
from typing import Any

import numpy as np

from pandas.compat._optional import import_optional_dependency
from pandas.errors import SettingWithCopyError

import pandas as pd
from pandas.core.interchange.dataframe_protocol import (
    Buffer,
    Column,
    ColumnNullType,
    DataFrame as DataFrameXchg,
    DtypeKind,
)
from pandas.core.interchange.utils import (
    ArrowCTypes,
    Endianness,
)

_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
    DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
    DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
    DtypeKind.BOOL: {1: bool, 8: bool},
}
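# Maps a protocol (kind, bit-width) pair to a NumPy dtype, e.g.
# _NP_DTYPES[DtypeKind.INT][64] is np.int64. Bit-width-1 entries denote
# Arrow-style bit-masks, which ``buffer_to_ndarray`` below unpacks via
# pyarrow rather than viewing directly.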


def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
    """
    Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. it has a
        ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if False, a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame

    Examples
    --------
    >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
    >>> interchange_object.column_names()
    Index(['A', 'B'], dtype='object')
    >>> df_pandas = (pd.api.interchange.from_dataframe
    ...              (interchange_object.select_columns_by_name(['A'])))
    >>> df_pandas
       A
    0  1
    1  2

    These methods (``column_names``, ``select_columns_by_name``) should work
    for any dataframe library which implements the interchange protocol.
    """
    if isinstance(df, pd.DataFrame):
        return df

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    return _from_dataframe(
        df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
    )


def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True):
    """
    Build a ``pd.DataFrame`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. it has a
        ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if False, a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    pandas_dfs = []
    for chunk in df.get_chunks():
        pandas_df = protocol_df_chunk_to_pandas(chunk)
        pandas_dfs.append(pandas_df)

    if not allow_copy and len(pandas_dfs) > 1:
        raise RuntimeError(
            "To join chunks a copy is required, which is forbidden by allow_copy=False"
        )
    if not pandas_dfs:
        pandas_df = protocol_df_chunk_to_pandas(df)
    elif len(pandas_dfs) == 1:
        pandas_df = pandas_dfs[0]
    else:
        pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)

    index_obj = df.metadata.get("pandas.index", None)
    if index_obj is not None:
        pandas_df.index = index_obj

    return pandas_df


def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
    """
    Convert interchange protocol chunk to ``pd.DataFrame``.

    Parameters
    ----------
    df : DataFrameXchg

    Returns
    -------
    pd.DataFrame
    """
    # We need a dict of columns here, with each column being a NumPy array (at
    # least for now; we'll deal with non-NumPy dtypes later).
    columns: dict[str, Any] = {}
    buffers = []  # hold on to buffers, keeps memory alive
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")
        col = df.get_column_by_name(name)
        dtype = col.dtype[0]
        if dtype in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
        ):
            columns[name], buf = primitive_column_to_ndarray(col)
        elif dtype == DtypeKind.CATEGORICAL:
            columns[name], buf = categorical_column_to_series(col)
        elif dtype == DtypeKind.STRING:
            columns[name], buf = string_column_to_ndarray(col)
        elif dtype == DtypeKind.DATETIME:
            columns[name], buf = datetime_column_to_ndarray(col)
        else:
            raise NotImplementedError(f"Data type {dtype} not handled yet")

        buffers.append(buf)

    pandas_df = pd.DataFrame(columns)
    # Stash the buffer owners on the DataFrame so the (possibly zero-copy)
    # column data stays alive for the lifetime of the result.
    pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
    return pandas_df


def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding one of the primitive dtypes to a NumPy array.

    A primitive type is one of: int, uint, float, bool.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    data_buff, data_dtype = buffers["data"]
    data = buffer_to_ndarray(
        data_buff, data_dtype, offset=col.offset, length=col.size()
    )

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers


def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
    """
    Convert a column holding categorical data to a pandas Series.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of pd.Series holding the data and the memory owner object
        that keeps the memory alive.
    """
    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError("Non-dictionary categoricals not supported yet")

    cat_column = categorical["categories"]
    if hasattr(cat_column, "_col"):
        # Item "Column" of "Optional[Column]" has no attribute "_col"
        # Item "None" of "Optional[Column]" has no attribute "_col"
        categories = np.array(cat_column._col)  # type: ignore[union-attr]
    else:
        raise NotImplementedError(
            "Interchanging categorical columns isn't supported yet, and our "
            "fallback of using the `cat_column._col` attribute (an ndarray) failed."
        )
    buffers = col.get_buffers()

    codes_buff, codes_dtype = buffers["data"]
    codes = buffer_to_ndarray(
        codes_buff, codes_dtype, offset=col.offset, length=col.size()
    )

    # Take the modulo here to avoid an ``IndexError`` for out-of-bounds
    # sentinel values in `codes`
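    # For example (hypothetical values): with categories == ['a', 'b'] and
    # codes == [0, 1, -1] (-1 being a null sentinel), codes % 2 == [0, 1, 1]
    # indexes safely into `categories`; the arbitrary category picked for the
    # sentinel slot is replaced afterwards by ``set_nulls``.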
    if len(categories) > 0:
        values = categories[codes % len(categories)]
    else:
        values = codes

    cat = pd.Categorical(
        values, categories=categories, ordered=categorical["is_ordered"]
    )
    data = pd.Series(cat)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers


def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding string data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    null_kind, sentinel_val = col.describe_null

    if null_kind not in (
        ColumnNullType.NON_NULLABLE,
        ColumnNullType.USE_BITMASK,
        ColumnNullType.USE_BYTEMASK,
    ):
        raise NotImplementedError(
            f"{null_kind} null kind is not yet supported for string columns."
        )

    buffers = col.get_buffers()

    assert buffers["offsets"], "String buffers must contain offsets"
    # Retrieve the data buffer containing the UTF-8 code units
    data_buff, _ = buffers["data"]
    # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
    assert col.dtype[2] in (
        ArrowCTypes.STRING,
        ArrowCTypes.LARGE_STRING,
    )  # format_str == utf-8
    # Convert the buffers to NumPy arrays. In order to go from STRING to
    # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
    data_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        Endianness.NATIVE,
    )
    # Specify zero offset as we don't want to chunk the string data
    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)

    # Retrieve the offsets buffer containing the index offsets demarcating
    # the beginning and the end of each string
    offset_buff, offset_dtype = buffers["offsets"]
    # The offsets buffer contains start-stop positions of strings in the data
    # buffer, so it holds one more element than the column has rows; pass
    # `col.size() + 1` as the buffer length
    offsets = buffer_to_ndarray(
        offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
    )
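    # For illustration (hypothetical values): with `data` holding the UTF-8
    # bytes b"padfast" and offsets == [0, 3, 7], the encoded strings are
    # bytes(data[0:3]) == b"pad" and bytes(data[3:7]) == b"fast", which is
    # exactly what the assembly loop below computes.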

    null_pos = None
    if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        validity = buffers["validity"]
        if validity is not None:
            valid_buff, valid_dtype = validity
            null_pos = buffer_to_ndarray(
                valid_buff, valid_dtype, offset=col.offset, length=col.size()
            )
            if sentinel_val == 0:
                # A sentinel of 0 means 0 marks nulls, i.e. the mask flags
                # *valid* entries, so invert it to get the null positions
                null_pos = ~null_pos

    # Assemble the strings from the code units
    str_list: list[None | float | str] = [None] * col.size()
    for i in range(col.size()):
        # Check for missing values
        if null_pos is not None and null_pos[i]:
            str_list[i] = np.nan
            continue

        # Extract a range of code units
        units = data[offsets[i] : offsets[i + 1]]

        # Convert the code units to a bytes object
        str_bytes = bytes(units)

        # Create the string
        string = str_bytes.decode(encoding="utf-8")

        # Add to our list of strings
        str_list[i] = string

    # Convert the string list to a NumPy array
    return np.asarray(str_list, dtype="object"), buffers


def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
    """Parse datetime `format_str` to interpret the `data`."""
    # timestamp 'ts{unit}:tz'
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta:
        unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
        if unit != "s":
            # the format string gives only the first letter of the unit, so
            # add one extra letter to convert the unit to numpy-style:
            # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
            unit += "s"
        data = data.astype(f"datetime64[{unit}]")
        if tz != "":
            data = pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(tz)
        return data

    # date 'td{Days/Ms}'
    date_meta = re.match(r"td([Dm])", format_str)
    if date_meta:
        unit = date_meta.group(1)
        if unit == "D":
            # NumPy doesn't support a DAY unit, so convert days to seconds
            # (converting to uint64 to avoid overflow)
            data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
        elif unit == "m":
            data = data.astype("datetime64[ms]")
        else:
            raise NotImplementedError(f"Date unit is not supported: {unit}")
        return data

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")


def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any]:
    """
    Convert a column holding DateTime data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    _, col_bit_width, format_str, _ = col.dtype
    dbuf, _ = buffers["data"]
    # Interpret the data buffer as integers counting the number of units
    # passed since the UNIX epoch (1970-01-01)
    data = buffer_to_ndarray(
        dbuf,
        (
            DtypeKind.INT,
            col_bit_width,
            getattr(ArrowCTypes, f"INT{col_bit_width}"),
            Endianness.NATIVE,
        ),
        offset=col.offset,
        length=col.size(),
    )

    data = parse_datetime_format_str(format_str, data)  # type: ignore[assignment]
    data = set_nulls(data, col, buffers["validity"])
    return data, buffers


def buffer_to_ndarray(
    buffer: Buffer,
    dtype: tuple[DtypeKind, int, str, str],
    *,
    length: int,
    offset: int = 0,
) -> np.ndarray:
    """
    Build a NumPy array from the passed buffer.

    Parameters
    ----------
    buffer : Buffer
        Buffer to build a NumPy array from.
    dtype : tuple
        Data type of the buffer conforming to the protocol dtypes format.
    length : int
        The number of elements (or, for a bit-mask buffer, bits) to read
        from the buffer.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.

    Returns
    -------
    np.ndarray

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function is
    responsible for keeping the memory owner object alive as long as
    the returned NumPy array is being used.
    """
    kind, bit_width, _, _ = dtype

    column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
    if column_dtype is None:
        raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")

    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
    # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
    # it since https://github.com/numpy/numpy/pull/19083
    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)

    if bit_width == 1:
        assert length is not None, "`length` must be specified for a bit-mask buffer."
        pa = import_optional_dependency("pyarrow")
        arr = pa.BooleanArray.from_buffers(
            pa.bool_(),
            length,
            [None, pa.foreign_buffer(buffer.ptr, length)],
            offset=offset,
        )
        return np.asarray(arr)
    else:
        data_pointer = ctypes.cast(
            buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
        )
        if length > 0:
            return np.ctypeslib.as_array(data_pointer, shape=(length,))
        return np.array([], dtype=ctypes_type)
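
# A minimal usage sketch (with a hypothetical protocol buffer `buf` wrapping
# eight contiguous int64 values); the resulting array is a zero-copy view:
#
#     dtype = (DtypeKind.INT, 64, ArrowCTypes.INT64, Endianness.NATIVE)
#     arr = buffer_to_ndarray(buf, dtype, offset=0, length=8)
#
# `arr` must not outlive the owner of `buf` (see Notes in the docstring above).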


def set_nulls(
    data: np.ndarray | pd.Series,
    col: Column,
    validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
    allow_modify_inplace: bool = True,
):
    """
    Set null values for the data according to the column null kind.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Data to set nulls in.
    col : Column
        Column object that describes the `data`.
    validity : tuple(Buffer, dtype) or None
        The ``"validity"`` buffer of ``col.get_buffers()``. We do not call
        ``col.get_buffers()`` here to avoid taking ownership of the memory of
        the buffer objects.
    allow_modify_inplace : bool, default: True
        Whether to modify the `data` inplace when zero-copy is possible (True) or always
        modify a copy of the `data` (False).

    Returns
    -------
    np.ndarray or pd.Series
        Data with the nulls being set.
    """
    if validity is None:
        return data
    null_kind, sentinel_val = col.describe_null
    null_pos = None

    if null_kind == ColumnNullType.USE_SENTINEL:
        null_pos = pd.Series(data) == sentinel_val
    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert validity, "Expected to have a validity buffer for the mask"
        valid_buff, valid_dtype = validity
        null_pos = buffer_to_ndarray(
            valid_buff, valid_dtype, offset=col.offset, length=col.size()
        )
        if sentinel_val == 0:
            # A sentinel of 0 means the mask flags *valid* entries, so invert
            # it to get the null positions
            null_pos = ~null_pos
    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
        pass
    else:
        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")

    if null_pos is not None and np.any(null_pos):
        if not allow_modify_inplace:
            data = data.copy()
        try:
            data[null_pos] = None
        except TypeError:
            # TypeError happens if the `data` dtype appears to be non-nullable
            # in numpy notation (bool, int, uint). If this happens, cast `data`
            # to a float dtype so missing values can be represented as NaN.
            data = data.astype(float)
            data[null_pos] = None
        except SettingWithCopyError:
            # `SettingWithCopyError` may happen for datetime-like with missing values.
            data = data.copy()
            data[null_pos] = None

    return data