1"""
2A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
3"""
4
5from __future__ import annotations
6
7from abc import (
8 ABC,
9 abstractmethod,
10)
11import enum
12from typing import (
13 TYPE_CHECKING,
14 Any,
15 TypedDict,
16)
17
18if TYPE_CHECKING:
19 from collections.abc import (
20 Iterable,
21 Sequence,
22 )


class DlpackDeviceType(enum.IntEnum):
    """Integer enum for device type codes matching DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10


class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23
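

# Example (not part of the vendored spec): a sketch of the ``(kind, bit-width,
# format string, endianness)`` tuples that ``Column.dtype`` (defined below)
# returns. The format strings follow the Apache Arrow C Data Interface
# ("l" = int64, "g" = float64, "u" = utf-8 string, "b" = boolean); the exact
# entries shown here are illustrative assumptions, not normative values.
_EXAMPLE_DTYPES: dict[str, tuple[DtypeKind, int, str, str]] = {
    "int64": (DtypeKind.INT, 64, "l", "="),
    "float64": (DtypeKind.FLOAT, 64, "g", "="),
    "string": (DtypeKind.STRING, 8, "u", "="),  # 8-bit storage units in the data buffer
    "bool_bitpacked": (DtypeKind.BOOL, 1, "b", "="),  # Arrow-style bit-packed boolean
}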


class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        A bit mask: the bit at each position indicates whether the value
        is null.
    USE_BYTEMASK : int
        A byte mask: the byte at each position indicates whether the value
        is null.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4
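

# Example (not part of the vendored spec): sketches of the ``(kind, value)``
# tuples that ``Column.describe_null`` (defined below) returns for each null
# representation. The concrete values are illustrative assumptions; e.g. Arrow
# validity bitmaps use 0 to mark a null, while sentinel values are
# dtype-specific.
_EXAMPLE_NULL_DESCRIPTIONS: dict[str, tuple[ColumnNullType, Any]] = {
    "non_nullable": (ColumnNullType.NON_NULLABLE, None),
    "float_nan": (ColumnNullType.USE_NAN, None),
    "int_sentinel": (ColumnNullType.USE_SENTINEL, -1),  # -1 marks a null
    "bit_mask": (ColumnNullType.USE_BITMASK, 0),  # a 0 bit marks a null
    "byte_mask": (ColumnNullType.USE_BYTEMASK, 0),  # a 0 byte marks a null
}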


class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[Buffer, Any]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[Buffer, Any] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[Buffer, Any] | None
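

# Example (not part of the vendored spec): a minimal sketch of how the data
# and offsets buffers of a variable-length string column relate, assuming
# Arrow's variable-size binary layout. String ``i`` occupies
# ``data[offsets[i]:offsets[i + 1]]``; the helper below is hypothetical and
# builds both pieces from plain Python strings.
def _example_string_buffers(strings: list[str]) -> tuple[bytes, list[int]]:
    data = bytearray()
    offsets = [0]
    for s in strings:
        data.extend(s.encode("utf-8"))  # concatenate the UTF-8 bytes
        offsets.append(len(data))  # record where the next string starts
    return bytes(data), offsets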


class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: bool
    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Column | None
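

# Example (not part of the vendored spec): the ``CategoricalDescription`` a
# producer might return for a plain (non-dictionary) categorical; for a
# dictionary-style categorical, ``categories`` would instead be a ``Column``
# of category values.
_EXAMPLE_PLAIN_CATEGORICAL: CategoricalDescription = {
    "is_ordered": False,
    "is_dictionary": False,
    "categories": None,
}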


class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present; a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack on
    a buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:

        - TypeError : if the buffer contains unsupported dtypes.
        - NotImplementedError : if DLPack support is not implemented.

        Useful for connecting to array libraries. Support is optional because
        it's not completely trivial to implement for a Python-only library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """


class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leaving enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical.

        Returns a dictionary describing how to interpret the data buffer:
        - "is_ordered" : bool, whether the ordering of dictionary indices is
                         semantically meaningful.
        - "is_dictionary" : bool, whether a mapping of
                            categorical values to other objects exists
        - "categories" : Column representing the (implicit) mapping of indices to
                         category values (e.g. an array of cat1, cat2, ...).
                         None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

        - "data": a two-element tuple whose first element is a buffer
                  containing the data and whose second element is the data
                  buffer's associated dtype.
        - "validity": a two-element tuple whose first element is a buffer
                      containing mask values indicating missing data and
                      whose second element is the mask value buffer's
                      associated dtype. None if the null representation is
                      not a bit or byte mask.
        - "offsets": a two-element tuple whose first element is a buffer
                     containing the offset values for variable-size binary
                     data (e.g., variable-length strings) and whose second
                     element is the offsets buffer's associated dtype. None
                     if the data buffer does not have an associated offsets
                     buffer.
        """


#     def get_children(self) -> Iterable[Column]:
#         """
#         Children columns underneath the column, each object in this iterator
#         must adhere to the column specification.
#         """
#         pass
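

# Example (not part of the vendored spec): a sketch of how a consumer might
# read a single-chunk float64 column back out of the protocol, assuming a CPU
# buffer, native endianness, and no nulls. ``ctypes.from_address`` stands in
# for a real zero-copy array wrapper to keep the sketch dependency-free.
def _example_read_float64_column(col: Column) -> list[float]:
    import ctypes

    kind, bit_width, _format, _endianness = col.dtype
    if kind != DtypeKind.FLOAT or bit_width != 64:
        raise TypeError("this sketch only handles float64 columns")
    data_buffer, _dtype = col.get_buffers()["data"]
    # Honour the column's element offset into the (possibly shared) buffer.
    start = data_buffer.ptr + col.offset * (bit_width // 8)
    values = (ctypes.c_double * col.size()).from_address(start)
    return list(values)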


class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything; they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library followed
        by a period and the desired name, e.g., ``pandas.indexcol``.
        """

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """