1"""
2A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
3"""
4
5from __future__ import annotations
6
7from abc import (
8 ABC,
9 abstractmethod,
10)
11import enum
12from typing import (
13 Any,
14 Iterable,
15 Sequence,
16 TypedDict,
17)
18
19
20class DlpackDeviceType(enum.IntEnum):
21 """Integer enum for device type codes matching DLPack."""
22
23 CPU = 1
24 CUDA = 2
25 CPU_PINNED = 3
26 OPENCL = 4
27 VULKAN = 7
28 METAL = 8
29 VPI = 9
30 ROCM = 10
31
32
33class DtypeKind(enum.IntEnum):
34 """
35 Integer enum for data types.
36
37 Attributes
38 ----------
39 INT : int
40 Matches to signed integer data type.
41 UINT : int
42 Matches to unsigned integer data type.
43 FLOAT : int
44 Matches to floating point data type.
45 BOOL : int
46 Matches to boolean data type.
47 STRING : int
48 Matches to string data type (UTF-8 encoded).
49 DATETIME : int
50 Matches to datetime data type.
51 CATEGORICAL : int
52 Matches to categorical data type.
53 """
54
55 INT = 0
56 UINT = 1
57 FLOAT = 2
58 BOOL = 20
59 STRING = 21 # UTF-8
60 DATETIME = 22
61 CATEGORICAL = 23
62
63
64class ColumnNullType(enum.IntEnum):
65 """
66 Integer enum for null type representation.
67
68 Attributes
69 ----------
70 NON_NULLABLE : int
71 Non-nullable column.
72 USE_NAN : int
73 Use explicit float NaN value.
74 USE_SENTINEL : int
75 Sentinel value besides NaN/NaT.
76 USE_BITMASK : int
77 The bit is set/unset representing a null on a certain position.
78 USE_BYTEMASK : int
79 The byte is set/unset representing a null on a certain position.
80 """
81
82 NON_NULLABLE = 0
83 USE_NAN = 1
84 USE_SENTINEL = 2
85 USE_BITMASK = 3
86 USE_BYTEMASK = 4
87
88
89class ColumnBuffers(TypedDict):
90 # first element is a buffer containing the column data;
91 # second element is the data buffer's associated dtype
92 data: tuple[Buffer, Any]
93
94 # first element is a buffer containing mask values indicating missing data;
95 # second element is the mask value buffer's associated dtype.
96 # None if the null representation is not a bit or byte mask
97 validity: tuple[Buffer, Any] | None
98
99 # first element is a buffer containing the offset values for
100 # variable-size binary data (e.g., variable-length strings);
101 # second element is the offsets buffer's associated dtype.
102 # None if the data buffer does not have an associated offsets buffer
103 offsets: tuple[Buffer, Any] | None
104
105
106class CategoricalDescription(TypedDict):
107 # whether the ordering of dictionary indices is semantically meaningful
108 is_ordered: bool
109 # whether a dictionary-style mapping of categorical values to other objects exists
110 is_dictionary: bool
111 # Python-level only (e.g. ``{int: str}``).
112 # None if not a dictionary-style categorical.
113 categories: Column | None
114
115
class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present; a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack
    on a buffer and (b) dtypes like variable-length strings which do not have
    a fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:

        - TypeError : if the buffer contains unsupported dtypes.
        - NotImplementedError : if DLPack support is not implemented.

        Useful for connecting to array libraries. Support is optional because
        it's not completely trivial to implement for a Python-only library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """

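# As an illustration of the contract above, a minimal producer-side sketch of
# a ``Buffer`` backed by a NumPy array might look as follows. This is not part
# of the spec; the name ``NumpyBuffer`` and the NumPy dependency are
# assumptions for the example only.
#
# import numpy as np
#
# class NumpyBuffer(Buffer):
#     def __init__(self, arr: np.ndarray) -> None:
#         if not arr.flags["C_CONTIGUOUS"]:
#             raise ValueError("Buffer data must be contiguous in memory")
#         self._arr = arr
#
#     @property
#     def bufsize(self) -> int:
#         return self._arr.nbytes  # size of the block of memory, in bytes
#
#     @property
#     def ptr(self) -> int:
#         return self._arr.__array_interface__["data"][0]  # start address
#
#     def __dlpack__(self):
#         return self._arr.__dlpack__()  # delegate to NumPy (>= 1.22)
#
#     def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
#         return (DlpackDeviceType.CPU, None)  # data always lives on the CPU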

class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
        - Kind specifiers are aligned with DLPack where possible (hence the
          jump to 20, leaving enough room for future extension)
        - Masks must be specified as boolean with either bit width 1 (for bit
          masks) or 8 (for byte masks).
        - Dtype width in bits was preferred over bytes
        - Endianness isn't too useful, but is included now in case we need to
          support non-native endianness in the future
        - Went with Apache Arrow format strings over NumPy format strings
          because they're more complete from a dataframe perspective
        - Format strings are mostly useful for datetime specification, and
          for categoricals.
        - For categoricals, the format string describes the type of the
          categorical in the data buffer. In case of a separate encoding of
          the categorical (e.g. an integer to string mapping), this can
          be derived from ``self.describe_categorical``.
        - Data types not included: complex, Arrow-style null, binary, decimal,
          and nested (list, struct, map, union) dtypes.
        """

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical.

        Returns the dictionary with description on how to interpret the data buffer:
        - "is_ordered" : bool, whether the ordering of dictionary indices is
                         semantically meaningful.
        - "is_dictionary" : bool, whether a mapping of
                            categorical values to other objects exists
        - "categories" : Column representing the (implicit) mapping of indices to
                         category values (e.g. an array of cat1, cat2, ...).
                         None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

        - "data": a two-element tuple whose first element is a buffer
                  containing the data and whose second element is the data
                  buffer's associated dtype.
        - "validity": a two-element tuple whose first element is a buffer
                      containing mask values indicating missing data and
                      whose second element is the mask value buffer's
                      associated dtype. None if the null representation is
                      not a bit or byte mask.
        - "offsets": a two-element tuple whose first element is a buffer
                     containing the offset values for variable-size binary
                     data (e.g., variable-length strings) and whose second
                     element is the offsets buffer's associated dtype. None
                     if the data buffer does not have an associated offsets
                     buffer.
        """

#     def get_children(self) -> Iterable[Column]:
#         """
#         Children columns underneath the column, each object in this iterator
#         must adhere to the column specification.
#         """
#         pass
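

# As an illustration of how a consumer might use ``get_buffers()``, the sketch
# below decodes one chunk of a UTF-8 string column. It is not part of the
# spec; the helper name ``read_string_column`` and the use of ``ctypes`` to
# view the producer's memory are assumptions for the example only.
#
# import ctypes
#
# def read_string_column(col: Column) -> list[str]:
#     buffers = col.get_buffers()
#     data_buf, _ = buffers["data"]
#     offsets_buf, _ = buffers["offsets"]  # present for variable-length strings
#     n = col.size()
#
#     # View the producer's memory through the raw pointers; the offsets are
#     # assumed to be int64 here (a real consumer must check the dtypes).
#     data = ctypes.cast(
#         data_buf.ptr, ctypes.POINTER(ctypes.c_uint8 * data_buf.bufsize)
#     ).contents
#     offsets = ctypes.cast(
#         offsets_buf.ptr, ctypes.POINTER(ctypes.c_int64 * (n + 1))
#     ).contents
#
#     # Element i occupies data[offsets[i]:offsets[i + 1]], UTF-8 encoded.
#     # A complete consumer would also honor the validity buffer according to
#     # ``describe_null`` to map masked-out elements to missing values.
#     return [
#         bytes(data[offsets[i] : offsets[i + 1]]).decode("utf-8")
#         for i in range(n)
#     ]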


class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything; they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library followed
        by a period and the desired name, e.g., ``pandas.indexcol``.
        """

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """