
1""" 

2A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api 

3""" 

4 

5from __future__ import annotations 

6 

7from abc import ( 

8 ABC, 

9 abstractmethod, 

10) 

11import enum 

12from typing import ( 

13 TYPE_CHECKING, 

14 Any, 

15 TypedDict, 

16) 

17 

18if TYPE_CHECKING: 

19 from collections.abc import ( 

20 Iterable, 

21 Sequence, 

22 ) 

23 

24 

25class DlpackDeviceType(enum.IntEnum): 

26 """Integer enum for device type codes matching DLPack.""" 

27 

28 CPU = 1 

29 CUDA = 2 

30 CPU_PINNED = 3 

31 OPENCL = 4 

32 VULKAN = 7 

33 METAL = 8 

34 VPI = 9 

35 ROCM = 10 

36 

37 

38class DtypeKind(enum.IntEnum): 

39 """ 

40 Integer enum for data types. 

41 

42 Attributes 

43 ---------- 

44 INT : int 

45 Matches to signed integer data type. 

46 UINT : int 

47 Matches to unsigned integer data type. 

48 FLOAT : int 

49 Matches to floating point data type. 

50 BOOL : int 

51 Matches to boolean data type. 

52 STRING : int 

53 Matches to string data type (UTF-8 encoded). 

54 DATETIME : int 

55 Matches to datetime data type. 

56 CATEGORICAL : int 

57 Matches to categorical data type. 

58 """ 

59 

60 INT = 0 

61 UINT = 1 

62 FLOAT = 2 

63 BOOL = 20 

64 STRING = 21 # UTF-8 

65 DATETIME = 22 

66 CATEGORICAL = 23 
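
# A minimal consumer-side sketch (illustrative, not part of the vendored
# spec): kind codes are plain integers, so a consumer can dispatch on the
# ``kind`` component of ``Column.dtype``. ``_is_numeric_kind`` is a
# hypothetical helper, not required by the protocol.
def _is_numeric_kind(kind: DtypeKind) -> bool:
    # Signed integers, unsigned integers, and floats are the numeric kinds.
    return kind in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT)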


class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        A set or unset bit at a given position represents a null.
    USE_BYTEMASK : int
        A set or unset byte at a given position represents a null.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4


class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[Buffer, Any]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[Buffer, Any] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[Buffer, Any] | None
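
# Illustrative only (not part of the vendored spec): for a variable-length
# string column using a bit-mask null representation, ``Column.get_buffers``
# (defined below) might return a ``ColumnBuffers`` dict shaped like this,
# where ``data_buf``, ``validity_buf`` and ``offsets_buf`` are hypothetical
# Buffer instances:
#
#     {
#         "data": (data_buf, (DtypeKind.STRING, 8, "u", "=")),
#         "validity": (validity_buf, (DtypeKind.BOOL, 1, "b", "=")),
#         "offsets": (offsets_buf, (DtypeKind.INT, 64, "l", "=")),
#     }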


class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: bool
    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Column | None
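
# Illustrative only (not part of the vendored spec): for a dictionary-encoded
# categorical, ``Column.describe_categorical`` might return a value shaped
# like this, where ``categories_col`` is a hypothetical Column holding the
# category values:
#
#     {
#         "is_ordered": False,
#         "is_dictionary": True,
#         "categories": categories_col,
#     }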


class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present; a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack
    on a buffer and (b) dtypes like variable-length strings which do not have
    a fixed number of bytes per element.
    """


    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:

        - TypeError : if the buffer contains unsupported dtypes.
        - NotImplementedError : if DLPack support is not implemented.

        Useful to have to connect to array libraries. Support is optional
        because it's not completely trivial to implement for a Python-only
        library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """
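
# A minimal consumer-side sketch (illustrative, not part of the vendored
# spec): copying the raw bytes out of a CPU-resident buffer using only
# ``ptr``, ``bufsize`` and ``__dlpack_device__``. ``_buffer_to_bytes`` is a
# hypothetical helper, not required by the protocol.
def _buffer_to_bytes(buf: Buffer) -> bytes:
    import ctypes  # local import, to leave the vendored header untouched

    device, _ = buf.__dlpack_device__()
    if device != DlpackDeviceType.CPU:
        raise TypeError("can only copy from CPU-resident buffers")
    # ``ptr`` is the integer address of the first byte; ``bufsize`` the length.
    return ctypes.string_at(buf.ptr, buf.bufsize)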


class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to ``DataFrame.num_rows()`` if the column is a single
        chunk; equal to the size of the current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of the first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leaving enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding
              of the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """
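
    # Illustrative only (not part of the vendored spec): typical dtype tuples
    # a producer might return, using Apache Arrow C format strings
    # ("l" = int64, "g" = float64, "u" = utf-8 string, "b" = boolean):
    #
    #     (DtypeKind.INT, 64, "l", "=")     # int64 column
    #     (DtypeKind.FLOAT, 64, "g", "=")   # float64 column
    #     (DtypeKind.STRING, 8, "u", "=")   # variable-length string column
    #     (DtypeKind.BOOL, 8, "b", "=")     # boolean column (byte per value)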


    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical
          values.

        Raises TypeError if the dtype is not categorical.

        Returns a dictionary describing how to interpret the data buffer:
        - "is_ordered" : bool, whether the ordering of dictionary indices is
                         semantically meaningful.
        - "is_dictionary" : bool, whether a mapping of
                            categorical values to other objects exists
        - "categories" : Column representing the (implicit) mapping of indices
                         to category values (e.g. an array of cat1, cat2, ...).
                         None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """


    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """
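
    # Illustrative only (not part of the vendored spec): possible
    # ``describe_null`` return values.
    #
    #     (ColumnNullType.NON_NULLABLE, None)  # column cannot contain nulls
    #     (ColumnNullType.USE_NAN, None)       # NaN marks a missing value
    #     (ColumnNullType.USE_SENTINEL, -1)    # -1 is the missing-value sentinel
    #     (ColumnNullType.USE_BITMASK, 0)      # a 0 bit marks a missing value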


    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

        - "data": a two-element tuple whose first element is a buffer
                  containing the data and whose second element is the data
                  buffer's associated dtype.
        - "validity": a two-element tuple whose first element is a buffer
                      containing mask values indicating missing data and
                      whose second element is the mask value buffer's
                      associated dtype. None if the null representation is
                      not a bit or byte mask.
        - "offsets": a two-element tuple whose first element is a buffer
                     containing the offset values for variable-size binary
                     data (e.g., variable-length strings) and whose second
                     element is the offsets buffer's associated dtype. None
                     if the data buffer does not have an associated offsets
                     buffer.
        """
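
    # A minimal consumer-side sketch (illustrative, not part of the vendored
    # spec): unpacking the up-to-three buffers of a chunk. ``_unpack`` is a
    # hypothetical helper, not required by the protocol.
    #
    #     def _unpack(col: Column) -> None:
    #         buffers = col.get_buffers()
    #         data_buf, data_dtype = buffers["data"]
    #         if buffers["validity"] is not None:
    #             mask_buf, mask_dtype = buffers["validity"]
    #         if buffers["offsets"] is not None:
    #             offsets_buf, offsets_dtype = buffers["offsets"]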


#     def get_children(self) -> Iterable[Column]:
#         """
#         Children columns underneath the column, each object in this iterator
#         must adhere to the column specification.
#         """
#         pass


class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""
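
    # Illustrative only (not part of the vendored spec): a consumer that
    # requires zero-copy access can re-request the interchange object with
    # stricter parameters, e.g.:
    #
    #     interchange_df = df.__dataframe__(allow_copy=False)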


    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything; they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library followed
        by a period and the desired name, e.g., ``pandas.indexcol``.
        """


    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """
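
    # Illustrative only (not part of the vendored spec): ``n_chunks`` must be
    # a multiple of ``num_chunks()``. For example, with data stored as 2
    # producer chunks:
    #
    #     chunks = list(df.get_chunks())      # the 2 chunks, as stored
    #     rechunked = list(df.get_chunks(6))  # 6 chunks, each split into 3
    #     # df.get_chunks(3) would be invalid: 3 is not a multiple of 2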