Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/interchange/dataframe_protocol.py: 99%

1""" 

2A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api 

3""" 

4 

5from __future__ import annotations 

6 

7from abc import ( 

8 ABC, 

9 abstractmethod, 

10) 

11import enum 

12from typing import ( 

13 Any, 

14 Iterable, 

15 Sequence, 

16 TypedDict, 

17) 


class DlpackDeviceType(enum.IntEnum):
    """Integer enum for device type codes matching DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10


class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23
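
# Illustrative note (not part of the vendored spec): a hypothetical int64
# column would report ``DtypeKind.INT`` and a float32 column
# ``DtypeKind.FLOAT``; the jump from 2 to 20 keeps the lower range aligned
# with DLPack's type codes while leaving room for future extension.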


class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4


class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[Buffer, Any]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[Buffer, Any] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[Buffer, Any] | None
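
# Illustrative note (not part of the vendored spec): a hypothetical
# fixed-width column using a byte mask might populate this as
#     {"data": (data_buf, (DtypeKind.INT, 64, "l", "=")),
#      "validity": (mask_buf, (DtypeKind.BOOL, 8, "b", "=")),
#      "offsets": None}
# while a non-nullable numeric column would also carry ``validity=None``.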


class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: bool
    # Column holding the (implicit) mapping of indices to category values.
    # None if not a dictionary-style categorical.
    categories: Column | None


class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack
    on a buffer and (b) dtypes like variable-length strings which do not have
    a fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:
            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented.

        Useful to have to connect to array libraries. Support is optional
        because it's not completely trivial to implement for a Python-only
        library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """


class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leaving enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the data buffer:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
                             semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
                             category values (e.g. an array of cat1, cat2, ...).
                             None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """

#    def get_children(self) -> Iterable[Column]:
#        """
#        Children columns underneath the column, each object in this iterator
#        must adhere to the column specification.
#        """
#        pass


class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything, they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library followed
        by a period and the desired name, e.g., ``pandas.indexcol``.
        """

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

451 @abstractmethod 

452 def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]: 

453 """ 

454 Return an iterator yielding the chunks. 

455 

456 By default (None), yields the chunks that the data is stored as by the 

457 producer. If given, ``n_chunks`` must be a multiple of 

458 ``self.num_chunks()``, meaning the producer must subdivide each chunk 

459 before yielding it. 

460 """