1"""
2A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
3"""
4
5from __future__ import annotations
6
7from abc import (
8 ABC,
9 abstractmethod,
10)
11import enum
12from typing import (
13 Any,
14 Iterable,
15 Sequence,
16 TypedDict,
17)
18
19
20class DlpackDeviceType(enum.IntEnum):
21 """Integer enum for device type codes matching DLPack."""
22
23 CPU = 1
24 CUDA = 2
25 CPU_PINNED = 3
26 OPENCL = 4
27 VULKAN = 7
28 METAL = 8
29 VPI = 9
30 ROCM = 10
31
32
33class DtypeKind(enum.IntEnum):
34 """
35 Integer enum for data types.
36
37 Attributes
38 ----------
39 INT : int
40 Matches to signed integer data type.
41 UINT : int
42 Matches to unsigned integer data type.
43 FLOAT : int
44 Matches to floating point data type.
45 BOOL : int
46 Matches to boolean data type.
47 STRING : int
48 Matches to string data type (UTF-8 encoded).
49 DATETIME : int
50 Matches to datetime data type.
51 CATEGORICAL : int
52 Matches to categorical data type.
53 """
54
55 INT = 0
56 UINT = 1
57 FLOAT = 2
58 BOOL = 20
59 STRING = 21 # UTF-8
60 DATETIME = 22
61 CATEGORICAL = 23
62
63
64class ColumnNullType(enum.IntEnum):
65 """
66 Integer enum for null type representation.
67
68 Attributes
69 ----------
70 NON_NULLABLE : int
71 Non-nullable column.
72 USE_NAN : int
73 Use explicit float NaN value.
74 USE_SENTINEL : int
75 Sentinel value besides NaN/NaT.
76 USE_BITMASK : int
77 The bit is set/unset representing a null on a certain position.
78 USE_BYTEMASK : int
79 The byte is set/unset representing a null on a certain position.
80 """
81
82 NON_NULLABLE = 0
83 USE_NAN = 1
84 USE_SENTINEL = 2
85 USE_BITMASK = 3
86 USE_BYTEMASK = 4
87
88
89class ColumnBuffers(TypedDict):
90 # first element is a buffer containing the column data;
91 # second element is the data buffer's associated dtype
92 data: tuple[Buffer, Any]
93
94 # first element is a buffer containing mask values indicating missing data;
95 # second element is the mask value buffer's associated dtype.
96 # None if the null representation is not a bit or byte mask
97 validity: tuple[Buffer, Any] | None
98
99 # first element is a buffer containing the offset values for
100 # variable-size binary data (e.g., variable-length strings);
101 # second element is the offsets buffer's associated dtype.
102 # None if the data buffer does not have an associated offsets buffer
103 offsets: tuple[Buffer, Any] | None
104
105
106class CategoricalDescription(TypedDict):
107 # whether the ordering of dictionary indices is semantically meaningful
108 is_ordered: bool
109 # whether a dictionary-style mapping of categorical values to other objects exists
110 is_dictionary: bool
111 # Python-level only (e.g. ``{int: str}``).
112 # None if not a dictionary-style categorical.
113 categories: Column | None
114
115
class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present; a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack
    on a buffer and (b) dtypes like variable-length strings which do not have
    a fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:

        - TypeError : if the buffer contains unsupported dtypes.
        - NotImplementedError : if DLPack support is not implemented.

        Useful for connecting to array libraries. Support is optional because
        it's not completely trivial to implement for a Python-only library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """

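# As an illustration of the contract above, a minimal producer-side sketch of
# a ``Buffer`` backed by a NumPy array might look as follows. This is not part
# of the spec; the name ``NumpyBuffer`` and the NumPy dependency are
# assumptions for the example only.
#
# import numpy as np
#
# class NumpyBuffer(Buffer):
#     def __init__(self, arr: np.ndarray) -> None:
#         if not arr.flags["C_CONTIGUOUS"]:
#             raise ValueError("Buffer data must be contiguous in memory")
#         self._arr = arr
#
#     @property
#     def bufsize(self) -> int:
#         return self._arr.nbytes  # size of the block of memory, in bytes
#
#     @property
#     def ptr(self) -> int:
#         return self._arr.__array_interface__["data"][0]  # start address
#
#     def __dlpack__(self):
#         return self._arr.__dlpack__()  # delegate to NumPy (>= 1.22)
#
#     def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
#         return (DlpackDeviceType.CPU, None)  # data always lives on the CPU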

class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
        - Kind specifiers are aligned with DLPack where possible (hence the
          jump to 20, leaving enough room for future extension)
        - Masks must be specified as boolean with either bit width 1 (for bit
          masks) or 8 (for byte masks).
        - Dtype width in bits was preferred over bytes
        - Endianness isn't too useful, but is included now in case we need to
          support non-native endianness in the future
        - Went with Apache Arrow format strings over NumPy format strings
          because they're more complete from a dataframe perspective
        - Format strings are mostly useful for datetime specification, and
          for categoricals.
        - For categoricals, the format string describes the type of the
          categorical in the data buffer. In case of a separate encoding of
          the categorical (e.g. an integer to string mapping), this can
          be derived from ``self.describe_categorical``.
        - Data types not included: complex, Arrow-style null, binary, decimal,
          and nested (list, struct, map, union) dtypes.
        """

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical.

        Returns the dictionary with description on how to interpret the data buffer:
        - "is_ordered" : bool, whether the ordering of dictionary indices is
                         semantically meaningful.
        - "is_dictionary" : bool, whether a mapping of
                            categorical values to other objects exists
        - "categories" : Column representing the (implicit) mapping of indices to
                         category values (e.g. an array of cat1, cat2, ...).
                         None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

        - "data": a two-element tuple whose first element is a buffer
                  containing the data and whose second element is the data
                  buffer's associated dtype.
        - "validity": a two-element tuple whose first element is a buffer
                      containing mask values indicating missing data and
                      whose second element is the mask value buffer's
                      associated dtype. None if the null representation is
                      not a bit or byte mask.
        - "offsets": a two-element tuple whose first element is a buffer
                     containing the offset values for variable-size binary
                     data (e.g., variable-length strings) and whose second
                     element is the offsets buffer's associated dtype. None
                     if the data buffer does not have an associated offsets
                     buffer.
        """

#     def get_children(self) -> Iterable[Column]:
#         """
#         Children columns underneath the column, each object in this iterator
#         must adhere to the column specification.
#         """
#         pass
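

# As an illustration of how a consumer might use ``get_buffers()``, the sketch
# below decodes one chunk of a UTF-8 string column. It is not part of the
# spec; the helper name ``read_string_column`` and the use of ``ctypes`` to
# view the producer's memory are assumptions for the example only.
#
# import ctypes
#
# def read_string_column(col: Column) -> list[str]:
#     buffers = col.get_buffers()
#     data_buf, _ = buffers["data"]
#     offsets_buf, _ = buffers["offsets"]  # present for variable-length strings
#     n = col.size()
#
#     # View the producer's memory through the raw pointers; the offsets are
#     # assumed to be int64 here (a real consumer must check the dtypes).
#     data = ctypes.cast(
#         data_buf.ptr, ctypes.POINTER(ctypes.c_uint8 * data_buf.bufsize)
#     ).contents
#     offsets = ctypes.cast(
#         offsets_buf.ptr, ctypes.POINTER(ctypes.c_int64 * (n + 1))
#     ).contents
#
#     # Element i occupies data[offsets[i]:offsets[i + 1]], UTF-8 encoded.
#     # A complete consumer would also honor the validity buffer according to
#     # ``describe_null`` to map masked-out elements to missing values.
#     return [
#         bytes(data[offsets[i] : offsets[i + 1]]).decode("utf-8")
#         for i in range(n)
#     ]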


class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything; they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library followed
        by a period and the desired name, e.g., ``pandas.indexcol``.
        """

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """