1"""
2A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
3"""
4
5from __future__ import annotations
6
7from abc import (
8 ABC,
9 abstractmethod,
10)
11import enum
12from typing import (
13 TYPE_CHECKING,
14 Any,
15 TypedDict,
16)
17
18if TYPE_CHECKING:
19 from collections.abc import (
20 Iterable,
21 Sequence,
22 )


class DlpackDeviceType(enum.IntEnum):
    """Integer enum for device type codes matching DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10


class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23
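

# Example (not part of the vendored spec): a sketch of the ``(kind, bit-width,
# format string, endianness)`` tuples that ``Column.dtype`` (defined below)
# returns. The format strings follow the Apache Arrow C Data Interface
# ("l" = int64, "g" = float64, "u" = utf-8 string, "b" = boolean); the exact
# entries shown here are illustrative assumptions, not normative values.
_EXAMPLE_DTYPES: dict[str, tuple[DtypeKind, int, str, str]] = {
    "int64": (DtypeKind.INT, 64, "l", "="),
    "float64": (DtypeKind.FLOAT, 64, "g", "="),
    "string": (DtypeKind.STRING, 8, "u", "="),  # 8-bit storage units in the data buffer
    "bool_bitpacked": (DtypeKind.BOOL, 1, "b", "="),  # Arrow-style bit-packed boolean
}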


class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        A bit mask: the bit at each position indicates whether the value
        is null.
    USE_BYTEMASK : int
        A byte mask: the byte at each position indicates whether the value
        is null.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4
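

# Example (not part of the vendored spec): sketches of the ``(kind, value)``
# tuples that ``Column.describe_null`` (defined below) returns for each null
# representation. The concrete values are illustrative assumptions; e.g. Arrow
# validity bitmaps use 0 to mark a null, while sentinel values are
# dtype-specific.
_EXAMPLE_NULL_DESCRIPTIONS: dict[str, tuple[ColumnNullType, Any]] = {
    "non_nullable": (ColumnNullType.NON_NULLABLE, None),
    "float_nan": (ColumnNullType.USE_NAN, None),
    "int_sentinel": (ColumnNullType.USE_SENTINEL, -1),  # -1 marks a null
    "bit_mask": (ColumnNullType.USE_BITMASK, 0),  # a 0 bit marks a null
    "byte_mask": (ColumnNullType.USE_BYTEMASK, 0),  # a 0 byte marks a null
}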


class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[Buffer, Any]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[Buffer, Any] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[Buffer, Any] | None
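

# Example (not part of the vendored spec): a minimal sketch of how the data
# and offsets buffers of a variable-length string column relate, assuming
# Arrow's variable-size binary layout. String ``i`` occupies
# ``data[offsets[i]:offsets[i + 1]]``; the helper below is hypothetical and
# builds both pieces from plain Python strings.
def _example_string_buffers(strings: list[str]) -> tuple[bytes, list[int]]:
    data = bytearray()
    offsets = [0]
    for s in strings:
        data.extend(s.encode("utf-8"))  # concatenate the UTF-8 bytes
        offsets.append(len(data))  # record where the next string starts
    return bytes(data), offsets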


class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: bool
    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Column | None
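

# Example (not part of the vendored spec): the ``CategoricalDescription`` a
# producer might return for a plain (non-dictionary) categorical; for a
# dictionary-style categorical, ``categories`` would instead be a ``Column``
# of category values.
_EXAMPLE_PLAIN_CATEGORICAL: CategoricalDescription = {
    "is_ordered": False,
    "is_dictionary": False,
    "categories": None,
}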


class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present; a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack on
    a buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:

        - TypeError : if the buffer contains unsupported dtypes.
        - NotImplementedError : if DLPack support is not implemented.

        Useful for connecting to array libraries. Support is optional because
        it's not completely trivial to implement for a Python-only library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """


class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leaving enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical.

        Returns a dictionary describing how to interpret the data buffer:
        - "is_ordered" : bool, whether the ordering of dictionary indices is
                         semantically meaningful.
        - "is_dictionary" : bool, whether a mapping of
                            categorical values to other objects exists
        - "categories" : Column representing the (implicit) mapping of indices to
                         category values (e.g. an array of cat1, cat2, ...).
                         None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

        - "data": a two-element tuple whose first element is a buffer
                  containing the data and whose second element is the data
                  buffer's associated dtype.
        - "validity": a two-element tuple whose first element is a buffer
                      containing mask values indicating missing data and
                      whose second element is the mask value buffer's
                      associated dtype. None if the null representation is
                      not a bit or byte mask.
        - "offsets": a two-element tuple whose first element is a buffer
                     containing the offset values for variable-size binary
                     data (e.g., variable-length strings) and whose second
                     element is the offsets buffer's associated dtype. None
                     if the data buffer does not have an associated offsets
                     buffer.
        """


#     def get_children(self) -> Iterable[Column]:
#         """
#         Children columns underneath the column, each object in this iterator
#         must adhere to the column specification.
#         """
#         pass
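

# Example (not part of the vendored spec): a sketch of how a consumer might
# read a single-chunk float64 column back out of the protocol, assuming a CPU
# buffer, native endianness, and no nulls. ``ctypes.from_address`` stands in
# for a real zero-copy array wrapper to keep the sketch dependency-free.
def _example_read_float64_column(col: Column) -> list[float]:
    import ctypes

    kind, bit_width, _format, _endianness = col.dtype
    if kind != DtypeKind.FLOAT or bit_width != 64:
        raise TypeError("this sketch only handles float64 columns")
    data_buffer, _dtype = col.get_buffers()["data"]
    # Honour the column's element offset into the (possibly shared) buffer.
    start = data_buffer.ptr + col.offset * (bit_width // 8)
    values = (ctypes.c_double * col.size()).from_address(start)
    return list(values)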


class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything; they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library followed
        by a period and the desired name, e.g., ``pandas.indexcol``.
        """

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """