1from __future__ import annotations
2
3import abc
4import datetime
5from functools import partial
6from io import BytesIO
7import os
8from textwrap import fill
9from types import TracebackType
10from typing import (
11 IO,
12 Any,
13 Callable,
14 Hashable,
15 Iterable,
16 List,
17 Literal,
18 Mapping,
19 Sequence,
20 Union,
21 cast,
22 overload,
23)
24import zipfile
25
26from pandas._config import config
27
28from pandas._libs import lib
29from pandas._libs.parsers import STR_NA_VALUES
30from pandas._typing import (
31 DtypeArg,
32 DtypeBackend,
33 FilePath,
34 IntStrT,
35 ReadBuffer,
36 StorageOptions,
37 WriteExcelBuffer,
38)
39from pandas.compat._optional import (
40 get_version,
41 import_optional_dependency,
42)
43from pandas.errors import EmptyDataError
44from pandas.util._decorators import (
45 Appender,
46 doc,
47)
48from pandas.util._validators import check_dtype_backend
49
50from pandas.core.dtypes.common import (
51 is_bool,
52 is_float,
53 is_integer,
54 is_list_like,
55)
56
57from pandas.core.frame import DataFrame
58from pandas.core.shared_docs import _shared_docs
59from pandas.util.version import Version
60
61from pandas.io.common import (
62 IOHandles,
63 get_handle,
64 stringify_path,
65 validate_header_arg,
66)
67from pandas.io.excel._util import (
68 fill_mi_header,
69 get_default_engine,
70 get_writer,
71 maybe_convert_usecols,
72 pop_header_name,
73)
74from pandas.io.parsers import TextParser
75from pandas.io.parsers.readers import validate_integer
76
77_read_excel_doc = (
78 """
79Read an Excel file into a pandas DataFrame.
80
81Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
82read from a local filesystem or URL. Supports an option to read
83a single sheet or a list of sheets.
84
85Parameters
86----------
87io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
88 Any valid string path is acceptable. The string could be a URL. Valid
89 URL schemes include http, ftp, s3, and file. For file URLs, a host is
90 expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
91
92 If you want to pass in a path object, pandas accepts any ``os.PathLike``.
93
94 By file-like object, we refer to objects with a ``read()`` method,
95 such as a file handle (e.g. via builtin ``open`` function)
96 or ``StringIO``.
97sheet_name : str, int, list, or None, default 0
98 Strings are used for sheet names. Integers are used in zero-indexed
99 sheet positions (chart sheets do not count as a sheet position).
100 Lists of strings/integers are used to request multiple sheets.
101 Specify None to get all worksheets.
102
103 Available cases:
104
105 * Defaults to ``0``: 1st sheet as a `DataFrame`
106 * ``1``: 2nd sheet as a `DataFrame`
107 * ``"Sheet1"``: Load sheet with name "Sheet1"
108 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
109 as a dict of `DataFrame`
110 * None: All worksheets.
111
112header : int, list of int, default 0
113 Row (0-indexed) to use for the column labels of the parsed
114 DataFrame. If a list of integers is passed those row positions will
115 be combined into a ``MultiIndex``. Use None if there is no header.
116names : array-like, default None
117 List of column names to use. If file contains no header row,
118 then you should explicitly pass header=None.
119index_col : int, list of int, default None
120 Column (0-indexed) to use as the row labels of the DataFrame.
121 Pass None if there is no such column. If a list is passed,
122 those columns will be combined into a ``MultiIndex``. If a
123 subset of data is selected with ``usecols``, index_col
124 is based on the subset.
125
126 Missing values will be forward filled to allow roundtripping with
127 ``to_excel`` for ``merged_cells=True``. To avoid forward filling the
128 missing values use ``set_index`` after reading the data instead of
129 ``index_col``.
130usecols : str, list-like, or callable, default None
131 * If None, then parse all columns.
132 * If str, then indicates comma separated list of Excel column letters
133 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
134 both sides.
135 * If list of int, then indicates list of column numbers to be parsed
136 (0-indexed).
137 * If list of string, then indicates list of column names to be parsed.
138 * If callable, then evaluate each column name against it and parse the
139 column if the callable returns ``True``.
140
141 Returns a subset of the columns according to behavior above.
142dtype : Type name or dict of column -> type, default None
143 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
144 Use `object` to preserve data as stored in Excel and not interpret dtype.
145 If converters are specified, they will be applied INSTEAD
146 of dtype conversion.
147engine : str, default None
148 If io is not a buffer or path, this must be set to identify io.
149 Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
    Engine compatibility:
151
152 - "xlrd" supports old-style Excel files (.xls).
153 - "openpyxl" supports newer Excel file formats.
154 - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
155 - "pyxlsb" supports Binary Excel files.
156
157 .. versionchanged:: 1.2.0
158 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
159 now only supports old-style ``.xls`` files.
160 When ``engine=None``, the following logic will be
161 used to determine the engine:
162
163 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
164 then `odf <https://pypi.org/project/odfpy/>`_ will be used.
165 - Otherwise if ``path_or_buffer`` is an xls format,
166 ``xlrd`` will be used.
167 - Otherwise if ``path_or_buffer`` is in xlsb format,
168 ``pyxlsb`` will be used.
169
170 .. versionadded:: 1.3.0
171 - Otherwise ``openpyxl`` will be used.
172
173 .. versionchanged:: 1.3.0
174
175converters : dict, default None
176 Dict of functions for converting values in certain columns. Keys can
177 either be integers or column labels, values are functions that take one
178 input argument, the Excel cell content, and return the transformed
179 content.
180true_values : list, default None
181 Values to consider as True.
182false_values : list, default None
183 Values to consider as False.
184skiprows : list-like, int, or callable, optional
185 Line numbers to skip (0-indexed) or number of lines to skip (int) at the
186 start of the file. If callable, the callable function will be evaluated
187 against the row indices, returning True if the row should be skipped and
188 False otherwise. An example of a valid callable argument would be ``lambda
189 x: x in [0, 2]``.
190nrows : int, default None
191 Number of rows to parse.
192na_values : scalar, str, list-like, or dict, default None
193 Additional strings to recognize as NA/NaN. If dict passed, specific
194 per-column NA values. By default the following values are interpreted
195 as NaN: '"""
196 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
197 + """'.
198keep_default_na : bool, default True
199 Whether or not to include the default NaN values when parsing the data.
200 Depending on whether `na_values` is passed in, the behavior is as follows:
201
202 * If `keep_default_na` is True, and `na_values` are specified, `na_values`
203 is appended to the default NaN values used for parsing.
204 * If `keep_default_na` is True, and `na_values` are not specified, only
205 the default NaN values are used for parsing.
206 * If `keep_default_na` is False, and `na_values` are specified, only
207 the NaN values specified `na_values` are used for parsing.
208 * If `keep_default_na` is False, and `na_values` are not specified, no
209 strings will be parsed as NaN.
210
211 Note that if `na_filter` is passed in as False, the `keep_default_na` and
212 `na_values` parameters will be ignored.
213na_filter : bool, default True
214 Detect missing value markers (empty strings and the value of na_values). In
215 data without any NAs, passing na_filter=False can improve the performance
216 of reading a large file.
217verbose : bool, default False
218 Indicate number of NA values placed in non-numeric columns.
219parse_dates : bool, list-like, or dict, default False
220 The behavior is as follows:
221
222 * bool. If True -> try parsing the index.
223 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
224 each as a separate date column.
225 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
226 a single date column.
227 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
228 result 'foo'
229
230 If a column or index contains an unparsable date, the entire column or
    index will be returned unaltered as an object data type. If you don't want to
232 parse some cells as date just change their type in Excel to "Text".
233 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
234
235 Note: A fast-path exists for iso8601-formatted dates.
236date_parser : function, optional
237 Function to use for converting a sequence of string columns to an array of
238 datetime instances. The default uses ``dateutil.parser.parser`` to do the
239 conversion. Pandas will try to call `date_parser` in three different ways,
240 advancing to the next if an exception occurs: 1) Pass one or more arrays
241 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
242 string values from the columns defined by `parse_dates` into a single array
243 and pass that; and 3) call `date_parser` once for each row using one or
244 more strings (corresponding to the columns defined by `parse_dates`) as
245 arguments.
246
247 .. deprecated:: 2.0.0
248 Use ``date_format`` instead, or read in as ``object`` and then apply
249 :func:`to_datetime` as-needed.
250date_format : str or dict of column -> format, default ``None``
251 If used in conjunction with ``parse_dates``, will parse dates according to this
252 format. For anything more complex,
253 please read in as ``object`` and then apply :func:`to_datetime` as-needed.
254
255 .. versionadded:: 2.0.0
256thousands : str, default None
257 Thousands separator for parsing string columns to numeric. Note that
258 this parameter is only necessary for columns stored as TEXT in Excel,
259 any numeric columns will automatically be parsed, regardless of display
260 format.
261decimal : str, default '.'
262 Character to recognize as decimal point for parsing string columns to numeric.
263 Note that this parameter is only necessary for columns stored as TEXT in Excel,
264 any numeric columns will automatically be parsed, regardless of display
    format (e.g. use ',' for European data).
266
267 .. versionadded:: 1.4.0
268
269comment : str, default None
270 Comments out remainder of line. Pass a character or characters to this
271 argument to indicate comments in the input file. Any data between the
272 comment string and the end of the current line is ignored.
273skipfooter : int, default 0
274 Rows at the end to skip (0-indexed).
275{storage_options}
276
277 .. versionadded:: 1.2.0
278
279dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
280 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
281 arrays, nullable dtypes are used for all dtypes that have a nullable
282 implementation when "numpy_nullable" is set, pyarrow is used for all
283 dtypes if "pyarrow" is set.
284
    The dtype_backends are still experimental.
286
287 .. versionadded:: 2.0
288
289Returns
290-------
291DataFrame or dict of DataFrames
292 DataFrame from the passed in Excel file. See notes in sheet_name
293 argument for more information on when a dict of DataFrames is returned.
294
295See Also
296--------
297DataFrame.to_excel : Write DataFrame to an Excel file.
298DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
299read_csv : Read a comma-separated values (csv) file into DataFrame.
300read_fwf : Read a table of fixed-width formatted lines into DataFrame.
301
302Examples
303--------
304The file can be read using the file name as string or an open file object:
305
306>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
307 Name Value
3080 string1 1
3091 string2 2
3102 #Comment 3
311
312>>> pd.read_excel(open('tmp.xlsx', 'rb'),
313... sheet_name='Sheet3') # doctest: +SKIP
314 Unnamed: 0 Name Value
3150 0 string1 1
3161 1 string2 2
3172 2 #Comment 3
318
319Index and header can be specified via the `index_col` and `header` arguments
320
321>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
322 0 1 2
3230 NaN Name Value
3241 0.0 string1 1
3252 1.0 string2 2
3263 2.0 #Comment 3
327
328Column types are inferred but can be explicitly specified
329
330>>> pd.read_excel('tmp.xlsx', index_col=0,
331... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP
332 Name Value
3330 string1 1.0
3341 string2 2.0
3352 #Comment 3.0
336
337True, False, and NA values, and thousands separators have defaults,
338but can be explicitly specified, too. Supply the values you would like
339as strings or lists of strings!
340
341>>> pd.read_excel('tmp.xlsx', index_col=0,
342... na_values=['string1', 'string2']) # doctest: +SKIP
343 Name Value
3440 NaN 1
3451 NaN 2
3462 #Comment 3
347
348Comment lines in the excel input file can be skipped using the `comment` kwarg
349
350>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
351 Name Value
3520 string1 1.0
3531 string2 2.0
3542 None NaN
355"""
356)
357
358
# Typing overload: a single sheet (str name or int position) yields one DataFrame.
@overload
def read_excel(
    io,
    # sheet name is str or int -> DataFrame
    sheet_name: str | int = ...,
    *,
    header: int | Sequence[int] | None = ...,
    names: list[str] | None = ...,
    index_col: int | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...
396
397
# Typing overload: a list of sheets or None (all sheets) yields a dict of
# DataFrames keyed by the requested sheet identifiers.
@overload
def read_excel(
    io,
    # sheet name is list or None -> dict[IntStrT, DataFrame]
    sheet_name: list[IntStrT] | None,
    *,
    header: int | Sequence[int] | None = ...,
    names: list[str] | None = ...,
    index_col: int | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> dict[IntStrT, DataFrame]:
    ...
435
436
@doc(storage_options=_shared_docs["storage_options"])
@Appender(_read_excel_doc)
def read_excel(
    io,
    sheet_name: str | int | list[IntStrT] | None = 0,
    *,
    header: int | Sequence[int] | None = 0,
    names: list[str] | None = None,
    index_col: int | Sequence[int] | None = None,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = None,
    dtype: DtypeArg | None = None,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None,
    converters: dict[str, Callable] | dict[int, Callable] | None = None,
    true_values: Iterable[Hashable] | None = None,
    false_values: Iterable[Hashable] | None = None,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
    nrows: int | None = None,
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    parse_dates: list | dict | bool = False,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: dict[Hashable, str] | str | None = None,
    thousands: str | None = None,
    decimal: str = ".",
    comment: str | None = None,
    skipfooter: int = 0,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | dict[IntStrT, DataFrame]:
    # Docstring is attached via @Appender(_read_excel_doc) above.
    check_dtype_backend(dtype_backend)

    # Wrap plain paths/buffers/bytes in an ExcelFile.  Track whether we
    # created it here so we only close handles that we ourselves opened;
    # a caller-supplied ExcelFile remains the caller's responsibility.
    owns_excel_file = not isinstance(io, ExcelFile)
    if owns_excel_file:
        io = ExcelFile(io, storage_options=storage_options, engine=engine)
    elif engine and engine != io.engine:
        # An ExcelFile is already bound to an engine; a conflicting request
        # here would silently be ignored, so reject it loudly instead.
        raise ValueError(
            "Engine should not be specified when passing "
            "an ExcelFile - ExcelFile already has the engine set"
        )

    try:
        result = io.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            dtype=dtype,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            keep_default_na=keep_default_na,
            na_filter=na_filter,
            verbose=verbose,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            decimal=decimal,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
        )
    finally:
        # make sure to close opened file handles
        if owns_excel_file:
            io.close()
    return result
516
517
class BaseExcelReader(metaclass=abc.ABCMeta):
    """
    Abstract base class for engine-specific Excel readers.

    Concrete subclasses (one per engine, e.g. xlrd/openpyxl/odf/pyxlsb)
    supply the workbook class, workbook loading, sheet lookup and raw cell
    extraction; this base class provides the engine-independent IO plumbing
    and the shared ``parse`` logic that turns raw rows into DataFrames.
    """

    def __init__(
        self, filepath_or_buffer, storage_options: StorageOptions = None
    ) -> None:
        """
        Open ``filepath_or_buffer`` and store the loaded workbook as ``self.book``.

        Accepts a path, an open binary buffer, raw ``bytes``, or an already
        constructed workbook object of the engine's workbook class.
        Raises ValueError if the input is none of these.
        """
        # First argument can also be bytes, so create a buffer
        if isinstance(filepath_or_buffer, bytes):
            filepath_or_buffer = BytesIO(filepath_or_buffer)

        # Default: wrap the input as-is; replaced below by get_handle() when
        # the input is a path/URL/buffer that needs opening.
        self.handles = IOHandles(
            handle=filepath_or_buffer, compression={"method": None}
        )
        if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            self.handles = get_handle(
                filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
            )

        if isinstance(self.handles.handle, self._workbook_class):
            # Already a workbook of the right engine: use it directly.
            self.book = self.handles.handle
        elif hasattr(self.handles.handle, "read"):
            # N.B. xlrd.Book has a read attribute too
            self.handles.handle.seek(0)
            try:
                self.book = self.load_workbook(self.handles.handle)
            except Exception:
                # Don't leak the file handle if the engine fails to load.
                self.close()
                raise
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )

    @property
    @abc.abstractmethod
    def _workbook_class(self):
        """Engine-specific workbook type accepted and produced by this reader."""
        pass

    @abc.abstractmethod
    def load_workbook(self, filepath_or_buffer):
        """Load and return an engine workbook from a path or open buffer."""
        pass

    def close(self) -> None:
        """Release engine workbook resources and close handles we opened."""
        if hasattr(self, "book"):
            if hasattr(self.book, "close"):
                # pyxlsb: opens a TemporaryFile
                # openpyxl: https://stackoverflow.com/questions/31416842/
                # openpyxl-does-not-close-excel-workbook-in-read-only-mode
                self.book.close()
            elif hasattr(self.book, "release_resources"):
                # xlrd
                # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548
                self.book.release_resources()
        self.handles.close()

    @property
    @abc.abstractmethod
    def sheet_names(self) -> list[str]:
        """List of worksheet names in workbook order."""
        pass

    @abc.abstractmethod
    def get_sheet_by_name(self, name: str):
        """Return the engine sheet object with the given name."""
        pass

    @abc.abstractmethod
    def get_sheet_by_index(self, index: int):
        """Return the engine sheet object at the given 0-based position."""
        pass

    @abc.abstractmethod
    def get_sheet_data(self, sheet, rows: int | None = None):
        """Return sheet contents as a list of rows (lists of cell values)."""
        pass

    def raise_if_bad_sheet_by_index(self, index: int) -> None:
        """Raise ValueError if ``index`` is out of range for this workbook."""
        n_sheets = len(self.sheet_names)
        if index >= n_sheets:
            raise ValueError(
                f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
            )

    def raise_if_bad_sheet_by_name(self, name: str) -> None:
        """Raise ValueError if no worksheet called ``name`` exists."""
        if name not in self.sheet_names:
            raise ValueError(f"Worksheet named '{name}' not found")

    def _check_skiprows_func(
        self,
        skiprows: Callable,
        rows_to_use: int,
    ) -> int:
        """
        Determine how many file rows are required to obtain `nrows` data
        rows when `skiprows` is a function.

        Parameters
        ----------
        skiprows : function
            The function passed to read_excel by the user.
        rows_to_use : int
            The number of rows that will be needed for the header and
            the data.

        Returns
        -------
        int
        """
        # Walk row indices until enough non-skipped rows have been seen.
        i = 0
        rows_used_so_far = 0
        while rows_used_so_far < rows_to_use:
            if not skiprows(i):
                rows_used_so_far += 1
            i += 1
        return i

    def _calc_rows(
        self,
        header: int | Sequence[int] | None,
        index_col: int | Sequence[int] | None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None,
        nrows: int | None,
    ) -> int | None:
        """
        If nrows specified, find the number of rows needed from the
        file, otherwise return None.


        Parameters
        ----------
        header : int, list of int, or None
            See read_excel docstring.
        index_col : int, list of int, or None
            See read_excel docstring.
        skiprows : list-like, int, callable, or None
            See read_excel docstring.
        nrows : int or None
            See read_excel docstring.

        Returns
        -------
        int or None
        """
        if nrows is None:
            # No limit requested: read the whole sheet.
            return None
        if header is None:
            header_rows = 1
        elif is_integer(header):
            header = cast(int, header)
            header_rows = 1 + header
        else:
            header = cast(Sequence, header)
            header_rows = 1 + header[-1]
        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        if is_list_like(header) and index_col is not None:
            header = cast(Sequence, header)
            if len(header) > 1:
                header_rows += 1
        if skiprows is None:
            return header_rows + nrows
        if is_integer(skiprows):
            skiprows = cast(int, skiprows)
            return header_rows + nrows + skiprows
        if is_list_like(skiprows):

            def f(skiprows: Sequence, x: int) -> bool:
                return x in skiprows

            skiprows = cast(Sequence, skiprows)
            return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
        if callable(skiprows):
            return self._check_skiprows_func(
                skiprows,
                header_rows + nrows,
            )
        # else unexpected skiprows type: read_excel will not optimize
        # the number of rows read from file
        return None

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names=None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        dtype: DtypeArg | None = None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        verbose: bool = False,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: dict[Hashable, str] | str | None = None,
        thousands: str | None = None,
        decimal: str = ".",
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ):
        """
        Parse the requested worksheet(s) into DataFrame(s).

        Returns a single DataFrame when ``sheet_name`` is a str or int, and a
        dict keyed by the requested sheet identifiers when it is a list or
        None (None means all sheets).  Extra keyword arguments are forwarded
        to TextParser.  Raises ValueError for an empty sheet list or an
        out-of-range header row.
        """
        validate_header_arg(header)
        validate_integer("nrows", nrows)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        sheets: list[int] | list[str]
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        elif isinstance(sheet_name, str):
            sheets = [sheet_name]
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = cast(Union[List[int], List[str]], list(dict.fromkeys(sheets).keys()))

        output = {}

        last_sheetname = None
        for asheetname in sheets:
            last_sheetname = asheetname
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            # Only read as many file rows as header/skiprows/nrows require.
            file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
            data = self.get_sheet_data(sheet, file_rows_needed)
            if hasattr(sheet, "close"):
                # pyxlsb opens two TemporaryFiles
                sheet.close()
            usecols = maybe_convert_usecols(usecols)

            if not data:
                # Empty sheet -> empty DataFrame.
                output[asheetname] = DataFrame()
                continue

            is_list_header = False
            is_len_one_list_header = False
            if is_list_like(header):
                assert isinstance(header, Sequence)
                is_list_header = True
                if len(header) == 1:
                    is_len_one_list_header = True

            if is_len_one_list_header:
                # A one-element header list behaves like a plain int header.
                header = cast(Sequence[int], header)[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                assert isinstance(header, Sequence)

                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        assert isinstance(skiprows, int)
                        row += skiprows

                    if row > len(data) - 1:
                        raise ValueError(
                            f"header index {row} exceeds maximum index "
                            f"{len(data) - 1} of data.",
                        )

                    data[row], control_row = fill_mi_header(data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            # If there is a MultiIndex header and an index then there is also
            # a row containing just the index name(s)
            has_index_names = False
            if is_list_header and not is_len_one_list_header and index_col is not None:
                index_col_list: Sequence[int]
                if isinstance(index_col, int):
                    index_col_list = [index_col]
                else:
                    assert isinstance(index_col, Sequence)
                    index_col_list = index_col

                # We have to handle mi without names. If any of the entries in the data
                # columns are not empty, this is a regular row
                assert isinstance(header, Sequence)
                if len(header) < len(data):
                    potential_index_names = data[len(header)]
                    potential_data = [
                        x
                        for i, x in enumerate(potential_index_names)
                        if not control_row[i] and i not in index_col_list
                    ]
                    has_index_names = all(x == "" or x is None for x in potential_data)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if header is None:
                    offset = 0
                elif isinstance(header, int):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # GH34673: if MultiIndex names present and not defined in the header,
                # offset needs to be incremented so that forward filling starts
                # from the first MI value instead of the name
                if has_index_names:
                    offset += 1

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    assert isinstance(index_col, Sequence)

                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            # GH 12292 : error when read one empty column from excel file
            try:
                # Delegate type inference, NA handling, date parsing etc. to
                # the shared CSV/text parsing machinery.
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    skip_blank_lines=False,  # GH 39808
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    date_format=date_format,
                    thousands=thousands,
                    decimal=decimal,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    dtype_backend=dtype_backend,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if header_names:
                    output[asheetname].columns = output[asheetname].columns.set_names(
                        header_names
                    )

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

            except Exception as err:
                # Annotate errors with the sheet they came from.
                err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
                raise err

        if last_sheetname is None:
            raise ValueError("Sheet name is an empty list")

        if ret_dict:
            return output
        else:
            return output[last_sheetname]
898
899
900@doc(storage_options=_shared_docs["storage_options"])
901class ExcelWriter(metaclass=abc.ABCMeta):
902 """
903 Class for writing DataFrame objects into excel sheets.
904
905 Default is to use:
906
907 * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter
908 is installed otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__
909 * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files
910
911 See ``DataFrame.to_excel`` for typical usage.
912
913 The writer should be used as a context manager. Otherwise, call `close()` to save
914 and close any opened file handles.
915
916 Parameters
917 ----------
918 path : str or typing.BinaryIO
919 Path to xls or xlsx or ods file.
920 engine : str (optional)
921 Engine to use for writing. If None, defaults to
922 ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
923 argument.
924 date_format : str, default None
925 Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
926 datetime_format : str, default None
927 Format string for datetime objects written into Excel files.
928 (e.g. 'YYYY-MM-DD HH:MM:SS').
929 mode : {{'w', 'a'}}, default 'w'
930 File mode to use (write or append). Append does not work with fsspec URLs.
931 {storage_options}
932
933 .. versionadded:: 1.2.0
934
935 if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error'
936 How to behave when trying to write to a sheet that already
937 exists (append mode only).
938
939 * error: raise a ValueError.
940 * new: Create a new sheet, with a name determined by the engine.
941 * replace: Delete the contents of the sheet before writing to it.
942 * overlay: Write contents to the existing sheet without removing the old
943 contents.
944
945 .. versionadded:: 1.3.0
946
947 .. versionchanged:: 1.4.0
948
949 Added ``overlay`` option
950
951 engine_kwargs : dict, optional
952 Keyword arguments to be passed into the engine. These will be passed to
953 the following functions of the respective engines:
954
955 * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)``
956 * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)``
957 * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)``
958 * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)``
959
960 .. versionadded:: 1.3.0
961
962 Notes
963 -----
964 For compatibility with CSV writers, ExcelWriter serializes lists
965 and dicts to strings before writing.
966
967 Examples
968 --------
969 Default usage:
970
971 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
972 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
973 ... df.to_excel(writer) # doctest: +SKIP
974
975 To write to separate sheets in a single file:
976
977 >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) # doctest: +SKIP
978 >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
979 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
980 ... df1.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP
981 ... df2.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP
982
983 You can set the date format or datetime format:
984
985 >>> from datetime import date, datetime # doctest: +SKIP
986 >>> df = pd.DataFrame(
987 ... [
988 ... [date(2014, 1, 31), date(1999, 9, 24)],
989 ... [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)],
990 ... ],
991 ... index=["Date", "Datetime"],
992 ... columns=["X", "Y"],
993 ... ) # doctest: +SKIP
994 >>> with pd.ExcelWriter(
995 ... "path_to_file.xlsx",
996 ... date_format="YYYY-MM-DD",
997 ... datetime_format="YYYY-MM-DD HH:MM:SS"
998 ... ) as writer:
999 ... df.to_excel(writer) # doctest: +SKIP
1000
1001 You can also append to an existing Excel file:
1002
1003 >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer:
1004 ... df.to_excel(writer, sheet_name="Sheet3") # doctest: +SKIP
1005
1006 Here, the `if_sheet_exists` parameter can be set to replace a sheet if it
1007 already exists:
1008
    >>> with pd.ExcelWriter(
1010 ... "path_to_file.xlsx",
1011 ... mode="a",
1012 ... engine="openpyxl",
1013 ... if_sheet_exists="replace",
1014 ... ) as writer:
1015 ... df.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP
1016
1017 You can also write multiple DataFrames to a single sheet. Note that the
1018 ``if_sheet_exists`` parameter needs to be set to ``overlay``:
1019
    >>> with pd.ExcelWriter("path_to_file.xlsx",
1021 ... mode="a",
1022 ... engine="openpyxl",
1023 ... if_sheet_exists="overlay",
1024 ... ) as writer:
1025 ... df1.to_excel(writer, sheet_name="Sheet1")
1026 ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) # doctest: +SKIP
1027
1028 You can store Excel file in RAM:
1029
1030 >>> import io
1031 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
1032 >>> buffer = io.BytesIO()
1033 >>> with pd.ExcelWriter(buffer) as writer:
1034 ... df.to_excel(writer)
1035
1036 You can pack Excel file into zip archive:
1037
1038 >>> import zipfile # doctest: +SKIP
1039 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
1040 >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf:
1041 ... with zf.open("filename.xlsx", "w") as buffer:
1042 ... with pd.ExcelWriter(buffer) as writer:
1043 ... df.to_excel(writer) # doctest: +SKIP
1044
1045 You can specify additional arguments to the underlying engine:
1046
1047 >>> with pd.ExcelWriter(
1048 ... "path_to_file.xlsx",
1049 ... engine="xlsxwriter",
1050 ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}}
1051 ... ) as writer:
1052 ... df.to_excel(writer) # doctest: +SKIP
1053
1054 In append mode, ``engine_kwargs`` are passed through to
1055 openpyxl's ``load_workbook``:
1056
1057 >>> with pd.ExcelWriter(
1058 ... "path_to_file.xlsx",
1059 ... engine="openpyxl",
1060 ... mode="a",
1061 ... engine_kwargs={{"keep_vba": True}}
1062 ... ) as writer:
1063 ... df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP
1064 """
1065
    # Defining an ExcelWriter implementation (see abstract methods for more...)

    # - Mandatory
    #   - ``_write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
    #     freeze_panes=None)``
    #     --> called to write additional DataFrames to disk
    #   - ``_supported_extensions`` (tuple of supported extensions), used to
    #     check that engine supports the given extension.
    #   - ``_engine`` - string that gives the engine name. Necessary to
    #     instantiate class directly and bypass the ``ExcelWriter.__new__``
    #     engine lookup.
    #   - ``_save(self)`` --> called to save file to disk
    # - Mostly mandatory (i.e. should at least exist)
    #   - book, _cur_sheet, path

    # - Optional:
    #   - ``__init__(self, path, engine=None, **kwargs)`` --> always called
    #     with path as first argument.

    # You also need to register the class with ``register_writer()``.
    # Technically, ExcelWriter implementations don't need to subclass
    # ExcelWriter.
1087
    # Engine name for this concrete writer; also used to instantiate the
    # class directly, bypassing the engine lookup in ``__new__``.
    _engine: str
    # Extensions accepted by this concrete writer (see ``check_extension``).
    _supported_extensions: tuple[str, ...]
1090
1091 def __new__(
1092 cls: type[ExcelWriter],
1093 path: FilePath | WriteExcelBuffer | ExcelWriter,
1094 engine: str | None = None,
1095 date_format: str | None = None,
1096 datetime_format: str | None = None,
1097 mode: str = "w",
1098 storage_options: StorageOptions = None,
1099 if_sheet_exists: Literal["error", "new", "replace", "overlay"] | None = None,
1100 engine_kwargs: dict | None = None,
1101 ) -> ExcelWriter:
1102 # only switch class if generic(ExcelWriter)
1103 if cls is ExcelWriter:
1104 if engine is None or (isinstance(engine, str) and engine == "auto"):
1105 if isinstance(path, str):
1106 ext = os.path.splitext(path)[-1][1:]
1107 else:
1108 ext = "xlsx"
1109
1110 try:
1111 engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
1112 if engine == "auto":
1113 engine = get_default_engine(ext, mode="writer")
1114 except KeyError as err:
1115 raise ValueError(f"No engine for filetype: '{ext}'") from err
1116
1117 # for mypy
1118 assert engine is not None
1119 cls = get_writer(engine)
1120
1121 return object.__new__(cls)
1122
    # Declare external properties you can count on: ``_path`` always exists
    # on instances, defaulting to None until a path is bound.
    _path = None
1125
1126 @property
1127 def supported_extensions(self) -> tuple[str, ...]:
1128 """Extensions that writer engine supports."""
1129 return self._supported_extensions
1130
1131 @property
1132 def engine(self) -> str:
1133 """Name of engine."""
1134 return self._engine
1135
    @property
    @abc.abstractmethod
    def sheets(self) -> dict[str, Any]:
        """
        Mapping of sheet names to sheet objects.

        The sheet-object type depends on the concrete writer engine.
        """
1140
    @property
    @abc.abstractmethod
    def book(self):
        """
        Book instance. Class type will depend on the engine used.

        This attribute can be used to access engine-specific features.
        Implemented by each concrete writer engine.
        """
1149
    @abc.abstractmethod
    def _write_cells(
        self,
        cells,
        sheet_name: str | None = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: tuple[int, int] | None = None,
    ) -> None:
        """
        Write given formatted cells into an Excel sheet.

        Parameters
        ----------
        cells : generator
            cell of formatted data to save to Excel sheet
        sheet_name : str, default None
            Name of Excel sheet, if None, then use self._cur_sheet
        startrow : int, default 0
            upper left cell row to dump data frame
        startcol : int, default 0
            upper left cell column to dump data frame
        freeze_panes : tuple of (int, int), optional
            contains the bottom-most row and right-most column to freeze
        """
1173
    @abc.abstractmethod
    def _save(self) -> None:
        """
        Save workbook to disk.

        Invoked by :meth:`close`; each concrete engine implements the
        actual serialization of the in-memory book.
        """
1179
1180 def __init__(
1181 self,
1182 path: FilePath | WriteExcelBuffer | ExcelWriter,
1183 engine: str | None = None,
1184 date_format: str | None = None,
1185 datetime_format: str | None = None,
1186 mode: str = "w",
1187 storage_options: StorageOptions = None,
1188 if_sheet_exists: str | None = None,
1189 engine_kwargs: dict[str, Any] | None = None,
1190 ) -> None:
1191 # validate that this engine can handle the extension
1192 if isinstance(path, str):
1193 ext = os.path.splitext(path)[-1]
1194 self.check_extension(ext)
1195
1196 # use mode to open the file
1197 if "b" not in mode:
1198 mode += "b"
1199 # use "a" for the user to append data to excel but internally use "r+" to let
1200 # the excel backend first read the existing file and then write any data to it
1201 mode = mode.replace("a", "r+")
1202
1203 if if_sheet_exists not in (None, "error", "new", "replace", "overlay"):
1204 raise ValueError(
1205 f"'{if_sheet_exists}' is not valid for if_sheet_exists. "
1206 "Valid options are 'error', 'new', 'replace' and 'overlay'."
1207 )
1208 if if_sheet_exists and "r+" not in mode:
1209 raise ValueError("if_sheet_exists is only valid in append mode (mode='a')")
1210 if if_sheet_exists is None:
1211 if_sheet_exists = "error"
1212 self._if_sheet_exists = if_sheet_exists
1213
1214 # cast ExcelWriter to avoid adding 'if self._handles is not None'
1215 self._handles = IOHandles(
1216 cast(IO[bytes], path), compression={"compression": None}
1217 )
1218 if not isinstance(path, ExcelWriter):
1219 self._handles = get_handle(
1220 path, mode, storage_options=storage_options, is_text=False
1221 )
1222 self._cur_sheet = None
1223
1224 if date_format is None:
1225 self._date_format = "YYYY-MM-DD"
1226 else:
1227 self._date_format = date_format
1228 if datetime_format is None:
1229 self._datetime_format = "YYYY-MM-DD HH:MM:SS"
1230 else:
1231 self._datetime_format = datetime_format
1232
1233 self._mode = mode
1234
    @property
    def date_format(self) -> str:
        """
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
        """
        return self._date_format
1241
    @property
    def datetime_format(self) -> str:
        """
        Format string for datetimes written into Excel files
        (e.g. 'YYYY-MM-DD HH:MM:SS').
        """
        return self._datetime_format
1248
1249 @property
1250 def if_sheet_exists(self) -> str:
1251 """
1252 How to behave when writing to a sheet that already exists in append mode.
1253 """
1254 return self._if_sheet_exists
1255
1256 def __fspath__(self) -> str:
1257 return getattr(self._handles.handle, "name", "")
1258
1259 def _get_sheet_name(self, sheet_name: str | None) -> str:
1260 if sheet_name is None:
1261 sheet_name = self._cur_sheet
1262 if sheet_name is None: # pragma: no cover
1263 raise ValueError("Must pass explicit sheet_name or set _cur_sheet property")
1264 return sheet_name
1265
1266 def _value_with_fmt(self, val) -> tuple[object, str | None]:
1267 """
1268 Convert numpy types to Python types for the Excel writers.
1269
1270 Parameters
1271 ----------
1272 val : object
1273 Value to be written into cells
1274
1275 Returns
1276 -------
1277 Tuple with the first element being the converted value and the second
1278 being an optional format
1279 """
1280 fmt = None
1281
1282 if is_integer(val):
1283 val = int(val)
1284 elif is_float(val):
1285 val = float(val)
1286 elif is_bool(val):
1287 val = bool(val)
1288 elif isinstance(val, datetime.datetime):
1289 fmt = self._datetime_format
1290 elif isinstance(val, datetime.date):
1291 fmt = self._date_format
1292 elif isinstance(val, datetime.timedelta):
1293 val = val.total_seconds() / 86400
1294 fmt = "0"
1295 else:
1296 val = str(val)
1297
1298 return val, fmt
1299
1300 @classmethod
1301 def check_extension(cls, ext: str) -> Literal[True]:
1302 """
1303 checks that path's extension against the Writer's supported
1304 extensions. If it isn't supported, raises UnsupportedFiletypeError.
1305 """
1306 if ext.startswith("."):
1307 ext = ext[1:]
1308 if not any(ext in extension for extension in cls._supported_extensions):
1309 raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
1310 return True
1311
1312 # Allow use as a contextmanager
1313 def __enter__(self) -> ExcelWriter:
1314 return self
1315
1316 def __exit__(
1317 self,
1318 exc_type: type[BaseException] | None,
1319 exc_value: BaseException | None,
1320 traceback: TracebackType | None,
1321 ) -> None:
1322 self.close()
1323
1324 def close(self) -> None:
1325 """synonym for save, to make it more file-like"""
1326 self._save()
1327 self._handles.close()
1328
1329
# Magic-number signatures used to sniff the on-disk format.
XLS_SIGNATURES = (
    b"\x09\x00\x04\x00\x07\x00\x10\x00",  # BIFF2
    b"\x09\x02\x06\x00\x00\x00\x10\x00",  # BIFF3
    b"\x09\x04\x06\x00\x00\x00\x10\x00",  # BIFF4
    b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",  # Compound File Binary
)
ZIP_SIGNATURE = b"PK\x03\x04"
# Longest signature; the number of bytes peeked from the stream.
PEEK_SIZE = max(len(sig) for sig in XLS_SIGNATURES + (ZIP_SIGNATURE,))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    content_or_path: FilePath | ReadBuffer[bytes],
    storage_options: StorageOptions = None,
) -> str | None:
    """
    Inspect the path or content of an excel file and get its format.

    Adopted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    content_or_path : str or file-like object
        Path to file or content of file to inspect. May be a URL.
    {storage_options}

    Returns
    -------
    str or None
        Format of file if it can be determined.

    Raises
    ------
    ValueError
        If resulting stream is empty.
    BadZipFile
        If resulting stream does not have an XLS signature and is not a valid zipfile.
    """
    # Raw bytes are wrapped so get_handle sees a readable binary buffer.
    if isinstance(content_or_path, bytes):
        content_or_path = BytesIO(content_or_path)

    with get_handle(
        content_or_path, "rb", storage_options=storage_options, is_text=False
    ) as handle:
        stream = handle.handle
        stream.seek(0)
        peek = stream.read(PEEK_SIZE)
        if peek is None:
            raise ValueError("stream is empty")
        assert isinstance(peek, bytes)
        stream.seek(0)

        # Old-style binary .xls is identified purely by its signature.
        if peek.startswith(XLS_SIGNATURES):
            return "xls"
        if not peek.startswith(ZIP_SIGNATURE):
            return None

        # Everything else of interest is a zip container; distinguish
        # xlsx / xlsb / ods from a plain zip by the member names.
        with zipfile.ZipFile(stream) as zf:
            # Workaround for some third party files that use forward
            # slashes and lower case names.
            component_names = {
                name.replace("\\", "/").lower() for name in zf.namelist()
            }

        if "xl/workbook.xml" in component_names:
            return "xlsx"
        if "xl/workbook.bin" in component_names:
            return "xlsb"
        if "content.xml" in component_names:
            return "ods"
        return "zip"
1402
1403
class ExcelFile:
    """
    Class for parsing tabular Excel sheets into DataFrame objects.

    See read_excel for more documentation.

    Parameters
    ----------
    path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
        A file-like object, xlrd workbook or openpyxl workbook.
        If a string or path object, expected to be a path to a
        .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
    engine : str, default None
        If io is not a buffer or path, this must be set to identify io.
        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
        Engine compatibility :

        - ``xlrd`` supports old-style Excel files (.xls).
        - ``openpyxl`` supports newer Excel file formats.
        - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
        - ``pyxlsb`` supports Binary Excel files.

        .. versionchanged:: 1.2.0

           The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
           now only supports old-style ``.xls`` files.
           When ``engine=None``, the following logic will be
           used to determine the engine:

           - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
             then `odf <https://pypi.org/project/odfpy/>`_ will be used.
           - Otherwise if ``path_or_buffer`` is an xls format,
             ``xlrd`` will be used.
           - Otherwise if ``path_or_buffer`` is in xlsb format,
             `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.

           .. versionadded:: 1.3.0

           - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
             then ``openpyxl`` will be used.
           - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.

           .. warning::

            Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
            This is not supported, switch to using ``openpyxl`` instead.
    """

    from pandas.io.excel._odfreader import ODFReader
    from pandas.io.excel._openpyxl import OpenpyxlReader
    from pandas.io.excel._pyxlsb import PyxlsbReader
    from pandas.io.excel._xlrd import XlrdReader

    # Registry of reader engines; ``engine`` arguments are validated and
    # dispatched against these keys.
    _engines: Mapping[str, Any] = {
        "xlrd": XlrdReader,
        "openpyxl": OpenpyxlReader,
        "odf": ODFReader,
        "pyxlsb": PyxlsbReader,
    }

    def __init__(
        self,
        path_or_buffer,
        engine: str | None = None,
        storage_options: StorageOptions = None,
    ) -> None:
        """
        Open ``path_or_buffer`` with the requested (or inferred) reader engine.

        Raises
        ------
        ValueError
            If ``engine`` is unknown, or the file format cannot be
            determined when ``engine`` is None.
        """
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # First argument can also be bytes, so create a buffer
        if isinstance(path_or_buffer, bytes):
            path_or_buffer = BytesIO(path_or_buffer)

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Path-like objects are converted to str by stringify_path; other
        # buffers pass through unchanged.
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if import_optional_dependency("xlrd", errors="ignore") is None:
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = Version(get_version(xlrd))

        if engine is None:
            # Only determine ext if it is needed
            ext: str | None
            if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
                # An already-open xlrd workbook is necessarily old-style xls.
                ext = "xls"
            else:
                # Sniff the format from the stream's magic bytes.
                ext = inspect_excel_format(
                    content_or_path=path_or_buffer, storage_options=storage_options
                )
                if ext is None:
                    raise ValueError(
                        "Excel file format cannot be determined, you must specify "
                        "an engine manually."
                    )

            # Map the detected format to the configured reader engine.
            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        assert engine is not None
        self.engine = engine
        self.storage_options = storage_options

        self._reader = self._engines[engine](self._io, storage_options=storage_options)

    def __fspath__(self):
        # os.PathLike support; returns the normalized path (or the buffer
        # itself for non-path inputs).
        return self._io

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names=None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        converters=None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: str | dict[Hashable, str] | None = None,
        thousands: str | None = None,
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
        """
        Parse specified sheet(s) into a DataFrame.

        Equivalent to read_excel(ExcelFile, ...)  See the read_excel
        docstring for more info on accepted parameters.

        Returns
        -------
        DataFrame or dict of DataFrames
            DataFrame from the passed in Excel file.
        """
        # All parsing is delegated to the engine-specific reader.
        return self._reader.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
            **kwds,
        )

    @property
    def book(self):
        """Workbook object of the underlying reader engine."""
        return self._reader.book

    @property
    def sheet_names(self):
        """Sheet names as reported by the underlying reader engine."""
        return self._reader.sheet_names

    def close(self) -> None:
        """close io if necessary"""
        self._reader.close()

    def __enter__(self) -> ExcelFile:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        # Ensure the reader's handles are released on context exit.
        self.close()