1from __future__ import annotations
2
3import abc
4import datetime
5from functools import partial
6from io import BytesIO
7import os
8from textwrap import fill
9from types import TracebackType
10from typing import (
11 IO,
12 Any,
13 Callable,
14 Hashable,
15 Iterable,
16 List,
17 Literal,
18 Mapping,
19 Sequence,
20 Union,
21 cast,
22 overload,
23)
24import zipfile
25
26from pandas._config import config
27
28from pandas._libs import lib
29from pandas._libs.parsers import STR_NA_VALUES
30from pandas._typing import (
31 DtypeArg,
32 DtypeBackend,
33 FilePath,
34 IntStrT,
35 ReadBuffer,
36 StorageOptions,
37 WriteExcelBuffer,
38)
39from pandas.compat._optional import (
40 get_version,
41 import_optional_dependency,
42)
43from pandas.errors import EmptyDataError
44from pandas.util._decorators import (
45 Appender,
46 doc,
47)
48from pandas.util._validators import check_dtype_backend
49
50from pandas.core.dtypes.common import (
51 is_bool,
52 is_float,
53 is_integer,
54 is_list_like,
55)
56
57from pandas.core.frame import DataFrame
58from pandas.core.shared_docs import _shared_docs
59from pandas.util.version import Version
60
61from pandas.io.common import (
62 IOHandles,
63 get_handle,
64 stringify_path,
65 validate_header_arg,
66)
67from pandas.io.excel._util import (
68 fill_mi_header,
69 get_default_engine,
70 get_writer,
71 maybe_convert_usecols,
72 pop_header_name,
73)
74from pandas.io.parsers import TextParser
75from pandas.io.parsers.readers import validate_integer
76
77_read_excel_doc = (
78 """
79Read an Excel file into a pandas DataFrame.
80
81Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
82read from a local filesystem or URL. Supports an option to read
83a single sheet or a list of sheets.
84
85Parameters
86----------
87io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
88 Any valid string path is acceptable. The string could be a URL. Valid
89 URL schemes include http, ftp, s3, and file. For file URLs, a host is
90 expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
91
92 If you want to pass in a path object, pandas accepts any ``os.PathLike``.
93
94 By file-like object, we refer to objects with a ``read()`` method,
95 such as a file handle (e.g. via builtin ``open`` function)
96 or ``StringIO``.
97sheet_name : str, int, list, or None, default 0
98 Strings are used for sheet names. Integers are used in zero-indexed
99 sheet positions (chart sheets do not count as a sheet position).
100 Lists of strings/integers are used to request multiple sheets.
101 Specify None to get all worksheets.
102
103 Available cases:
104
105 * Defaults to ``0``: 1st sheet as a `DataFrame`
106 * ``1``: 2nd sheet as a `DataFrame`
107 * ``"Sheet1"``: Load sheet with name "Sheet1"
108 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
109 as a dict of `DataFrame`
110 * None: All worksheets.
111
112header : int, list of int, default 0
113 Row (0-indexed) to use for the column labels of the parsed
114 DataFrame. If a list of integers is passed those row positions will
115 be combined into a ``MultiIndex``. Use None if there is no header.
116names : array-like, default None
117 List of column names to use. If file contains no header row,
118 then you should explicitly pass header=None.
119index_col : int, list of int, default None
120 Column (0-indexed) to use as the row labels of the DataFrame.
121 Pass None if there is no such column. If a list is passed,
122 those columns will be combined into a ``MultiIndex``. If a
123 subset of data is selected with ``usecols``, index_col
124 is based on the subset.
125
126 Missing values will be forward filled to allow roundtripping with
127 ``to_excel`` for ``merged_cells=True``. To avoid forward filling the
128 missing values use ``set_index`` after reading the data instead of
129 ``index_col``.
130usecols : str, list-like, or callable, default None
131 * If None, then parse all columns.
132 * If str, then indicates comma separated list of Excel column letters
133 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
134 both sides.
135 * If list of int, then indicates list of column numbers to be parsed
136 (0-indexed).
137 * If list of string, then indicates list of column names to be parsed.
138 * If callable, then evaluate each column name against it and parse the
139 column if the callable returns ``True``.
140
141 Returns a subset of the columns according to behavior above.
142dtype : Type name or dict of column -> type, default None
143 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
144 Use `object` to preserve data as stored in Excel and not interpret dtype.
145 If converters are specified, they will be applied INSTEAD
146 of dtype conversion.
147engine : str, default None
148 If io is not a buffer or path, this must be set to identify io.
149 Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
    Engine compatibility:
151
152 - "xlrd" supports old-style Excel files (.xls).
153 - "openpyxl" supports newer Excel file formats.
154 - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
155 - "pyxlsb" supports Binary Excel files.
156
157 .. versionchanged:: 1.2.0
158 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
159 now only supports old-style ``.xls`` files.
160 When ``engine=None``, the following logic will be
161 used to determine the engine:
162
163 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
164 then `odf <https://pypi.org/project/odfpy/>`_ will be used.
165 - Otherwise if ``path_or_buffer`` is an xls format,
166 ``xlrd`` will be used.
167 - Otherwise if ``path_or_buffer`` is in xlsb format,
168 ``pyxlsb`` will be used.
169
170 .. versionadded:: 1.3.0
171 - Otherwise ``openpyxl`` will be used.
172
173 .. versionchanged:: 1.3.0
174
175converters : dict, default None
176 Dict of functions for converting values in certain columns. Keys can
177 either be integers or column labels, values are functions that take one
178 input argument, the Excel cell content, and return the transformed
179 content.
180true_values : list, default None
181 Values to consider as True.
182false_values : list, default None
183 Values to consider as False.
184skiprows : list-like, int, or callable, optional
185 Line numbers to skip (0-indexed) or number of lines to skip (int) at the
186 start of the file. If callable, the callable function will be evaluated
187 against the row indices, returning True if the row should be skipped and
188 False otherwise. An example of a valid callable argument would be ``lambda
189 x: x in [0, 2]``.
190nrows : int, default None
191 Number of rows to parse.
192na_values : scalar, str, list-like, or dict, default None
193 Additional strings to recognize as NA/NaN. If dict passed, specific
194 per-column NA values. By default the following values are interpreted
195 as NaN: '"""
196 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
197 + """'.
198keep_default_na : bool, default True
199 Whether or not to include the default NaN values when parsing the data.
200 Depending on whether `na_values` is passed in, the behavior is as follows:
201
202 * If `keep_default_na` is True, and `na_values` are specified, `na_values`
203 is appended to the default NaN values used for parsing.
204 * If `keep_default_na` is True, and `na_values` are not specified, only
205 the default NaN values are used for parsing.
206 * If `keep_default_na` is False, and `na_values` are specified, only
207 the NaN values specified `na_values` are used for parsing.
208 * If `keep_default_na` is False, and `na_values` are not specified, no
209 strings will be parsed as NaN.
210
211 Note that if `na_filter` is passed in as False, the `keep_default_na` and
212 `na_values` parameters will be ignored.
213na_filter : bool, default True
214 Detect missing value markers (empty strings and the value of na_values). In
215 data without any NAs, passing na_filter=False can improve the performance
216 of reading a large file.
217verbose : bool, default False
218 Indicate number of NA values placed in non-numeric columns.
219parse_dates : bool, list-like, or dict, default False
220 The behavior is as follows:
221
222 * bool. If True -> try parsing the index.
223 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
224 each as a separate date column.
225 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
226 a single date column.
227 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
228 result 'foo'
229
230 If a column or index contains an unparsable date, the entire column or
    index will be returned unaltered as an object data type. If you don't want to
232 parse some cells as date just change their type in Excel to "Text".
233 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
234
235 Note: A fast-path exists for iso8601-formatted dates.
236date_parser : function, optional
237 Function to use for converting a sequence of string columns to an array of
238 datetime instances. The default uses ``dateutil.parser.parser`` to do the
239 conversion. Pandas will try to call `date_parser` in three different ways,
240 advancing to the next if an exception occurs: 1) Pass one or more arrays
241 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
242 string values from the columns defined by `parse_dates` into a single array
243 and pass that; and 3) call `date_parser` once for each row using one or
244 more strings (corresponding to the columns defined by `parse_dates`) as
245 arguments.
246
247 .. deprecated:: 2.0.0
248 Use ``date_format`` instead, or read in as ``object`` and then apply
249 :func:`to_datetime` as-needed.
250date_format : str or dict of column -> format, default ``None``
251 If used in conjunction with ``parse_dates``, will parse dates according to this
252 format. For anything more complex,
253 please read in as ``object`` and then apply :func:`to_datetime` as-needed.
254
255 .. versionadded:: 2.0.0
256thousands : str, default None
257 Thousands separator for parsing string columns to numeric. Note that
258 this parameter is only necessary for columns stored as TEXT in Excel,
259 any numeric columns will automatically be parsed, regardless of display
260 format.
261decimal : str, default '.'
262 Character to recognize as decimal point for parsing string columns to numeric.
263 Note that this parameter is only necessary for columns stored as TEXT in Excel,
264 any numeric columns will automatically be parsed, regardless of display
    format (e.g. use ',' for European data).
266
267 .. versionadded:: 1.4.0
268
269comment : str, default None
270 Comments out remainder of line. Pass a character or characters to this
271 argument to indicate comments in the input file. Any data between the
272 comment string and the end of the current line is ignored.
273skipfooter : int, default 0
274 Rows at the end to skip (0-indexed).
275{storage_options}
276
277 .. versionadded:: 1.2.0
278
279dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
280 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
281 arrays, nullable dtypes are used for all dtypes that have a nullable
282 implementation when "numpy_nullable" is set, pyarrow is used for all
283 dtypes if "pyarrow" is set.
284
    The dtype_backends are still experimental.
286
287 .. versionadded:: 2.0
288
289Returns
290-------
291DataFrame or dict of DataFrames
292 DataFrame from the passed in Excel file. See notes in sheet_name
293 argument for more information on when a dict of DataFrames is returned.
294
295See Also
296--------
297DataFrame.to_excel : Write DataFrame to an Excel file.
298DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
299read_csv : Read a comma-separated values (csv) file into DataFrame.
300read_fwf : Read a table of fixed-width formatted lines into DataFrame.
301
302Examples
303--------
304The file can be read using the file name as string or an open file object:
305
306>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
307 Name Value
3080 string1 1
3091 string2 2
3102 #Comment 3
311
312>>> pd.read_excel(open('tmp.xlsx', 'rb'),
313... sheet_name='Sheet3') # doctest: +SKIP
314 Unnamed: 0 Name Value
3150 0 string1 1
3161 1 string2 2
3172 2 #Comment 3
318
319Index and header can be specified via the `index_col` and `header` arguments
320
321>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
322 0 1 2
3230 NaN Name Value
3241 0.0 string1 1
3252 1.0 string2 2
3263 2.0 #Comment 3
327
328Column types are inferred but can be explicitly specified
329
330>>> pd.read_excel('tmp.xlsx', index_col=0,
331... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP
332 Name Value
3330 string1 1.0
3341 string2 2.0
3352 #Comment 3.0
336
337True, False, and NA values, and thousands separators have defaults,
338but can be explicitly specified, too. Supply the values you would like
339as strings or lists of strings!
340
341>>> pd.read_excel('tmp.xlsx', index_col=0,
342... na_values=['string1', 'string2']) # doctest: +SKIP
343 Name Value
3440 NaN 1
3451 NaN 2
3462 #Comment 3
347
348Comment lines in the excel input file can be skipped using the `comment` kwarg
349
350>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
351 Name Value
3520 string1 1.0
3531 string2 2.0
3542 None NaN
355"""
356)
357
358
# Typing overload: a single sheet (str name or int position) yields one DataFrame.
@overload
def read_excel(
    io,
    # sheet name is str or int -> DataFrame
    sheet_name: str | int = ...,
    *,
    header: int | Sequence[int] | None = ...,
    names: list[str] | None = ...,
    index_col: int | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...
396
397
# Typing overload: a list of sheets or None (all sheets) yields a dict of
# DataFrames keyed by the requested sheet identifiers.
@overload
def read_excel(
    io,
    # sheet name is list or None -> dict[IntStrT, DataFrame]
    sheet_name: list[IntStrT] | None,
    *,
    header: int | Sequence[int] | None = ...,
    names: list[str] | None = ...,
    index_col: int | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> dict[IntStrT, DataFrame]:
    ...
435
436
@doc(storage_options=_shared_docs["storage_options"])
@Appender(_read_excel_doc)
def read_excel(
    io,
    sheet_name: str | int | list[IntStrT] | None = 0,
    *,
    header: int | Sequence[int] | None = 0,
    names: list[str] | None = None,
    index_col: int | Sequence[int] | None = None,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = None,
    dtype: DtypeArg | None = None,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None,
    converters: dict[str, Callable] | dict[int, Callable] | None = None,
    true_values: Iterable[Hashable] | None = None,
    false_values: Iterable[Hashable] | None = None,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
    nrows: int | None = None,
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    parse_dates: list | dict | bool = False,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: dict[Hashable, str] | str | None = None,
    thousands: str | None = None,
    decimal: str = ".",
    comment: str | None = None,
    skipfooter: int = 0,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | dict[IntStrT, DataFrame]:
    # Docstring is attached via @Appender(_read_excel_doc) above.
    check_dtype_backend(dtype_backend)

    # Wrap plain paths/buffers/bytes in an ExcelFile.  Track whether we
    # created it here so we only close handles that we ourselves opened;
    # a caller-supplied ExcelFile remains the caller's responsibility.
    owns_excel_file = not isinstance(io, ExcelFile)
    if owns_excel_file:
        io = ExcelFile(io, storage_options=storage_options, engine=engine)
    elif engine and engine != io.engine:
        # An ExcelFile is already bound to an engine; a conflicting request
        # here would silently be ignored, so reject it loudly instead.
        raise ValueError(
            "Engine should not be specified when passing "
            "an ExcelFile - ExcelFile already has the engine set"
        )

    try:
        result = io.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            dtype=dtype,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            keep_default_na=keep_default_na,
            na_filter=na_filter,
            verbose=verbose,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            decimal=decimal,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
        )
    finally:
        # make sure to close opened file handles
        if owns_excel_file:
            io.close()
    return result
516
517
class BaseExcelReader(metaclass=abc.ABCMeta):
    """
    Abstract base class for engine-specific Excel readers.

    Concrete subclasses (one per engine, e.g. xlrd/openpyxl/odf/pyxlsb)
    supply the workbook class, workbook loading, sheet lookup and raw cell
    extraction; this base class provides the engine-independent IO plumbing
    and the shared ``parse`` logic that turns raw rows into DataFrames.
    """

    def __init__(
        self, filepath_or_buffer, storage_options: StorageOptions = None
    ) -> None:
        """
        Open ``filepath_or_buffer`` and store the loaded workbook as ``self.book``.

        Accepts a path, an open binary buffer, raw ``bytes``, or an already
        constructed workbook object of the engine's workbook class.
        Raises ValueError if the input is none of these.
        """
        # First argument can also be bytes, so create a buffer
        if isinstance(filepath_or_buffer, bytes):
            filepath_or_buffer = BytesIO(filepath_or_buffer)

        # Default: wrap the input as-is; replaced below by get_handle() when
        # the input is a path/URL/buffer that needs opening.
        self.handles = IOHandles(
            handle=filepath_or_buffer, compression={"method": None}
        )
        if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            self.handles = get_handle(
                filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
            )

        if isinstance(self.handles.handle, self._workbook_class):
            # Already a workbook of the right engine: use it directly.
            self.book = self.handles.handle
        elif hasattr(self.handles.handle, "read"):
            # N.B. xlrd.Book has a read attribute too
            self.handles.handle.seek(0)
            try:
                self.book = self.load_workbook(self.handles.handle)
            except Exception:
                # Don't leak the file handle if the engine fails to load.
                self.close()
                raise
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )

    @property
    @abc.abstractmethod
    def _workbook_class(self):
        """Engine-specific workbook type accepted and produced by this reader."""
        pass

    @abc.abstractmethod
    def load_workbook(self, filepath_or_buffer):
        """Load and return an engine workbook from a path or open buffer."""
        pass

    def close(self) -> None:
        """Release engine workbook resources and close handles we opened."""
        if hasattr(self, "book"):
            if hasattr(self.book, "close"):
                # pyxlsb: opens a TemporaryFile
                # openpyxl: https://stackoverflow.com/questions/31416842/
                # openpyxl-does-not-close-excel-workbook-in-read-only-mode
                self.book.close()
            elif hasattr(self.book, "release_resources"):
                # xlrd
                # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548
                self.book.release_resources()
        self.handles.close()

    @property
    @abc.abstractmethod
    def sheet_names(self) -> list[str]:
        """List of worksheet names in workbook order."""
        pass

    @abc.abstractmethod
    def get_sheet_by_name(self, name: str):
        """Return the engine sheet object with the given name."""
        pass

    @abc.abstractmethod
    def get_sheet_by_index(self, index: int):
        """Return the engine sheet object at the given 0-based position."""
        pass

    @abc.abstractmethod
    def get_sheet_data(self, sheet, rows: int | None = None):
        """Return sheet contents as a list of rows (lists of cell values)."""
        pass

    def raise_if_bad_sheet_by_index(self, index: int) -> None:
        """Raise ValueError if ``index`` is out of range for this workbook."""
        n_sheets = len(self.sheet_names)
        if index >= n_sheets:
            raise ValueError(
                f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
            )

    def raise_if_bad_sheet_by_name(self, name: str) -> None:
        """Raise ValueError if no worksheet called ``name`` exists."""
        if name not in self.sheet_names:
            raise ValueError(f"Worksheet named '{name}' not found")

    def _check_skiprows_func(
        self,
        skiprows: Callable,
        rows_to_use: int,
    ) -> int:
        """
        Determine how many file rows are required to obtain `nrows` data
        rows when `skiprows` is a function.

        Parameters
        ----------
        skiprows : function
            The function passed to read_excel by the user.
        rows_to_use : int
            The number of rows that will be needed for the header and
            the data.

        Returns
        -------
        int
        """
        # Walk row indices until enough non-skipped rows have been seen.
        i = 0
        rows_used_so_far = 0
        while rows_used_so_far < rows_to_use:
            if not skiprows(i):
                rows_used_so_far += 1
            i += 1
        return i

    def _calc_rows(
        self,
        header: int | Sequence[int] | None,
        index_col: int | Sequence[int] | None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None,
        nrows: int | None,
    ) -> int | None:
        """
        If nrows specified, find the number of rows needed from the
        file, otherwise return None.


        Parameters
        ----------
        header : int, list of int, or None
            See read_excel docstring.
        index_col : int, list of int, or None
            See read_excel docstring.
        skiprows : list-like, int, callable, or None
            See read_excel docstring.
        nrows : int or None
            See read_excel docstring.

        Returns
        -------
        int or None
        """
        if nrows is None:
            # No limit requested: read the whole sheet.
            return None
        if header is None:
            header_rows = 1
        elif is_integer(header):
            header = cast(int, header)
            header_rows = 1 + header
        else:
            header = cast(Sequence, header)
            header_rows = 1 + header[-1]
        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        if is_list_like(header) and index_col is not None:
            header = cast(Sequence, header)
            if len(header) > 1:
                header_rows += 1
        if skiprows is None:
            return header_rows + nrows
        if is_integer(skiprows):
            skiprows = cast(int, skiprows)
            return header_rows + nrows + skiprows
        if is_list_like(skiprows):

            def f(skiprows: Sequence, x: int) -> bool:
                return x in skiprows

            skiprows = cast(Sequence, skiprows)
            return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
        if callable(skiprows):
            return self._check_skiprows_func(
                skiprows,
                header_rows + nrows,
            )
        # else unexpected skiprows type: read_excel will not optimize
        # the number of rows read from file
        return None

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names=None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        dtype: DtypeArg | None = None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        verbose: bool = False,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: dict[Hashable, str] | str | None = None,
        thousands: str | None = None,
        decimal: str = ".",
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ):
        """
        Parse the requested worksheet(s) into DataFrame(s).

        Returns a single DataFrame when ``sheet_name`` is a str or int, and a
        dict keyed by the requested sheet identifiers when it is a list or
        None (None means all sheets).  Extra keyword arguments are forwarded
        to TextParser.  Raises ValueError for an empty sheet list or an
        out-of-range header row.
        """
        validate_header_arg(header)
        validate_integer("nrows", nrows)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        sheets: list[int] | list[str]
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        elif isinstance(sheet_name, str):
            sheets = [sheet_name]
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = cast(Union[List[int], List[str]], list(dict.fromkeys(sheets).keys()))

        output = {}

        last_sheetname = None
        for asheetname in sheets:
            last_sheetname = asheetname
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            # Only read as many file rows as header/skiprows/nrows require.
            file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
            data = self.get_sheet_data(sheet, file_rows_needed)
            if hasattr(sheet, "close"):
                # pyxlsb opens two TemporaryFiles
                sheet.close()
            usecols = maybe_convert_usecols(usecols)

            if not data:
                # Empty sheet -> empty DataFrame.
                output[asheetname] = DataFrame()
                continue

            is_list_header = False
            is_len_one_list_header = False
            if is_list_like(header):
                assert isinstance(header, Sequence)
                is_list_header = True
                if len(header) == 1:
                    is_len_one_list_header = True

            if is_len_one_list_header:
                # A one-element header list behaves like a plain int header.
                header = cast(Sequence[int], header)[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                assert isinstance(header, Sequence)

                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        assert isinstance(skiprows, int)
                        row += skiprows

                    if row > len(data) - 1:
                        raise ValueError(
                            f"header index {row} exceeds maximum index "
                            f"{len(data) - 1} of data.",
                        )

                    data[row], control_row = fill_mi_header(data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            # If there is a MultiIndex header and an index then there is also
            # a row containing just the index name(s)
            has_index_names = False
            if is_list_header and not is_len_one_list_header and index_col is not None:
                index_col_list: Sequence[int]
                if isinstance(index_col, int):
                    index_col_list = [index_col]
                else:
                    assert isinstance(index_col, Sequence)
                    index_col_list = index_col

                # We have to handle mi without names. If any of the entries in the data
                # columns are not empty, this is a regular row
                assert isinstance(header, Sequence)
                if len(header) < len(data):
                    potential_index_names = data[len(header)]
                    potential_data = [
                        x
                        for i, x in enumerate(potential_index_names)
                        if not control_row[i] and i not in index_col_list
                    ]
                    has_index_names = all(x == "" or x is None for x in potential_data)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if header is None:
                    offset = 0
                elif isinstance(header, int):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # GH34673: if MultiIndex names present and not defined in the header,
                # offset needs to be incremented so that forward filling starts
                # from the first MI value instead of the name
                if has_index_names:
                    offset += 1

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    assert isinstance(index_col, Sequence)

                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            # GH 12292 : error when read one empty column from excel file
            try:
                # Delegate type inference, NA handling, date parsing etc. to
                # the shared CSV/text parsing machinery.
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    skip_blank_lines=False,  # GH 39808
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    date_format=date_format,
                    thousands=thousands,
                    decimal=decimal,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    dtype_backend=dtype_backend,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if header_names:
                    output[asheetname].columns = output[asheetname].columns.set_names(
                        header_names
                    )

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

            except Exception as err:
                # Annotate errors with the sheet they came from.
                err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
                raise err

        if last_sheetname is None:
            raise ValueError("Sheet name is an empty list")

        if ret_dict:
            return output
        else:
            return output[last_sheetname]
898
899
900@doc(storage_options=_shared_docs["storage_options"])
901class ExcelWriter(metaclass=abc.ABCMeta):
902 """
903 Class for writing DataFrame objects into excel sheets.
904
905 Default is to use:
906
907 * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter
908 is installed otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__
909 * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files
910
911 See ``DataFrame.to_excel`` for typical usage.
912
913 The writer should be used as a context manager. Otherwise, call `close()` to save
914 and close any opened file handles.
915
916 Parameters
917 ----------
918 path : str or typing.BinaryIO
919 Path to xls or xlsx or ods file.
920 engine : str (optional)
921 Engine to use for writing. If None, defaults to
922 ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
923 argument.
924 date_format : str, default None
925 Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
926 datetime_format : str, default None
927 Format string for datetime objects written into Excel files.
928 (e.g. 'YYYY-MM-DD HH:MM:SS').
929 mode : {{'w', 'a'}}, default 'w'
930 File mode to use (write or append). Append does not work with fsspec URLs.
931 {storage_options}
932
933 .. versionadded:: 1.2.0
934
935 if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error'
936 How to behave when trying to write to a sheet that already
937 exists (append mode only).
938
939 * error: raise a ValueError.
940 * new: Create a new sheet, with a name determined by the engine.
941 * replace: Delete the contents of the sheet before writing to it.
942 * overlay: Write contents to the existing sheet without removing the old
943 contents.
944
945 .. versionadded:: 1.3.0
946
947 .. versionchanged:: 1.4.0
948
949 Added ``overlay`` option
950
951 engine_kwargs : dict, optional
952 Keyword arguments to be passed into the engine. These will be passed to
953 the following functions of the respective engines:
954
955 * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)``
956 * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)``
957 * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)``
958 * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)``
959
960 .. versionadded:: 1.3.0
961
962 Notes
963 -----
964 For compatibility with CSV writers, ExcelWriter serializes lists
965 and dicts to strings before writing.
966
967 Examples
968 --------
969 Default usage:
970
971 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
972 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
973 ... df.to_excel(writer) # doctest: +SKIP
974
975 To write to separate sheets in a single file:
976
977 >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) # doctest: +SKIP
978 >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
979 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
980 ... df1.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP
981 ... df2.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP
982
983 You can set the date format or datetime format:
984
985 >>> from datetime import date, datetime # doctest: +SKIP
986 >>> df = pd.DataFrame(
987 ... [
988 ... [date(2014, 1, 31), date(1999, 9, 24)],
989 ... [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)],
990 ... ],
991 ... index=["Date", "Datetime"],
992 ... columns=["X", "Y"],
993 ... ) # doctest: +SKIP
994 >>> with pd.ExcelWriter(
995 ... "path_to_file.xlsx",
996 ... date_format="YYYY-MM-DD",
997 ... datetime_format="YYYY-MM-DD HH:MM:SS"
998 ... ) as writer:
999 ... df.to_excel(writer) # doctest: +SKIP
1000
1001 You can also append to an existing Excel file:
1002
1003 >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer:
1004 ... df.to_excel(writer, sheet_name="Sheet3") # doctest: +SKIP
1005
1006 Here, the `if_sheet_exists` parameter can be set to replace a sheet if it
1007 already exists:
1008
    >>> with pd.ExcelWriter(
1010 ... "path_to_file.xlsx",
1011 ... mode="a",
1012 ... engine="openpyxl",
1013 ... if_sheet_exists="replace",
1014 ... ) as writer:
1015 ... df.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP
1016
1017 You can also write multiple DataFrames to a single sheet. Note that the
1018 ``if_sheet_exists`` parameter needs to be set to ``overlay``:
1019
    >>> with pd.ExcelWriter("path_to_file.xlsx",
1021 ... mode="a",
1022 ... engine="openpyxl",
1023 ... if_sheet_exists="overlay",
1024 ... ) as writer:
1025 ... df1.to_excel(writer, sheet_name="Sheet1")
1026 ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) # doctest: +SKIP
1027
1028 You can store Excel file in RAM:
1029
1030 >>> import io
1031 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
1032 >>> buffer = io.BytesIO()
1033 >>> with pd.ExcelWriter(buffer) as writer:
1034 ... df.to_excel(writer)
1035
1036 You can pack Excel file into zip archive:
1037
1038 >>> import zipfile # doctest: +SKIP
1039 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
1040 >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf:
1041 ... with zf.open("filename.xlsx", "w") as buffer:
1042 ... with pd.ExcelWriter(buffer) as writer:
1043 ... df.to_excel(writer) # doctest: +SKIP
1044
1045 You can specify additional arguments to the underlying engine:
1046
1047 >>> with pd.ExcelWriter(
1048 ... "path_to_file.xlsx",
1049 ... engine="xlsxwriter",
1050 ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}}
1051 ... ) as writer:
1052 ... df.to_excel(writer) # doctest: +SKIP
1053
1054 In append mode, ``engine_kwargs`` are passed through to
1055 openpyxl's ``load_workbook``:
1056
1057 >>> with pd.ExcelWriter(
1058 ... "path_to_file.xlsx",
1059 ... engine="openpyxl",
1060 ... mode="a",
1061 ... engine_kwargs={{"keep_vba": True}}
1062 ... ) as writer:
1063 ... df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP
1064 """
1065
    # Defining an ExcelWriter implementation (see abstract methods for more...)

    # - Mandatory
    #   - ``_write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
    #     freeze_panes=None)``
    #     --> called to write additional DataFrames to disk
    #   - ``_supported_extensions`` (tuple of supported extensions), used to
    #     check that engine supports the given extension.
    #   - ``_engine`` - string that gives the engine name. Necessary to
    #     instantiate class directly and bypass the ``ExcelWriter.__new__``
    #     engine lookup.
    #   - ``_save(self)`` --> called to save file to disk
    # - Mostly mandatory (i.e. should at least exist)
    #   - book, _cur_sheet, path

    # - Optional:
    #   - ``__init__(self, path, engine=None, **kwargs)`` --> always called
    #     with path as first argument.

    # You also need to register the class with ``register_writer()``.
    # Technically, ExcelWriter implementations don't need to subclass
    # ExcelWriter.
1087
    # Engine name for this concrete writer; also used to instantiate the
    # class directly, bypassing the engine lookup in ``__new__``.
    _engine: str
    # Extensions accepted by this concrete writer (see ``check_extension``).
    _supported_extensions: tuple[str, ...]
1090
1091 def __new__(
1092 cls: type[ExcelWriter],
1093 path: FilePath | WriteExcelBuffer | ExcelWriter,
1094 engine: str | None = None,
1095 date_format: str | None = None,
1096 datetime_format: str | None = None,
1097 mode: str = "w",
1098 storage_options: StorageOptions = None,
1099 if_sheet_exists: Literal["error", "new", "replace", "overlay"] | None = None,
1100 engine_kwargs: dict | None = None,
1101 ) -> ExcelWriter:
1102 # only switch class if generic(ExcelWriter)
1103 if cls is ExcelWriter:
1104 if engine is None or (isinstance(engine, str) and engine == "auto"):
1105 if isinstance(path, str):
1106 ext = os.path.splitext(path)[-1][1:]
1107 else:
1108 ext = "xlsx"
1109
1110 try:
1111 engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
1112 if engine == "auto":
1113 engine = get_default_engine(ext, mode="writer")
1114 except KeyError as err:
1115 raise ValueError(f"No engine for filetype: '{ext}'") from err
1116
1117 # for mypy
1118 assert engine is not None
1119 cls = get_writer(engine)
1120
1121 return object.__new__(cls)
1122
    # Declare external properties you can count on: ``_path`` always exists
    # on instances, defaulting to None until a path is bound.
    _path = None
1125
1126 @property
1127 def supported_extensions(self) -> tuple[str, ...]:
1128 """Extensions that writer engine supports."""
1129 return self._supported_extensions
1130
1131 @property
1132 def engine(self) -> str:
1133 """Name of engine."""
1134 return self._engine
1135
    @property
    @abc.abstractmethod
    def sheets(self) -> dict[str, Any]:
        """
        Mapping of sheet names to sheet objects.

        The sheet-object type depends on the concrete writer engine.
        """
1140
    @property
    @abc.abstractmethod
    def book(self):
        """
        Book instance. Class type will depend on the engine used.

        This attribute can be used to access engine-specific features.
        Implemented by each concrete writer engine.
        """
1149
    @abc.abstractmethod
    def _write_cells(
        self,
        cells,
        sheet_name: str | None = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: tuple[int, int] | None = None,
    ) -> None:
        """
        Write given formatted cells into an Excel sheet.

        Parameters
        ----------
        cells : generator
            cell of formatted data to save to Excel sheet
        sheet_name : str, default None
            Name of Excel sheet, if None, then use self._cur_sheet
        startrow : int, default 0
            upper left cell row to dump data frame
        startcol : int, default 0
            upper left cell column to dump data frame
        freeze_panes : tuple of (int, int), optional
            contains the bottom-most row and right-most column to freeze
        """
1173
    @abc.abstractmethod
    def _save(self) -> None:
        """
        Save workbook to disk.

        Invoked by :meth:`close`; each concrete engine implements the
        actual serialization of the in-memory book.
        """
1179
1180 def __init__(
1181 self,
1182 path: FilePath | WriteExcelBuffer | ExcelWriter,
1183 engine: str | None = None,
1184 date_format: str | None = None,
1185 datetime_format: str | None = None,
1186 mode: str = "w",
1187 storage_options: StorageOptions = None,
1188 if_sheet_exists: str | None = None,
1189 engine_kwargs: dict[str, Any] | None = None,
1190 ) -> None:
1191 # validate that this engine can handle the extension
1192 if isinstance(path, str):
1193 ext = os.path.splitext(path)[-1]
1194 self.check_extension(ext)
1195
1196 # use mode to open the file
1197 if "b" not in mode:
1198 mode += "b"
1199 # use "a" for the user to append data to excel but internally use "r+" to let
1200 # the excel backend first read the existing file and then write any data to it
1201 mode = mode.replace("a", "r+")
1202
1203 if if_sheet_exists not in (None, "error", "new", "replace", "overlay"):
1204 raise ValueError(
1205 f"'{if_sheet_exists}' is not valid for if_sheet_exists. "
1206 "Valid options are 'error', 'new', 'replace' and 'overlay'."
1207 )
1208 if if_sheet_exists and "r+" not in mode:
1209 raise ValueError("if_sheet_exists is only valid in append mode (mode='a')")
1210 if if_sheet_exists is None:
1211 if_sheet_exists = "error"
1212 self._if_sheet_exists = if_sheet_exists
1213
1214 # cast ExcelWriter to avoid adding 'if self._handles is not None'
1215 self._handles = IOHandles(
1216 cast(IO[bytes], path), compression={"compression": None}
1217 )
1218 if not isinstance(path, ExcelWriter):
1219 self._handles = get_handle(
1220 path, mode, storage_options=storage_options, is_text=False
1221 )
1222 self._cur_sheet = None
1223
1224 if date_format is None:
1225 self._date_format = "YYYY-MM-DD"
1226 else:
1227 self._date_format = date_format
1228 if datetime_format is None:
1229 self._datetime_format = "YYYY-MM-DD HH:MM:SS"
1230 else:
1231 self._datetime_format = datetime_format
1232
1233 self._mode = mode
1234
    @property
    def date_format(self) -> str:
        """
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
        """
        return self._date_format
1241
    @property
    def datetime_format(self) -> str:
        """
        Format string for datetimes written into Excel files
        (e.g. 'YYYY-MM-DD HH:MM:SS').
        """
        return self._datetime_format
1248
1249 @property
1250 def if_sheet_exists(self) -> str:
1251 """
1252 How to behave when writing to a sheet that already exists in append mode.
1253 """
1254 return self._if_sheet_exists
1255
1256 def __fspath__(self) -> str:
1257 return getattr(self._handles.handle, "name", "")
1258
1259 def _get_sheet_name(self, sheet_name: str | None) -> str:
1260 if sheet_name is None:
1261 sheet_name = self._cur_sheet
1262 if sheet_name is None: # pragma: no cover
1263 raise ValueError("Must pass explicit sheet_name or set _cur_sheet property")
1264 return sheet_name
1265
1266 def _value_with_fmt(self, val) -> tuple[object, str | None]:
1267 """
1268 Convert numpy types to Python types for the Excel writers.
1269
1270 Parameters
1271 ----------
1272 val : object
1273 Value to be written into cells
1274
1275 Returns
1276 -------
1277 Tuple with the first element being the converted value and the second
1278 being an optional format
1279 """
1280 fmt = None
1281
1282 if is_integer(val):
1283 val = int(val)
1284 elif is_float(val):
1285 val = float(val)
1286 elif is_bool(val):
1287 val = bool(val)
1288 elif isinstance(val, datetime.datetime):
1289 fmt = self._datetime_format
1290 elif isinstance(val, datetime.date):
1291 fmt = self._date_format
1292 elif isinstance(val, datetime.timedelta):
1293 val = val.total_seconds() / 86400
1294 fmt = "0"
1295 else:
1296 val = str(val)
1297
1298 return val, fmt
1299
1300 @classmethod
1301 def check_extension(cls, ext: str) -> Literal[True]:
1302 """
1303 checks that path's extension against the Writer's supported
1304 extensions. If it isn't supported, raises UnsupportedFiletypeError.
1305 """
1306 if ext.startswith("."):
1307 ext = ext[1:]
1308 if not any(ext in extension for extension in cls._supported_extensions):
1309 raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
1310 return True
1311
1312 # Allow use as a contextmanager
1313 def __enter__(self) -> ExcelWriter:
1314 return self
1315
1316 def __exit__(
1317 self,
1318 exc_type: type[BaseException] | None,
1319 exc_value: BaseException | None,
1320 traceback: TracebackType | None,
1321 ) -> None:
1322 self.close()
1323
1324 def close(self) -> None:
1325 """synonym for save, to make it more file-like"""
1326 self._save()
1327 self._handles.close()
1328
1329
# Magic-number signatures used to sniff the on-disk format.
XLS_SIGNATURES = (
    b"\x09\x00\x04\x00\x07\x00\x10\x00",  # BIFF2
    b"\x09\x02\x06\x00\x00\x00\x10\x00",  # BIFF3
    b"\x09\x04\x06\x00\x00\x00\x10\x00",  # BIFF4
    b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",  # Compound File Binary
)
ZIP_SIGNATURE = b"PK\x03\x04"
# Longest signature; the number of bytes peeked from the stream.
PEEK_SIZE = max(len(sig) for sig in XLS_SIGNATURES + (ZIP_SIGNATURE,))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    content_or_path: FilePath | ReadBuffer[bytes],
    storage_options: StorageOptions = None,
) -> str | None:
    """
    Inspect the path or content of an excel file and get its format.

    Adopted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    content_or_path : str or file-like object
        Path to file or content of file to inspect. May be a URL.
    {storage_options}

    Returns
    -------
    str or None
        Format of file if it can be determined.

    Raises
    ------
    ValueError
        If resulting stream is empty.
    BadZipFile
        If resulting stream does not have an XLS signature and is not a valid zipfile.
    """
    # Raw bytes are wrapped so get_handle sees a readable binary buffer.
    if isinstance(content_or_path, bytes):
        content_or_path = BytesIO(content_or_path)

    with get_handle(
        content_or_path, "rb", storage_options=storage_options, is_text=False
    ) as handle:
        stream = handle.handle
        stream.seek(0)
        peek = stream.read(PEEK_SIZE)
        if peek is None:
            raise ValueError("stream is empty")
        assert isinstance(peek, bytes)
        stream.seek(0)

        # Old-style binary .xls is identified purely by its signature.
        if peek.startswith(XLS_SIGNATURES):
            return "xls"
        if not peek.startswith(ZIP_SIGNATURE):
            return None

        # Everything else of interest is a zip container; distinguish
        # xlsx / xlsb / ods from a plain zip by the member names.
        with zipfile.ZipFile(stream) as zf:
            # Workaround for some third party files that use forward
            # slashes and lower case names.
            component_names = {
                name.replace("\\", "/").lower() for name in zf.namelist()
            }

        if "xl/workbook.xml" in component_names:
            return "xlsx"
        if "xl/workbook.bin" in component_names:
            return "xlsb"
        if "content.xml" in component_names:
            return "ods"
        return "zip"
1402
1403
class ExcelFile:
    """
    Class for parsing tabular Excel sheets into DataFrame objects.

    See read_excel for more documentation.

    Parameters
    ----------
    path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
        A file-like object, xlrd workbook or openpyxl workbook.
        If a string or path object, expected to be a path to a
        .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
    engine : str, default None
        If io is not a buffer or path, this must be set to identify io.
        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
        Engine compatibility :

        - ``xlrd`` supports old-style Excel files (.xls).
        - ``openpyxl`` supports newer Excel file formats.
        - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
        - ``pyxlsb`` supports Binary Excel files.

        .. versionchanged:: 1.2.0

           The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
           now only supports old-style ``.xls`` files.
           When ``engine=None``, the following logic will be
           used to determine the engine:

           - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
             then `odf <https://pypi.org/project/odfpy/>`_ will be used.
           - Otherwise if ``path_or_buffer`` is an xls format,
             ``xlrd`` will be used.
           - Otherwise if ``path_or_buffer`` is in xlsb format,
             `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.

           .. versionadded:: 1.3.0

           - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
             then ``openpyxl`` will be used.
           - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.

           .. warning::

            Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
            This is not supported, switch to using ``openpyxl`` instead.
    """

    from pandas.io.excel._odfreader import ODFReader
    from pandas.io.excel._openpyxl import OpenpyxlReader
    from pandas.io.excel._pyxlsb import PyxlsbReader
    from pandas.io.excel._xlrd import XlrdReader

    # Registry of reader engines; ``engine`` arguments are validated and
    # dispatched against these keys.
    _engines: Mapping[str, Any] = {
        "xlrd": XlrdReader,
        "openpyxl": OpenpyxlReader,
        "odf": ODFReader,
        "pyxlsb": PyxlsbReader,
    }

    def __init__(
        self,
        path_or_buffer,
        engine: str | None = None,
        storage_options: StorageOptions = None,
    ) -> None:
        """
        Open ``path_or_buffer`` with the requested (or inferred) reader engine.

        Raises
        ------
        ValueError
            If ``engine`` is unknown, or the file format cannot be
            determined when ``engine`` is None.
        """
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # First argument can also be bytes, so create a buffer
        if isinstance(path_or_buffer, bytes):
            path_or_buffer = BytesIO(path_or_buffer)

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Path-like objects are converted to str by stringify_path; other
        # buffers pass through unchanged.
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if import_optional_dependency("xlrd", errors="ignore") is None:
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = Version(get_version(xlrd))

        if engine is None:
            # Only determine ext if it is needed
            ext: str | None
            if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
                # An already-open xlrd workbook is necessarily old-style xls.
                ext = "xls"
            else:
                # Sniff the format from the stream's magic bytes.
                ext = inspect_excel_format(
                    content_or_path=path_or_buffer, storage_options=storage_options
                )
                if ext is None:
                    raise ValueError(
                        "Excel file format cannot be determined, you must specify "
                        "an engine manually."
                    )

            # Map the detected format to the configured reader engine.
            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        assert engine is not None
        self.engine = engine
        self.storage_options = storage_options

        self._reader = self._engines[engine](self._io, storage_options=storage_options)

    def __fspath__(self):
        # os.PathLike support; returns the normalized path (or the buffer
        # itself for non-path inputs).
        return self._io

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names=None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        converters=None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: str | dict[Hashable, str] | None = None,
        thousands: str | None = None,
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
        """
        Parse specified sheet(s) into a DataFrame.

        Equivalent to read_excel(ExcelFile, ...)  See the read_excel
        docstring for more info on accepted parameters.

        Returns
        -------
        DataFrame or dict of DataFrames
            DataFrame from the passed in Excel file.
        """
        # All parsing is delegated to the engine-specific reader.
        return self._reader.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
            **kwds,
        )

    @property
    def book(self):
        """Workbook object of the underlying reader engine."""
        return self._reader.book

    @property
    def sheet_names(self):
        """Sheet names as reported by the underlying reader engine."""
        return self._reader.sheet_names

    def close(self) -> None:
        """close io if necessary"""
        self._reader.close()

    def __enter__(self) -> ExcelFile:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        # Ensure the reader's handles are released on context exit.
        self.close()