1from __future__ import annotations
2
3from collections.abc import (
4 Hashable,
5 Iterable,
6 Mapping,
7 Sequence,
8)
9import datetime
10from functools import partial
11from io import BytesIO
12import os
13from textwrap import fill
14from typing import (
15 IO,
16 TYPE_CHECKING,
17 Any,
18 Callable,
19 Generic,
20 Literal,
21 TypeVar,
22 Union,
23 cast,
24 overload,
25)
26import warnings
27import zipfile
28
29from pandas._config import config
30
31from pandas._libs import lib
32from pandas._libs.parsers import STR_NA_VALUES
33from pandas.compat._optional import (
34 get_version,
35 import_optional_dependency,
36)
37from pandas.errors import EmptyDataError
38from pandas.util._decorators import (
39 Appender,
40 doc,
41)
42from pandas.util._exceptions import find_stack_level
43from pandas.util._validators import check_dtype_backend
44
45from pandas.core.dtypes.common import (
46 is_bool,
47 is_float,
48 is_integer,
49 is_list_like,
50)
51
52from pandas.core.frame import DataFrame
53from pandas.core.shared_docs import _shared_docs
54from pandas.util.version import Version
55
56from pandas.io.common import (
57 IOHandles,
58 get_handle,
59 stringify_path,
60 validate_header_arg,
61)
62from pandas.io.excel._util import (
63 fill_mi_header,
64 get_default_engine,
65 get_writer,
66 maybe_convert_usecols,
67 pop_header_name,
68)
69from pandas.io.parsers import TextParser
70from pandas.io.parsers.readers import validate_integer
71
72if TYPE_CHECKING:
73 from types import TracebackType
74
75 from pandas._typing import (
76 DtypeArg,
77 DtypeBackend,
78 ExcelWriterIfSheetExists,
79 FilePath,
80 IntStrT,
81 ReadBuffer,
82 Self,
83 SequenceNotStr,
84 StorageOptions,
85 WriteExcelBuffer,
86 )
87_read_excel_doc = (
88 """
89Read an Excel file into a ``pandas`` ``DataFrame``.
90
91Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
92read from a local filesystem or URL. Supports an option to read
93a single sheet or a list of sheets.
94
95Parameters
96----------
97io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
98 Any valid string path is acceptable. The string could be a URL. Valid
99 URL schemes include http, ftp, s3, and file. For file URLs, a host is
100 expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
101
102 If you want to pass in a path object, pandas accepts any ``os.PathLike``.
103
104 By file-like object, we refer to objects with a ``read()`` method,
105 such as a file handle (e.g. via builtin ``open`` function)
106 or ``StringIO``.
107
108 .. deprecated:: 2.1.0
109 Passing byte strings is deprecated. To read from a
110 byte string, wrap it in a ``BytesIO`` object.
111sheet_name : str, int, list, or None, default 0
112 Strings are used for sheet names. Integers are used in zero-indexed
113 sheet positions (chart sheets do not count as a sheet position).
114 Lists of strings/integers are used to request multiple sheets.
115 Specify ``None`` to get all worksheets.
116
117 Available cases:
118
119 * Defaults to ``0``: 1st sheet as a `DataFrame`
120 * ``1``: 2nd sheet as a `DataFrame`
121 * ``"Sheet1"``: Load sheet with name "Sheet1"
122 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
123 as a dict of `DataFrame`
124 * ``None``: All worksheets.
125
126header : int, list of int, default 0
127 Row (0-indexed) to use for the column labels of the parsed
128 DataFrame. If a list of integers is passed those row positions will
129 be combined into a ``MultiIndex``. Use None if there is no header.
130names : array-like, default None
131 List of column names to use. If file contains no header row,
132 then you should explicitly pass header=None.
133index_col : int, str, list of int, default None
134 Column (0-indexed) to use as the row labels of the DataFrame.
135 Pass None if there is no such column. If a list is passed,
136 those columns will be combined into a ``MultiIndex``. If a
137 subset of data is selected with ``usecols``, index_col
138 is based on the subset.
139
140 Missing values will be forward filled to allow roundtripping with
141 ``to_excel`` for ``merged_cells=True``. To avoid forward filling the
142 missing values use ``set_index`` after reading the data instead of
143 ``index_col``.
144usecols : str, list-like, or callable, default None
145 * If None, then parse all columns.
146 * If str, then indicates comma separated list of Excel column letters
147 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
148 both sides.
149 * If list of int, then indicates list of column numbers to be parsed
150 (0-indexed).
151 * If list of string, then indicates list of column names to be parsed.
152 * If callable, then evaluate each column name against it and parse the
153 column if the callable returns ``True``.
154
155 Returns a subset of the columns according to behavior above.
156dtype : Type name or dict of column -> type, default None
157 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
158 Use ``object`` to preserve data as stored in Excel and not interpret dtype,
159 which will necessarily result in ``object`` dtype.
160 If converters are specified, they will be applied INSTEAD
161 of dtype conversion.
162 If you use ``None``, it will infer the dtype of each column based on the data.
163engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None
164 If io is not a buffer or path, this must be set to identify io.
165 Engine compatibility :
166
167 - ``openpyxl`` supports newer Excel file formats.
168 - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb)
169 and OpenDocument (.ods) file formats.
170 - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
171 - ``pyxlsb`` supports Binary Excel files.
172 - ``xlrd`` supports old-style Excel files (.xls).
173
174 When ``engine=None``, the following logic will be used to determine the engine:
175
176 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
177 then `odf <https://pypi.org/project/odfpy/>`_ will be used.
178 - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used.
179 - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used.
180 - Otherwise ``openpyxl`` will be used.
181converters : dict, default None
182 Dict of functions for converting values in certain columns. Keys can
183 either be integers or column labels, values are functions that take one
184 input argument, the Excel cell content, and return the transformed
185 content.
186true_values : list, default None
187 Values to consider as True.
188false_values : list, default None
189 Values to consider as False.
190skiprows : list-like, int, or callable, optional
191 Line numbers to skip (0-indexed) or number of lines to skip (int) at the
192 start of the file. If callable, the callable function will be evaluated
193 against the row indices, returning True if the row should be skipped and
194 False otherwise. An example of a valid callable argument would be ``lambda
195 x: x in [0, 2]``.
196nrows : int, default None
197 Number of rows to parse.
198na_values : scalar, str, list-like, or dict, default None
199 Additional strings to recognize as NA/NaN. If dict passed, specific
200 per-column NA values. By default the following values are interpreted
201 as NaN: '"""
202 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
203 + """'.
204keep_default_na : bool, default True
205 Whether or not to include the default NaN values when parsing the data.
206 Depending on whether ``na_values`` is passed in, the behavior is as follows:
207
208 * If ``keep_default_na`` is True, and ``na_values`` are specified,
209 ``na_values`` is appended to the default NaN values used for parsing.
210 * If ``keep_default_na`` is True, and ``na_values`` are not specified, only
211 the default NaN values are used for parsing.
212 * If ``keep_default_na`` is False, and ``na_values`` are specified, only
213 the NaN values specified ``na_values`` are used for parsing.
214 * If ``keep_default_na`` is False, and ``na_values`` are not specified, no
215 strings will be parsed as NaN.
216
217 Note that if `na_filter` is passed in as False, the ``keep_default_na`` and
218 ``na_values`` parameters will be ignored.
219na_filter : bool, default True
220 Detect missing value markers (empty strings and the value of na_values). In
221 data without any NAs, passing ``na_filter=False`` can improve the
222 performance of reading a large file.
223verbose : bool, default False
224 Indicate number of NA values placed in non-numeric columns.
225parse_dates : bool, list-like, or dict, default False
226 The behavior is as follows:
227
228 * ``bool``. If True -> try parsing the index.
229 * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
230 each as a separate date column.
231 * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
232 a single date column.
233 * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
234 result 'foo'
235
236 If a column or index contains an unparsable date, the entire column or
    index will be returned unaltered as an object data type. If you don't want to
238 parse some cells as date just change their type in Excel to "Text".
239 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
240
241 Note: A fast-path exists for iso8601-formatted dates.
242date_parser : function, optional
243 Function to use for converting a sequence of string columns to an array of
244 datetime instances. The default uses ``dateutil.parser.parser`` to do the
245 conversion. Pandas will try to call `date_parser` in three different ways,
246 advancing to the next if an exception occurs: 1) Pass one or more arrays
247 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
248 string values from the columns defined by `parse_dates` into a single array
249 and pass that; and 3) call `date_parser` once for each row using one or
250 more strings (corresponding to the columns defined by `parse_dates`) as
251 arguments.
252
253 .. deprecated:: 2.0.0
254 Use ``date_format`` instead, or read in as ``object`` and then apply
255 :func:`to_datetime` as-needed.
256date_format : str or dict of column -> format, default ``None``
257 If used in conjunction with ``parse_dates``, will parse dates according to this
258 format. For anything more complex,
259 please read in as ``object`` and then apply :func:`to_datetime` as-needed.
260
261 .. versionadded:: 2.0.0
262thousands : str, default None
263 Thousands separator for parsing string columns to numeric. Note that
264 this parameter is only necessary for columns stored as TEXT in Excel,
265 any numeric columns will automatically be parsed, regardless of display
266 format.
267decimal : str, default '.'
268 Character to recognize as decimal point for parsing string columns to numeric.
269 Note that this parameter is only necessary for columns stored as TEXT in Excel,
270 any numeric columns will automatically be parsed, regardless of display
    format (e.g. use ',' for European data).
272
273 .. versionadded:: 1.4.0
274
275comment : str, default None
276 Comments out remainder of line. Pass a character or characters to this
277 argument to indicate comments in the input file. Any data between the
278 comment string and the end of the current line is ignored.
279skipfooter : int, default 0
280 Rows at the end to skip (0-indexed).
281{storage_options}
282
283dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
284 Back-end data type applied to the resultant :class:`DataFrame`
285 (still experimental). Behaviour is as follows:
286
287 * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
288 (default).
289 * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
290 DataFrame.
291
292 .. versionadded:: 2.0
293
294engine_kwargs : dict, optional
295 Arbitrary keyword arguments passed to excel engine.
296
297Returns
298-------
299DataFrame or dict of DataFrames
300 DataFrame from the passed in Excel file. See notes in sheet_name
301 argument for more information on when a dict of DataFrames is returned.
302
303See Also
304--------
305DataFrame.to_excel : Write DataFrame to an Excel file.
306DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
307read_csv : Read a comma-separated values (csv) file into DataFrame.
308read_fwf : Read a table of fixed-width formatted lines into DataFrame.
309
310Notes
311-----
312For specific information on the methods used for each Excel engine, refer to the pandas
313:ref:`user guide <io.excel_reader>`
314
315Examples
316--------
317The file can be read using the file name as string or an open file object:
318
319>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
320 Name Value
3210 string1 1
3221 string2 2
3232 #Comment 3
324
325>>> pd.read_excel(open('tmp.xlsx', 'rb'),
326... sheet_name='Sheet3') # doctest: +SKIP
327 Unnamed: 0 Name Value
3280 0 string1 1
3291 1 string2 2
3302 2 #Comment 3
331
332Index and header can be specified via the `index_col` and `header` arguments
333
334>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
335 0 1 2
3360 NaN Name Value
3371 0.0 string1 1
3382 1.0 string2 2
3393 2.0 #Comment 3
340
341Column types are inferred but can be explicitly specified
342
343>>> pd.read_excel('tmp.xlsx', index_col=0,
344... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP
345 Name Value
3460 string1 1.0
3471 string2 2.0
3482 #Comment 3.0
349
350True, False, and NA values, and thousands separators have defaults,
351but can be explicitly specified, too. Supply the values you would like
352as strings or lists of strings!
353
354>>> pd.read_excel('tmp.xlsx', index_col=0,
355... na_values=['string1', 'string2']) # doctest: +SKIP
356 Name Value
3570 NaN 1
3581 NaN 2
3592 #Comment 3
360
361Comment lines in the excel input file can be skipped using the
362``comment`` kwarg.
363
364>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
365 Name Value
3660 string1 1.0
3671 string2 2.0
3682 None NaN
369"""
370)
371
372
# Typing overload: a scalar ``sheet_name`` (str or int) selects exactly one
# sheet, so the return type narrows to a single DataFrame.
@overload
def read_excel(
    io,
    # sheet name is str or int -> DataFrame
    sheet_name: str | int = ...,
    *,
    header: int | Sequence[int] | None = ...,
    names: SequenceNotStr[Hashable] | range | None = ...,
    index_col: int | str | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...
410
411
# Typing overload: a list of sheet identifiers, or None (all sheets), yields a
# dict mapping each sheet name/index to its DataFrame.
@overload
def read_excel(
    io,
    # sheet name is list or None -> dict[IntStrT, DataFrame]
    sheet_name: list[IntStrT] | None,
    *,
    header: int | Sequence[int] | None = ...,
    names: SequenceNotStr[Hashable] | range | None = ...,
    index_col: int | str | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> dict[IntStrT, DataFrame]:
    ...
449
450
@doc(storage_options=_shared_docs["storage_options"])
@Appender(_read_excel_doc)
def read_excel(
    io,
    sheet_name: str | int | list[IntStrT] | None = 0,
    *,
    header: int | Sequence[int] | None = 0,
    names: SequenceNotStr[Hashable] | range | None = None,
    index_col: int | str | Sequence[int] | None = None,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = None,
    dtype: DtypeArg | None = None,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None,
    converters: dict[str, Callable] | dict[int, Callable] | None = None,
    true_values: Iterable[Hashable] | None = None,
    false_values: Iterable[Hashable] | None = None,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
    nrows: int | None = None,
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    parse_dates: list | dict | bool = False,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: dict[Hashable, str] | str | None = None,
    thousands: str | None = None,
    decimal: str = ".",
    comment: str | None = None,
    skipfooter: int = 0,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    engine_kwargs: dict | None = None,
) -> DataFrame | dict[IntStrT, DataFrame]:
    # Validate the requested dtype backend before doing any I/O work.
    check_dtype_backend(dtype_backend)

    engine_kwargs = {} if engine_kwargs is None else engine_kwargs

    if isinstance(io, ExcelFile):
        # Caller owns the ExcelFile: never close it here, and reject a
        # conflicting engine request since the engine is already fixed.
        if engine and engine != io.engine:
            raise ValueError(
                "Engine should not be specified when passing "
                "an ExcelFile - ExcelFile already has the engine set"
            )
        owns_handle = False
    else:
        # We opened the file ourselves, so we are responsible for closing it.
        io = ExcelFile(
            io,
            storage_options=storage_options,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )
        owns_handle = True

    try:
        data = io.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            dtype=dtype,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            keep_default_na=keep_default_na,
            na_filter=na_filter,
            verbose=verbose,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            decimal=decimal,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
        )
    finally:
        # Close the handle we opened, whether parsing succeeded or raised.
        if owns_handle:
            io.close()
    return data
538
539
# Type variable for the engine-specific workbook object that BaseExcelReader
# subclasses wrap (e.g. an openpyxl Workbook or an xlrd Book).
_WorkbookT = TypeVar("_WorkbookT")
541
542
class BaseExcelReader(Generic[_WorkbookT]):
    """
    Base class for engine-specific Excel readers.

    Handles IO-handle management and the engine-agnostic parsing logic in
    :meth:`parse`; subclasses supply the workbook-specific hooks
    (``load_workbook``, ``get_sheet_by_name``, ``get_sheet_by_index``,
    ``get_sheet_data``, ``sheet_names``).
    """

    # The engine-specific workbook object; assigned in __init__.
    book: _WorkbookT

    def __init__(
        self,
        filepath_or_buffer,
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        """
        Open ``filepath_or_buffer`` and load the engine-specific workbook.

        Parameters
        ----------
        filepath_or_buffer : str, path object, bytes, file-like, or workbook
            Source to read; bytes are wrapped in a ``BytesIO`` first.
        storage_options : StorageOptions, optional
            Forwarded to ``get_handle`` when a handle must be opened.
        engine_kwargs : dict, optional
            Forwarded to the engine's ``load_workbook`` hook.

        Raises
        ------
        ValueError
            If ``filepath_or_buffer`` is neither a workbook instance nor
            something with a ``read`` method.
        """
        if engine_kwargs is None:
            engine_kwargs = {}

        # First argument can also be bytes, so create a buffer
        if isinstance(filepath_or_buffer, bytes):
            filepath_or_buffer = BytesIO(filepath_or_buffer)

        self.handles = IOHandles(
            handle=filepath_or_buffer, compression={"method": None}
        )
        if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            self.handles = get_handle(
                filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
            )

        if isinstance(self.handles.handle, self._workbook_class):
            # Caller passed an already-loaded workbook; adopt it as-is.
            self.book = self.handles.handle
        elif hasattr(self.handles.handle, "read"):
            # N.B. xlrd.Book has a read attribute too
            self.handles.handle.seek(0)
            try:
                self.book = self.load_workbook(self.handles.handle, engine_kwargs)
            except Exception:
                # Don't leak the handle we opened if workbook loading fails.
                self.close()
                raise
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )

    @property
    def _workbook_class(self) -> type[_WorkbookT]:
        """The engine's workbook class; used for isinstance checks in __init__."""
        raise NotImplementedError

    def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT:
        """Load and return the engine-specific workbook (subclass hook)."""
        raise NotImplementedError

    def close(self) -> None:
        """Close the workbook (engine permitting) and any opened IO handles."""
        if hasattr(self, "book"):
            if hasattr(self.book, "close"):
                # pyxlsb: opens a TemporaryFile
                # openpyxl: https://stackoverflow.com/questions/31416842/
                # openpyxl-does-not-close-excel-workbook-in-read-only-mode
                self.book.close()
            elif hasattr(self.book, "release_resources"):
                # xlrd
                # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548
                self.book.release_resources()
        self.handles.close()

    @property
    def sheet_names(self) -> list[str]:
        """Names of all worksheets in the workbook (subclass hook)."""
        raise NotImplementedError

    def get_sheet_by_name(self, name: str):
        """Return the engine-specific sheet object named ``name`` (subclass hook)."""
        raise NotImplementedError

    def get_sheet_by_index(self, index: int):
        """Return the engine-specific sheet object at ``index`` (subclass hook)."""
        raise NotImplementedError

    def get_sheet_data(self, sheet, rows: int | None = None):
        """Return sheet contents as a list of rows, at most ``rows`` (subclass hook)."""
        raise NotImplementedError

    def raise_if_bad_sheet_by_index(self, index: int) -> None:
        """Raise ValueError if ``index`` is out of range for this workbook."""
        n_sheets = len(self.sheet_names)
        if index >= n_sheets:
            raise ValueError(
                f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
            )

    def raise_if_bad_sheet_by_name(self, name: str) -> None:
        """Raise ValueError if no worksheet named ``name`` exists."""
        if name not in self.sheet_names:
            raise ValueError(f"Worksheet named '{name}' not found")

    def _check_skiprows_func(
        self,
        skiprows: Callable,
        rows_to_use: int,
    ) -> int:
        """
        Determine how many file rows are required to obtain `nrows` data
        rows when `skiprows` is a function.

        Parameters
        ----------
        skiprows : function
            The function passed to read_excel by the user.
        rows_to_use : int
            The number of rows that will be needed for the header and
            the data.

        Returns
        -------
        int
        """
        # Count file rows until rows_to_use non-skipped rows have been seen.
        # NOTE(review): assumes skiprows eventually returns False often
        # enough; a function that skips everything would loop forever.
        i = 0
        rows_used_so_far = 0
        while rows_used_so_far < rows_to_use:
            if not skiprows(i):
                rows_used_so_far += 1
            i += 1
        return i

    def _calc_rows(
        self,
        header: int | Sequence[int] | None,
        index_col: int | Sequence[int] | None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None,
        nrows: int | None,
    ) -> int | None:
        """
        If nrows specified, find the number of rows needed from the
        file, otherwise return None.


        Parameters
        ----------
        header : int, list of int, or None
            See read_excel docstring.
        index_col : int, str, list of int, or None
            See read_excel docstring.
        skiprows : list-like, int, callable, or None
            See read_excel docstring.
        nrows : int or None
            See read_excel docstring.

        Returns
        -------
        int or None
        """
        if nrows is None:
            return None
        if header is None:
            header_rows = 1
        elif is_integer(header):
            header = cast(int, header)
            header_rows = 1 + header
        else:
            header = cast(Sequence, header)
            header_rows = 1 + header[-1]
        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        if is_list_like(header) and index_col is not None:
            header = cast(Sequence, header)
            if len(header) > 1:
                header_rows += 1
        if skiprows is None:
            return header_rows + nrows
        if is_integer(skiprows):
            skiprows = cast(int, skiprows)
            return header_rows + nrows + skiprows
        if is_list_like(skiprows):

            def f(skiprows: Sequence, x: int) -> bool:
                # Membership test makes the list-like behave like a callable.
                return x in skiprows

            skiprows = cast(Sequence, skiprows)
            return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
        if callable(skiprows):
            return self._check_skiprows_func(
                skiprows,
                header_rows + nrows,
            )
        # else unexpected skiprows type: read_excel will not optimize
        # the number of rows read from file
        return None

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names: SequenceNotStr[Hashable] | range | None = None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        dtype: DtypeArg | None = None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        verbose: bool = False,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: dict[Hashable, str] | str | None = None,
        thousands: str | None = None,
        decimal: str = ".",
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ):
        """
        Parse one or more worksheets into DataFrames.

        Returns a single DataFrame when ``sheet_name`` is a str or int, or a
        dict keyed by sheet name/index when it is a list or None (all sheets).
        See ``read_excel`` for the semantics of the remaining parameters;
        extra ``**kwds`` are forwarded to ``TextParser``.
        """
        validate_header_arg(header)
        validate_integer("nrows", nrows)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        sheets: list[int] | list[str]
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        elif isinstance(sheet_name, str):
            sheets = [sheet_name]
        else:
            # NOTE(review): identical to the str branch; any remaining scalar
            # (assumed int) also selects a single sheet.
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = cast(Union[list[int], list[str]], list(dict.fromkeys(sheets).keys()))

        output = {}

        last_sheetname = None
        for asheetname in sheets:
            last_sheetname = asheetname
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            # Limit how much of the sheet is materialized when nrows allows it.
            file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
            data = self.get_sheet_data(sheet, file_rows_needed)
            if hasattr(sheet, "close"):
                # pyxlsb opens two TemporaryFiles
                sheet.close()
            usecols = maybe_convert_usecols(usecols)

            if not data:
                # Empty sheet -> empty DataFrame, skip the parsing machinery.
                output[asheetname] = DataFrame()
                continue

            is_list_header = False
            is_len_one_list_header = False
            if is_list_like(header):
                assert isinstance(header, Sequence)
                is_list_header = True
                if len(header) == 1:
                    is_len_one_list_header = True

            if is_len_one_list_header:
                # A one-element header list behaves like a scalar header.
                header = cast(Sequence[int], header)[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                assert isinstance(header, Sequence)

                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        assert isinstance(skiprows, int)
                        row += skiprows

                    if row > len(data) - 1:
                        raise ValueError(
                            f"header index {row} exceeds maximum index "
                            f"{len(data) - 1} of data.",
                        )

                    data[row], control_row = fill_mi_header(data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            # If there is a MultiIndex header and an index then there is also
            # a row containing just the index name(s)
            has_index_names = False
            if is_list_header and not is_len_one_list_header and index_col is not None:
                index_col_list: Sequence[int]
                if isinstance(index_col, int):
                    index_col_list = [index_col]
                else:
                    assert isinstance(index_col, Sequence)
                    index_col_list = index_col

                # We have to handle mi without names. If any of the entries in the data
                # columns are not empty, this is a regular row
                assert isinstance(header, Sequence)
                if len(header) < len(data):
                    potential_index_names = data[len(header)]
                    potential_data = [
                        x
                        for i, x in enumerate(potential_index_names)
                        if not control_row[i] and i not in index_col_list
                    ]
                    has_index_names = all(x == "" or x is None for x in potential_data)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if header is None:
                    offset = 0
                elif isinstance(header, int):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # GH34673: if MultiIndex names present and not defined in the header,
                # offset needs to be incremented so that forward filling starts
                # from the first MI value instead of the name
                if has_index_names:
                    offset += 1

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    assert isinstance(index_col, Sequence)

                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    skip_blank_lines=False,  # GH 39808
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    date_format=date_format,
                    thousands=thousands,
                    decimal=decimal,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    dtype_backend=dtype_backend,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if header_names:
                    output[asheetname].columns = output[asheetname].columns.set_names(
                        header_names
                    )

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

            except Exception as err:
                # Annotate any parsing failure with the offending sheet name.
                err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
                raise err

        if last_sheetname is None:
            raise ValueError("Sheet name is an empty list")

        if ret_dict:
            return output
        else:
            return output[last_sheetname]
925
926
@doc(storage_options=_shared_docs["storage_options"])
class ExcelWriter(Generic[_WorkbookT]):
    """
    Class for writing DataFrame objects into excel sheets.

    Default is to use:

    * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter
      is installed otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__
    * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files

    See ``DataFrame.to_excel`` for typical usage.

    The writer should be used as a context manager. Otherwise, call `close()` to save
    and close any opened file handles.

    Parameters
    ----------
    path : str or typing.BinaryIO
        Path to xls or xlsx or ods file.
    engine : str (optional)
        Engine to use for writing. If None, defaults to
        ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
        argument.
    date_format : str, default None
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
    datetime_format : str, default None
        Format string for datetime objects written into Excel files.
        (e.g. 'YYYY-MM-DD HH:MM:SS').
    mode : {{'w', 'a'}}, default 'w'
        File mode to use (write or append). Append does not work with fsspec URLs.
    {storage_options}

    if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error'
        How to behave when trying to write to a sheet that already
        exists (append mode only).

        * error: raise a ValueError.
        * new: Create a new sheet, with a name determined by the engine.
        * replace: Delete the contents of the sheet before writing to it.
        * overlay: Write contents to the existing sheet without first removing,
          but possibly over top of, the existing contents.

        .. versionadded:: 1.3.0

        .. versionchanged:: 1.4.0

           Added ``overlay`` option

    engine_kwargs : dict, optional
        Keyword arguments to be passed into the engine. These will be passed to
        the following functions of the respective engines:

        * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)``
        * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)``
        * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)``
        * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)``

        .. versionadded:: 1.3.0

    Notes
    -----
    For compatibility with CSV writers, ExcelWriter serializes lists
    and dicts to strings before writing.

    Examples
    --------
    Default usage:

    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    To write to separate sheets in a single file:

    >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"])  # doctest: +SKIP
    >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
    ...     df1.to_excel(writer, sheet_name="Sheet1")  # doctest: +SKIP
    ...     df2.to_excel(writer, sheet_name="Sheet2")  # doctest: +SKIP

    You can set the date format or datetime format:

    >>> from datetime import date, datetime  # doctest: +SKIP
    >>> df = pd.DataFrame(
    ...     [
    ...         [date(2014, 1, 31), date(1999, 9, 24)],
    ...         [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)],
    ...     ],
    ...     index=["Date", "Datetime"],
    ...     columns=["X", "Y"],
    ... )  # doctest: +SKIP
    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     date_format="YYYY-MM-DD",
    ...     datetime_format="YYYY-MM-DD HH:MM:SS"
    ... ) as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    You can also append to an existing Excel file:

    >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer:
    ...     df.to_excel(writer, sheet_name="Sheet3")  # doctest: +SKIP

    Here, the `if_sheet_exists` parameter can be set to replace a sheet if it
    already exists:

    >>> with ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     mode="a",
    ...     engine="openpyxl",
    ...     if_sheet_exists="replace",
    ... ) as writer:
    ...     df.to_excel(writer, sheet_name="Sheet1")  # doctest: +SKIP

    You can also write multiple DataFrames to a single sheet. Note that the
    ``if_sheet_exists`` parameter needs to be set to ``overlay``:

    >>> with ExcelWriter("path_to_file.xlsx",
    ...     mode="a",
    ...     engine="openpyxl",
    ...     if_sheet_exists="overlay",
    ... ) as writer:
    ...     df1.to_excel(writer, sheet_name="Sheet1")
    ...     df2.to_excel(writer, sheet_name="Sheet1", startcol=3)  # doctest: +SKIP

    You can store Excel file in RAM:

    >>> import io
    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
    >>> buffer = io.BytesIO()
    >>> with pd.ExcelWriter(buffer) as writer:
    ...     df.to_excel(writer)

    You can pack Excel file into zip archive:

    >>> import zipfile  # doctest: +SKIP
    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf:
    ...     with zf.open("filename.xlsx", "w") as buffer:
    ...         with pd.ExcelWriter(buffer) as writer:
    ...             df.to_excel(writer)  # doctest: +SKIP

    You can specify additional arguments to the underlying engine:

    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     engine="xlsxwriter",
    ...     engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}}
    ... ) as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    In append mode, ``engine_kwargs`` are passed through to
    openpyxl's ``load_workbook``:

    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     engine="openpyxl",
    ...     mode="a",
    ...     engine_kwargs={{"keep_vba": True}}
    ... ) as writer:
    ...     df.to_excel(writer, sheet_name="Sheet2")  # doctest: +SKIP
    """

    # Defining an ExcelWriter implementation (see abstract methods for more...)

    # - Mandatory
    #   - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
    #     --> called to write additional DataFrames to disk
    #   - ``_supported_extensions`` (tuple of supported extensions), used to
    #     check that engine supports the given extension.
    #   - ``_engine`` - string that gives the engine name. Necessary to
    #     instantiate class directly and bypass ``ExcelWriterMeta`` engine
    #     lookup.
    #   - ``save(self)`` --> called to save file to disk
    # - Mostly mandatory (i.e. should at least exist)
    #   - book, cur_sheet, path

    # - Optional:
    #   - ``__init__(self, path, engine=None, **kwargs)`` --> always called
    #     with path as first argument.

    # You also need to register the class with ``register_writer()``.
    # Technically, ExcelWriter implementations don't need to subclass
    # ExcelWriter.

    # Engine name (e.g. "openpyxl"); set by each concrete subclass.
    _engine: str
    # Extensions (without the leading dot) the concrete writer supports.
    _supported_extensions: tuple[str, ...]

    def __new__(
        cls,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions | None = None,
        if_sheet_exists: ExcelWriterIfSheetExists | None = None,
        engine_kwargs: dict | None = None,
    ) -> Self:
        """
        Resolve the concrete writer class from ``engine`` (or the file
        extension / ``io.excel.<ext>.writer`` option) and instantiate it.
        """
        # only switch class if generic(ExcelWriter)
        if cls is ExcelWriter:
            if engine is None or (isinstance(engine, str) and engine == "auto"):
                # Infer the extension from a string path; non-path buffers
                # default to the xlsx writer.
                if isinstance(path, str):
                    ext = os.path.splitext(path)[-1][1:]
                else:
                    ext = "xlsx"

                try:
                    # Config option may itself be "auto", in which case the
                    # default engine for this extension is looked up.
                    engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
                    if engine == "auto":
                        engine = get_default_engine(ext, mode="writer")
                except KeyError as err:
                    raise ValueError(f"No engine for filetype: '{ext}'") from err

            # for mypy
            assert engine is not None
            # error: Incompatible types in assignment (expression has type
            # "type[ExcelWriter[Any]]", variable has type "type[Self]")
            cls = get_writer(engine)  # type: ignore[assignment]

        return object.__new__(cls)

    # declare external properties you can count on
    _path = None

    @property
    def supported_extensions(self) -> tuple[str, ...]:
        """Extensions that writer engine supports."""
        return self._supported_extensions

    @property
    def engine(self) -> str:
        """Name of engine."""
        return self._engine

    @property
    def sheets(self) -> dict[str, Any]:
        """Mapping of sheet names to sheet objects."""
        raise NotImplementedError

    @property
    def book(self) -> _WorkbookT:
        """
        Book instance. Class type will depend on the engine used.

        This attribute can be used to access engine-specific features.
        """
        raise NotImplementedError

    def _write_cells(
        self,
        cells,
        sheet_name: str | None = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: tuple[int, int] | None = None,
    ) -> None:
        """
        Write given formatted cells into an Excel sheet.

        Parameters
        ----------
        cells : generator
            cell of formatted data to save to Excel sheet
        sheet_name : str, default None
            Name of Excel sheet, if None, then use self.cur_sheet
        startrow : upper left cell row to dump data frame
        startcol : upper left cell column to dump data frame
        freeze_panes: int tuple of length 2
            contains the bottom-most row and right-most column to freeze
        """
        raise NotImplementedError

    def _save(self) -> None:
        """
        Save workbook to disk.
        """
        raise NotImplementedError

    def __init__(
        self,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions | None = None,
        if_sheet_exists: ExcelWriterIfSheetExists | None = None,
        engine_kwargs: dict[str, Any] | None = None,
    ) -> None:
        # validate that this engine can handle the extension
        if isinstance(path, str):
            ext = os.path.splitext(path)[-1]
            self.check_extension(ext)

        # use mode to open the file; Excel files are always binary
        if "b" not in mode:
            mode += "b"
        # use "a" for the user to append data to excel but internally use "r+" to let
        # the excel backend first read the existing file and then write any data to it
        mode = mode.replace("a", "r+")

        if if_sheet_exists not in (None, "error", "new", "replace", "overlay"):
            raise ValueError(
                f"'{if_sheet_exists}' is not valid for if_sheet_exists. "
                "Valid options are 'error', 'new', 'replace' and 'overlay'."
            )
        # if_sheet_exists is meaningful only in append mode ("r+" after the
        # replacement above).
        if if_sheet_exists and "r+" not in mode:
            raise ValueError("if_sheet_exists is only valid in append mode (mode='a')")
        if if_sheet_exists is None:
            if_sheet_exists = "error"
        self._if_sheet_exists = if_sheet_exists

        # cast ExcelWriter to avoid adding 'if self._handles is not None'
        self._handles = IOHandles(
            cast(IO[bytes], path), compression={"compression": None}
        )
        # Wrapping an existing ExcelWriter reuses its handle; anything else
        # (path, URL, buffer) is opened via get_handle.
        if not isinstance(path, ExcelWriter):
            self._handles = get_handle(
                path, mode, storage_options=storage_options, is_text=False
            )
        self._cur_sheet = None

        if date_format is None:
            self._date_format = "YYYY-MM-DD"
        else:
            self._date_format = date_format
        if datetime_format is None:
            self._datetime_format = "YYYY-MM-DD HH:MM:SS"
        else:
            self._datetime_format = datetime_format

        self._mode = mode

    @property
    def date_format(self) -> str:
        """
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
        """
        return self._date_format

    @property
    def datetime_format(self) -> str:
        """
        Format string for datetimes written into Excel files
        (e.g. 'YYYY-MM-DD HH:MM:SS').
        """
        return self._datetime_format

    @property
    def if_sheet_exists(self) -> str:
        """
        How to behave when writing to a sheet that already exists in append mode.
        """
        return self._if_sheet_exists

    def __fspath__(self) -> str:
        # Buffers without a ``name`` attribute yield an empty path string.
        return getattr(self._handles.handle, "name", "")

    def _get_sheet_name(self, sheet_name: str | None) -> str:
        """Return ``sheet_name``, falling back to the current sheet."""
        if sheet_name is None:
            sheet_name = self._cur_sheet
        if sheet_name is None:  # pragma: no cover
            raise ValueError("Must pass explicit sheet_name or set _cur_sheet property")
        return sheet_name

    def _value_with_fmt(
        self, val
    ) -> tuple[
        int | float | bool | str | datetime.datetime | datetime.date, str | None
    ]:
        """
        Convert numpy types to Python types for the Excel writers.

        Parameters
        ----------
        val : object
            Value to be written into cells

        Returns
        -------
        Tuple with the first element being the converted value and the second
        being an optional format
        """
        fmt = None

        if is_integer(val):
            val = int(val)
        elif is_float(val):
            val = float(val)
        elif is_bool(val):
            val = bool(val)
        elif isinstance(val, datetime.datetime):
            fmt = self._datetime_format
        elif isinstance(val, datetime.date):
            fmt = self._date_format
        elif isinstance(val, datetime.timedelta):
            # Excel stores durations as fractional days: 86400 seconds per day.
            val = val.total_seconds() / 86400
            fmt = "0"
        else:
            # Fallback: serialize anything else (lists, dicts, ...) as str.
            val = str(val)

        return val, fmt

    @classmethod
    def check_extension(cls, ext: str) -> Literal[True]:
        """
        checks that path's extension against the Writer's supported
        extensions. If it isn't supported, raises UnsupportedFiletypeError.
        """
        if ext.startswith("."):
            ext = ext[1:]
        # NOTE(review): this is substring containment, so e.g. "xls" would
        # match a writer supporting "xlsx" — presumably intentional; confirm.
        if not any(ext in extension for extension in cls._supported_extensions):
            # NOTE(review): ``cls.engine`` accessed on the class returns the
            # property object, not the engine string, so the message may not
            # render the engine name — ``cls._engine`` looks intended; confirm.
            raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
        return True

    # Allow use as a contextmanager
    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()

    def close(self) -> None:
        """synonym for save, to make it more file-like"""
        self._save()
        self._handles.close()
1359
1360
# Magic numbers identifying old-style .xls containers.
XLS_SIGNATURES = (
    b"\x09\x00\x04\x00\x07\x00\x10\x00",  # BIFF2
    b"\x09\x02\x06\x00\x00\x00\x10\x00",  # BIFF3
    b"\x09\x04\x06\x00\x00\x00\x10\x00",  # BIFF4
    b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",  # Compound File Binary
)
# All modern formats (xlsx/xlsb/ods) are ZIP archives.
ZIP_SIGNATURE = b"PK\x03\x04"
# Longest signature above — how many bytes we need to peek.
PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,)))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    content_or_path: FilePath | ReadBuffer[bytes],
    storage_options: StorageOptions | None = None,
) -> str | None:
    """
    Inspect the path or content of an excel file and get its format.

    Adopted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    content_or_path : str or file-like object
        Path to file or content of file to inspect. May be a URL.
    {storage_options}

    Returns
    -------
    str or None
        Format of file if it can be determined.

    Raises
    ------
    ValueError
        If resulting stream is empty.
    BadZipFile
        If resulting stream does not have an XLS signature and is not a valid zipfile.
    """
    # Raw bytes are wrapped so that get_handle sees a file-like object.
    if isinstance(content_or_path, bytes):
        content_or_path = BytesIO(content_or_path)

    with get_handle(
        content_or_path, "rb", storage_options=storage_options, is_text=False
    ) as handle:
        stream = handle.handle
        stream.seek(0)
        header = stream.read(PEEK_SIZE)
        if header is None:
            raise ValueError("stream is empty")
        assert isinstance(header, bytes)
        # Rewind so the ZIP reader below sees the file from the start.
        stream.seek(0)

        # Old-style binary .xls: any of the legacy signatures.
        if header.startswith(XLS_SIGNATURES):
            return "xls"
        # Not a ZIP archive either -> format cannot be determined.
        if not header.startswith(ZIP_SIGNATURE):
            return None

        with zipfile.ZipFile(stream) as archive:
            # Normalise separators and case: some third-party producers
            # use backslashes and/or lower-case member names.
            members = {
                member.replace("\\", "/").lower() for member in archive.namelist()
            }

        if "xl/workbook.xml" in members:
            return "xlsx"
        if "xl/workbook.bin" in members:
            return "xlsb"
        if "content.xml" in members:
            return "ods"
        # A ZIP archive, but not a recognised spreadsheet layout.
        return "zip"
1433
1434
class ExcelFile:
    """
    Class for parsing tabular Excel sheets into DataFrame objects.

    See read_excel for more documentation.

    Parameters
    ----------
    path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
        A file-like object, xlrd workbook or openpyxl workbook.
        If a string or path object, expected to be a path to a
        .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
    engine : str, default None
        If io is not a buffer or path, this must be set to identify io.
        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine``
        Engine compatibility :

        - ``xlrd`` supports old-style Excel files (.xls).
        - ``openpyxl`` supports newer Excel file formats.
        - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
        - ``pyxlsb`` supports Binary Excel files.
        - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb)
          and OpenDocument (.ods) file formats.

        .. versionchanged:: 1.2.0

           The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
           now only supports old-style ``.xls`` files.
           When ``engine=None``, the following logic will be
           used to determine the engine:

           - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
             then `odf <https://pypi.org/project/odfpy/>`_ will be used.
           - Otherwise if ``path_or_buffer`` is an xls format,
             ``xlrd`` will be used.
           - Otherwise if ``path_or_buffer`` is in xlsb format,
             `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.

           .. versionadded:: 1.3.0

           - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
             then ``openpyxl`` will be used.
           - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.

           .. warning::

            Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
            This is not supported, switch to using ``openpyxl`` instead.
    engine_kwargs : dict, optional
        Arbitrary keyword arguments passed to excel engine.

    Examples
    --------
    >>> file = pd.ExcelFile('myfile.xlsx')  # doctest: +SKIP
    >>> with pd.ExcelFile("myfile.xls") as xls:  # doctest: +SKIP
    ...     df1 = pd.read_excel(xls, "Sheet1")  # doctest: +SKIP
    """

    # Reader classes are imported inside the class body rather than at module
    # top level — presumably to defer/avoid circular imports; TODO confirm.
    from pandas.io.excel._calamine import CalamineReader
    from pandas.io.excel._odfreader import ODFReader
    from pandas.io.excel._openpyxl import OpenpyxlReader
    from pandas.io.excel._pyxlsb import PyxlsbReader
    from pandas.io.excel._xlrd import XlrdReader

    # Engine-name -> reader-class dispatch used by __init__.
    _engines: Mapping[str, Any] = {
        "xlrd": XlrdReader,
        "openpyxl": OpenpyxlReader,
        "odf": ODFReader,
        "pyxlsb": PyxlsbReader,
        "calamine": CalamineReader,
    }

    def __init__(
        self,
        path_or_buffer,
        engine: str | None = None,
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        if engine_kwargs is None:
            engine_kwargs = {}

        # Fail early on an engine name we have no reader for.
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # First argument can also be bytes, so create a buffer
        if isinstance(path_or_buffer, bytes):
            path_or_buffer = BytesIO(path_or_buffer)
            # Deprecated path: still works, but callers should wrap bytes
            # in BytesIO themselves.
            warnings.warn(
                "Passing bytes to 'read_excel' is deprecated and "
                "will be removed in a future version. To read from a "
                "byte string, wrap it in a `BytesIO` object.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if import_optional_dependency("xlrd", errors="ignore") is None:
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = Version(get_version(xlrd))

        if engine is None:
            # Only determine ext if it is needed
            ext: str | None
            # An already-open xlrd Book carries no bytes to sniff; it is,
            # by construction, an xls workbook.
            if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
                ext = "xls"
            else:
                # Sniff the format from the file signature.
                ext = inspect_excel_format(
                    content_or_path=path_or_buffer, storage_options=storage_options
                )
                if ext is None:
                    raise ValueError(
                        "Excel file format cannot be determined, you must specify "
                        "an engine manually."
                    )

            # Config option may itself be "auto", in which case the default
            # reader for this extension is looked up.
            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        # for mypy: engine is resolved by this point
        assert engine is not None
        self.engine = engine
        self.storage_options = storage_options

        # Instantiate the engine-specific reader that does the actual work.
        self._reader = self._engines[engine](
            self._io,
            storage_options=storage_options,
            engine_kwargs=engine_kwargs,
        )

    def __fspath__(self):
        return self._io

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names: SequenceNotStr[Hashable] | range | None = None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        converters=None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: str | dict[Hashable, str] | None = None,
        thousands: str | None = None,
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
        """
        Parse specified sheet(s) into a DataFrame.

        Equivalent to read_excel(ExcelFile, ...)  See the read_excel
        docstring for more info on accepted parameters.

        Returns
        -------
        DataFrame or dict of DataFrames
            DataFrame from the passed in Excel file.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
        >>> df.to_excel('myfile.xlsx')  # doctest: +SKIP
        >>> file = pd.ExcelFile('myfile.xlsx')  # doctest: +SKIP
        >>> file.parse()  # doctest: +SKIP
        """
        # Delegate wholesale to the engine-specific reader.
        return self._reader.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
            **kwds,
        )

    @property
    def book(self):
        """Engine-specific workbook object backing this file."""
        return self._reader.book

    @property
    def sheet_names(self):
        """Sheet names reported by the underlying reader."""
        return self._reader.sheet_names

    def close(self) -> None:
        """close io if necessary"""
        self._reader.close()

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()