1from __future__ import annotations
2
3from collections.abc import (
4 Hashable,
5 Iterable,
6 Mapping,
7 Sequence,
8)
9import datetime
10from functools import partial
11from io import BytesIO
12import os
13from textwrap import fill
14from typing import (
15 IO,
16 TYPE_CHECKING,
17 Any,
18 Callable,
19 Generic,
20 Literal,
21 TypeVar,
22 Union,
23 cast,
24 overload,
25)
26import warnings
27import zipfile
28
29from pandas._config import config
30
31from pandas._libs import lib
32from pandas._libs.parsers import STR_NA_VALUES
33from pandas.compat._optional import (
34 get_version,
35 import_optional_dependency,
36)
37from pandas.errors import EmptyDataError
38from pandas.util._decorators import (
39 Appender,
40 doc,
41)
42from pandas.util._exceptions import find_stack_level
43from pandas.util._validators import check_dtype_backend
44
45from pandas.core.dtypes.common import (
46 is_bool,
47 is_float,
48 is_integer,
49 is_list_like,
50)
51
52from pandas.core.frame import DataFrame
53from pandas.core.shared_docs import _shared_docs
54from pandas.util.version import Version
55
56from pandas.io.common import (
57 IOHandles,
58 get_handle,
59 stringify_path,
60 validate_header_arg,
61)
62from pandas.io.excel._util import (
63 fill_mi_header,
64 get_default_engine,
65 get_writer,
66 maybe_convert_usecols,
67 pop_header_name,
68)
69from pandas.io.parsers import TextParser
70from pandas.io.parsers.readers import validate_integer
71
72if TYPE_CHECKING:
73 from types import TracebackType
74
75 from pandas._typing import (
76 DtypeArg,
77 DtypeBackend,
78 ExcelWriterIfSheetExists,
79 FilePath,
80 IntStrT,
81 ReadBuffer,
82 Self,
83 SequenceNotStr,
84 StorageOptions,
85 WriteExcelBuffer,
86 )
87_read_excel_doc = (
88 """
89Read an Excel file into a ``pandas`` ``DataFrame``.
90
91Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
92read from a local filesystem or URL. Supports an option to read
93a single sheet or a list of sheets.
94
95Parameters
96----------
97io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
98 Any valid string path is acceptable. The string could be a URL. Valid
99 URL schemes include http, ftp, s3, and file. For file URLs, a host is
100 expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
101
102 If you want to pass in a path object, pandas accepts any ``os.PathLike``.
103
104 By file-like object, we refer to objects with a ``read()`` method,
105 such as a file handle (e.g. via builtin ``open`` function)
106 or ``StringIO``.
107
108 .. deprecated:: 2.1.0
109 Passing byte strings is deprecated. To read from a
110 byte string, wrap it in a ``BytesIO`` object.
111sheet_name : str, int, list, or None, default 0
112 Strings are used for sheet names. Integers are used in zero-indexed
113 sheet positions (chart sheets do not count as a sheet position).
114 Lists of strings/integers are used to request multiple sheets.
115 Specify ``None`` to get all worksheets.
116
117 Available cases:
118
119 * Defaults to ``0``: 1st sheet as a `DataFrame`
120 * ``1``: 2nd sheet as a `DataFrame`
121 * ``"Sheet1"``: Load sheet with name "Sheet1"
122 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
123 as a dict of `DataFrame`
124 * ``None``: All worksheets.
125
126header : int, list of int, default 0
127 Row (0-indexed) to use for the column labels of the parsed
128 DataFrame. If a list of integers is passed those row positions will
129 be combined into a ``MultiIndex``. Use None if there is no header.
130names : array-like, default None
131 List of column names to use. If file contains no header row,
132 then you should explicitly pass header=None.
133index_col : int, str, list of int, default None
134 Column (0-indexed) to use as the row labels of the DataFrame.
135 Pass None if there is no such column. If a list is passed,
136 those columns will be combined into a ``MultiIndex``. If a
137 subset of data is selected with ``usecols``, index_col
138 is based on the subset.
139
140 Missing values will be forward filled to allow roundtripping with
141 ``to_excel`` for ``merged_cells=True``. To avoid forward filling the
142 missing values use ``set_index`` after reading the data instead of
143 ``index_col``.
144usecols : str, list-like, or callable, default None
145 * If None, then parse all columns.
146 * If str, then indicates comma separated list of Excel column letters
147 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
148 both sides.
149 * If list of int, then indicates list of column numbers to be parsed
150 (0-indexed).
151 * If list of string, then indicates list of column names to be parsed.
152 * If callable, then evaluate each column name against it and parse the
153 column if the callable returns ``True``.
154
155 Returns a subset of the columns according to behavior above.
156dtype : Type name or dict of column -> type, default None
157 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
158 Use ``object`` to preserve data as stored in Excel and not interpret dtype,
159 which will necessarily result in ``object`` dtype.
160 If converters are specified, they will be applied INSTEAD
161 of dtype conversion.
162 If you use ``None``, it will infer the dtype of each column based on the data.
163engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None
164 If io is not a buffer or path, this must be set to identify io.
165 Engine compatibility :
166
167 - ``openpyxl`` supports newer Excel file formats.
168 - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb)
169 and OpenDocument (.ods) file formats.
170 - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
171 - ``pyxlsb`` supports Binary Excel files.
172 - ``xlrd`` supports old-style Excel files (.xls).
173
174 When ``engine=None``, the following logic will be used to determine the engine:
175
176 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
177 then `odf <https://pypi.org/project/odfpy/>`_ will be used.
178 - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used.
179 - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used.
180 - Otherwise ``openpyxl`` will be used.
181converters : dict, default None
182 Dict of functions for converting values in certain columns. Keys can
183 either be integers or column labels, values are functions that take one
184 input argument, the Excel cell content, and return the transformed
185 content.
186true_values : list, default None
187 Values to consider as True.
188false_values : list, default None
189 Values to consider as False.
190skiprows : list-like, int, or callable, optional
191 Line numbers to skip (0-indexed) or number of lines to skip (int) at the
192 start of the file. If callable, the callable function will be evaluated
193 against the row indices, returning True if the row should be skipped and
194 False otherwise. An example of a valid callable argument would be ``lambda
195 x: x in [0, 2]``.
196nrows : int, default None
197 Number of rows to parse.
198na_values : scalar, str, list-like, or dict, default None
199 Additional strings to recognize as NA/NaN. If dict passed, specific
200 per-column NA values. By default the following values are interpreted
201 as NaN: '"""
202 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
203 + """'.
204keep_default_na : bool, default True
205 Whether or not to include the default NaN values when parsing the data.
206 Depending on whether ``na_values`` is passed in, the behavior is as follows:
207
208 * If ``keep_default_na`` is True, and ``na_values`` are specified,
209 ``na_values`` is appended to the default NaN values used for parsing.
210 * If ``keep_default_na`` is True, and ``na_values`` are not specified, only
211 the default NaN values are used for parsing.
212 * If ``keep_default_na`` is False, and ``na_values`` are specified, only
213 the NaN values specified ``na_values`` are used for parsing.
214 * If ``keep_default_na`` is False, and ``na_values`` are not specified, no
215 strings will be parsed as NaN.
216
217 Note that if `na_filter` is passed in as False, the ``keep_default_na`` and
218 ``na_values`` parameters will be ignored.
219na_filter : bool, default True
220 Detect missing value markers (empty strings and the value of na_values). In
221 data without any NAs, passing ``na_filter=False`` can improve the
222 performance of reading a large file.
223verbose : bool, default False
224 Indicate number of NA values placed in non-numeric columns.
225parse_dates : bool, list-like, or dict, default False
226 The behavior is as follows:
227
228 * ``bool``. If True -> try parsing the index.
229 * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
230 each as a separate date column.
231 * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
232 a single date column.
233 * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
234 result 'foo'
235
236 If a column or index contains an unparsable date, the entire column or
    index will be returned unaltered as an object data type. If you don't want to
238 parse some cells as date just change their type in Excel to "Text".
239 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
240
241 Note: A fast-path exists for iso8601-formatted dates.
242date_parser : function, optional
243 Function to use for converting a sequence of string columns to an array of
244 datetime instances. The default uses ``dateutil.parser.parser`` to do the
245 conversion. Pandas will try to call `date_parser` in three different ways,
246 advancing to the next if an exception occurs: 1) Pass one or more arrays
247 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
248 string values from the columns defined by `parse_dates` into a single array
249 and pass that; and 3) call `date_parser` once for each row using one or
250 more strings (corresponding to the columns defined by `parse_dates`) as
251 arguments.
252
253 .. deprecated:: 2.0.0
254 Use ``date_format`` instead, or read in as ``object`` and then apply
255 :func:`to_datetime` as-needed.
256date_format : str or dict of column -> format, default ``None``
257 If used in conjunction with ``parse_dates``, will parse dates according to this
258 format. For anything more complex,
259 please read in as ``object`` and then apply :func:`to_datetime` as-needed.
260
261 .. versionadded:: 2.0.0
262thousands : str, default None
263 Thousands separator for parsing string columns to numeric. Note that
264 this parameter is only necessary for columns stored as TEXT in Excel,
265 any numeric columns will automatically be parsed, regardless of display
266 format.
267decimal : str, default '.'
268 Character to recognize as decimal point for parsing string columns to numeric.
269 Note that this parameter is only necessary for columns stored as TEXT in Excel,
270 any numeric columns will automatically be parsed, regardless of display
    format (e.g. use ',' for European data).
272
273 .. versionadded:: 1.4.0
274
275comment : str, default None
276 Comments out remainder of line. Pass a character or characters to this
277 argument to indicate comments in the input file. Any data between the
278 comment string and the end of the current line is ignored.
279skipfooter : int, default 0
280 Rows at the end to skip (0-indexed).
281{storage_options}
282
283dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
284 Back-end data type applied to the resultant :class:`DataFrame`
285 (still experimental). Behaviour is as follows:
286
287 * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
288 (default).
289 * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
290 DataFrame.
291
292 .. versionadded:: 2.0
293
294engine_kwargs : dict, optional
295 Arbitrary keyword arguments passed to excel engine.
296
297Returns
298-------
299DataFrame or dict of DataFrames
300 DataFrame from the passed in Excel file. See notes in sheet_name
301 argument for more information on when a dict of DataFrames is returned.
302
303See Also
304--------
305DataFrame.to_excel : Write DataFrame to an Excel file.
306DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
307read_csv : Read a comma-separated values (csv) file into DataFrame.
308read_fwf : Read a table of fixed-width formatted lines into DataFrame.
309
310Notes
311-----
312For specific information on the methods used for each Excel engine, refer to the pandas
313:ref:`user guide <io.excel_reader>`
314
315Examples
316--------
317The file can be read using the file name as string or an open file object:
318
319>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
320 Name Value
3210 string1 1
3221 string2 2
3232 #Comment 3
324
325>>> pd.read_excel(open('tmp.xlsx', 'rb'),
326... sheet_name='Sheet3') # doctest: +SKIP
327 Unnamed: 0 Name Value
3280 0 string1 1
3291 1 string2 2
3302 2 #Comment 3
331
332Index and header can be specified via the `index_col` and `header` arguments
333
334>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
335 0 1 2
3360 NaN Name Value
3371 0.0 string1 1
3382 1.0 string2 2
3393 2.0 #Comment 3
340
341Column types are inferred but can be explicitly specified
342
343>>> pd.read_excel('tmp.xlsx', index_col=0,
344... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP
345 Name Value
3460 string1 1.0
3471 string2 2.0
3482 #Comment 3.0
349
350True, False, and NA values, and thousands separators have defaults,
351but can be explicitly specified, too. Supply the values you would like
352as strings or lists of strings!
353
354>>> pd.read_excel('tmp.xlsx', index_col=0,
355... na_values=['string1', 'string2']) # doctest: +SKIP
356 Name Value
3570 NaN 1
3581 NaN 2
3592 #Comment 3
360
361Comment lines in the excel input file can be skipped using the
362``comment`` kwarg.
363
364>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
365 Name Value
3660 string1 1.0
3671 string2 2.0
3682 None NaN
369"""
370)
371
372
# Typing overload: a scalar ``sheet_name`` (str or int) selects exactly one
# sheet, so the return type narrows to a single DataFrame.
@overload
def read_excel(
    io,
    # sheet name is str or int -> DataFrame
    sheet_name: str | int = ...,
    *,
    header: int | Sequence[int] | None = ...,
    names: SequenceNotStr[Hashable] | range | None = ...,
    index_col: int | str | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...
410
411
# Typing overload: a list of sheet identifiers, or None (all sheets), yields a
# dict mapping each sheet name/index to its DataFrame.
@overload
def read_excel(
    io,
    # sheet name is list or None -> dict[IntStrT, DataFrame]
    sheet_name: list[IntStrT] | None,
    *,
    header: int | Sequence[int] | None = ...,
    names: SequenceNotStr[Hashable] | range | None = ...,
    index_col: int | str | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> dict[IntStrT, DataFrame]:
    ...
449
450
@doc(storage_options=_shared_docs["storage_options"])
@Appender(_read_excel_doc)
def read_excel(
    io,
    sheet_name: str | int | list[IntStrT] | None = 0,
    *,
    header: int | Sequence[int] | None = 0,
    names: SequenceNotStr[Hashable] | range | None = None,
    index_col: int | str | Sequence[int] | None = None,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = None,
    dtype: DtypeArg | None = None,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None,
    converters: dict[str, Callable] | dict[int, Callable] | None = None,
    true_values: Iterable[Hashable] | None = None,
    false_values: Iterable[Hashable] | None = None,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
    nrows: int | None = None,
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    parse_dates: list | dict | bool = False,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: dict[Hashable, str] | str | None = None,
    thousands: str | None = None,
    decimal: str = ".",
    comment: str | None = None,
    skipfooter: int = 0,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    engine_kwargs: dict | None = None,
) -> DataFrame | dict[IntStrT, DataFrame]:
    # Validate the requested dtype backend before doing any I/O work.
    check_dtype_backend(dtype_backend)

    engine_kwargs = {} if engine_kwargs is None else engine_kwargs

    if isinstance(io, ExcelFile):
        # Caller owns the ExcelFile: never close it here, and reject a
        # conflicting engine request since the engine is already fixed.
        if engine and engine != io.engine:
            raise ValueError(
                "Engine should not be specified when passing "
                "an ExcelFile - ExcelFile already has the engine set"
            )
        owns_handle = False
    else:
        # We opened the file ourselves, so we are responsible for closing it.
        io = ExcelFile(
            io,
            storage_options=storage_options,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )
        owns_handle = True

    try:
        data = io.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            dtype=dtype,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            keep_default_na=keep_default_na,
            na_filter=na_filter,
            verbose=verbose,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            decimal=decimal,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
        )
    finally:
        # Close the handle we opened, whether parsing succeeded or raised.
        if owns_handle:
            io.close()
    return data
538
539
# Type variable for the engine-specific workbook object that BaseExcelReader
# subclasses wrap (e.g. an openpyxl Workbook or an xlrd Book).
_WorkbookT = TypeVar("_WorkbookT")
541
542
class BaseExcelReader(Generic[_WorkbookT]):
    """
    Base class for engine-specific Excel readers.

    Handles IO-handle management and the engine-agnostic parsing logic in
    :meth:`parse`; subclasses supply the workbook-specific hooks
    (``load_workbook``, ``get_sheet_by_name``, ``get_sheet_by_index``,
    ``get_sheet_data``, ``sheet_names``).
    """

    # The engine-specific workbook object; assigned in __init__.
    book: _WorkbookT

    def __init__(
        self,
        filepath_or_buffer,
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        """
        Open ``filepath_or_buffer`` and load the engine-specific workbook.

        Parameters
        ----------
        filepath_or_buffer : str, path object, bytes, file-like, or workbook
            Source to read; bytes are wrapped in a ``BytesIO`` first.
        storage_options : StorageOptions, optional
            Forwarded to ``get_handle`` when a handle must be opened.
        engine_kwargs : dict, optional
            Forwarded to the engine's ``load_workbook`` hook.

        Raises
        ------
        ValueError
            If ``filepath_or_buffer`` is neither a workbook instance nor
            something with a ``read`` method.
        """
        if engine_kwargs is None:
            engine_kwargs = {}

        # First argument can also be bytes, so create a buffer
        if isinstance(filepath_or_buffer, bytes):
            filepath_or_buffer = BytesIO(filepath_or_buffer)

        self.handles = IOHandles(
            handle=filepath_or_buffer, compression={"method": None}
        )
        if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            self.handles = get_handle(
                filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
            )

        if isinstance(self.handles.handle, self._workbook_class):
            # Caller passed an already-loaded workbook; adopt it as-is.
            self.book = self.handles.handle
        elif hasattr(self.handles.handle, "read"):
            # N.B. xlrd.Book has a read attribute too
            self.handles.handle.seek(0)
            try:
                self.book = self.load_workbook(self.handles.handle, engine_kwargs)
            except Exception:
                # Don't leak the handle we opened if workbook loading fails.
                self.close()
                raise
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )

    @property
    def _workbook_class(self) -> type[_WorkbookT]:
        """The engine's workbook class; used for isinstance checks in __init__."""
        raise NotImplementedError

    def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT:
        """Load and return the engine-specific workbook (subclass hook)."""
        raise NotImplementedError

    def close(self) -> None:
        """Close the workbook (engine permitting) and any opened IO handles."""
        if hasattr(self, "book"):
            if hasattr(self.book, "close"):
                # pyxlsb: opens a TemporaryFile
                # openpyxl: https://stackoverflow.com/questions/31416842/
                # openpyxl-does-not-close-excel-workbook-in-read-only-mode
                self.book.close()
            elif hasattr(self.book, "release_resources"):
                # xlrd
                # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548
                self.book.release_resources()
        self.handles.close()

    @property
    def sheet_names(self) -> list[str]:
        """Names of all worksheets in the workbook (subclass hook)."""
        raise NotImplementedError

    def get_sheet_by_name(self, name: str):
        """Return the engine-specific sheet object named ``name`` (subclass hook)."""
        raise NotImplementedError

    def get_sheet_by_index(self, index: int):
        """Return the engine-specific sheet object at ``index`` (subclass hook)."""
        raise NotImplementedError

    def get_sheet_data(self, sheet, rows: int | None = None):
        """Return sheet contents as a list of rows, at most ``rows`` (subclass hook)."""
        raise NotImplementedError

    def raise_if_bad_sheet_by_index(self, index: int) -> None:
        """Raise ValueError if ``index`` is out of range for this workbook."""
        n_sheets = len(self.sheet_names)
        if index >= n_sheets:
            raise ValueError(
                f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
            )

    def raise_if_bad_sheet_by_name(self, name: str) -> None:
        """Raise ValueError if no worksheet named ``name`` exists."""
        if name not in self.sheet_names:
            raise ValueError(f"Worksheet named '{name}' not found")

    def _check_skiprows_func(
        self,
        skiprows: Callable,
        rows_to_use: int,
    ) -> int:
        """
        Determine how many file rows are required to obtain `nrows` data
        rows when `skiprows` is a function.

        Parameters
        ----------
        skiprows : function
            The function passed to read_excel by the user.
        rows_to_use : int
            The number of rows that will be needed for the header and
            the data.

        Returns
        -------
        int
        """
        # Count file rows until rows_to_use non-skipped rows have been seen.
        # NOTE(review): assumes skiprows eventually returns False often
        # enough; a function that skips everything would loop forever.
        i = 0
        rows_used_so_far = 0
        while rows_used_so_far < rows_to_use:
            if not skiprows(i):
                rows_used_so_far += 1
            i += 1
        return i

    def _calc_rows(
        self,
        header: int | Sequence[int] | None,
        index_col: int | Sequence[int] | None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None,
        nrows: int | None,
    ) -> int | None:
        """
        If nrows specified, find the number of rows needed from the
        file, otherwise return None.


        Parameters
        ----------
        header : int, list of int, or None
            See read_excel docstring.
        index_col : int, str, list of int, or None
            See read_excel docstring.
        skiprows : list-like, int, callable, or None
            See read_excel docstring.
        nrows : int or None
            See read_excel docstring.

        Returns
        -------
        int or None
        """
        if nrows is None:
            return None
        if header is None:
            header_rows = 1
        elif is_integer(header):
            header = cast(int, header)
            header_rows = 1 + header
        else:
            header = cast(Sequence, header)
            header_rows = 1 + header[-1]
        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        if is_list_like(header) and index_col is not None:
            header = cast(Sequence, header)
            if len(header) > 1:
                header_rows += 1
        if skiprows is None:
            return header_rows + nrows
        if is_integer(skiprows):
            skiprows = cast(int, skiprows)
            return header_rows + nrows + skiprows
        if is_list_like(skiprows):

            def f(skiprows: Sequence, x: int) -> bool:
                # Membership test makes the list-like behave like a callable.
                return x in skiprows

            skiprows = cast(Sequence, skiprows)
            return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
        if callable(skiprows):
            return self._check_skiprows_func(
                skiprows,
                header_rows + nrows,
            )
        # else unexpected skiprows type: read_excel will not optimize
        # the number of rows read from file
        return None

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names: SequenceNotStr[Hashable] | range | None = None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        dtype: DtypeArg | None = None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        verbose: bool = False,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: dict[Hashable, str] | str | None = None,
        thousands: str | None = None,
        decimal: str = ".",
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ):
        """
        Parse one or more worksheets into DataFrames.

        Returns a single DataFrame when ``sheet_name`` is a str or int, or a
        dict keyed by sheet name/index when it is a list or None (all sheets).
        See ``read_excel`` for the semantics of the remaining parameters;
        extra ``**kwds`` are forwarded to ``TextParser``.
        """
        validate_header_arg(header)
        validate_integer("nrows", nrows)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        sheets: list[int] | list[str]
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        elif isinstance(sheet_name, str):
            sheets = [sheet_name]
        else:
            # NOTE(review): identical to the str branch; any remaining scalar
            # (assumed int) also selects a single sheet.
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = cast(Union[list[int], list[str]], list(dict.fromkeys(sheets).keys()))

        output = {}

        last_sheetname = None
        for asheetname in sheets:
            last_sheetname = asheetname
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            # Limit how much of the sheet is materialized when nrows allows it.
            file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
            data = self.get_sheet_data(sheet, file_rows_needed)
            if hasattr(sheet, "close"):
                # pyxlsb opens two TemporaryFiles
                sheet.close()
            usecols = maybe_convert_usecols(usecols)

            if not data:
                # Empty sheet -> empty DataFrame, skip the parsing machinery.
                output[asheetname] = DataFrame()
                continue

            is_list_header = False
            is_len_one_list_header = False
            if is_list_like(header):
                assert isinstance(header, Sequence)
                is_list_header = True
                if len(header) == 1:
                    is_len_one_list_header = True

            if is_len_one_list_header:
                # A one-element header list behaves like a scalar header.
                header = cast(Sequence[int], header)[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                assert isinstance(header, Sequence)

                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        assert isinstance(skiprows, int)
                        row += skiprows

                    if row > len(data) - 1:
                        raise ValueError(
                            f"header index {row} exceeds maximum index "
                            f"{len(data) - 1} of data.",
                        )

                    data[row], control_row = fill_mi_header(data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            # If there is a MultiIndex header and an index then there is also
            # a row containing just the index name(s)
            has_index_names = False
            if is_list_header and not is_len_one_list_header and index_col is not None:
                index_col_list: Sequence[int]
                if isinstance(index_col, int):
                    index_col_list = [index_col]
                else:
                    assert isinstance(index_col, Sequence)
                    index_col_list = index_col

                # We have to handle mi without names. If any of the entries in the data
                # columns are not empty, this is a regular row
                assert isinstance(header, Sequence)
                if len(header) < len(data):
                    potential_index_names = data[len(header)]
                    potential_data = [
                        x
                        for i, x in enumerate(potential_index_names)
                        if not control_row[i] and i not in index_col_list
                    ]
                    has_index_names = all(x == "" or x is None for x in potential_data)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if header is None:
                    offset = 0
                elif isinstance(header, int):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # GH34673: if MultiIndex names present and not defined in the header,
                # offset needs to be incremented so that forward filling starts
                # from the first MI value instead of the name
                if has_index_names:
                    offset += 1

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    assert isinstance(index_col, Sequence)

                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    skip_blank_lines=False,  # GH 39808
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    date_format=date_format,
                    thousands=thousands,
                    decimal=decimal,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    dtype_backend=dtype_backend,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if header_names:
                    output[asheetname].columns = output[asheetname].columns.set_names(
                        header_names
                    )

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

            except Exception as err:
                # Annotate any parsing failure with the offending sheet name.
                err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
                raise err

        if last_sheetname is None:
            raise ValueError("Sheet name is an empty list")

        if ret_dict:
            return output
        else:
            return output[last_sheetname]
925
926
@doc(storage_options=_shared_docs["storage_options"])
class ExcelWriter(Generic[_WorkbookT]):
    """
    Class for writing DataFrame objects into excel sheets.

    Default is to use:

    * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter
      is installed otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__
    * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files

    See ``DataFrame.to_excel`` for typical usage.

    The writer should be used as a context manager. Otherwise, call `close()` to save
    and close any opened file handles.

    Parameters
    ----------
    path : str or typing.BinaryIO
        Path to xls or xlsx or ods file.
    engine : str (optional)
        Engine to use for writing. If None, defaults to
        ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
        argument.
    date_format : str, default None
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
    datetime_format : str, default None
        Format string for datetime objects written into Excel files.
        (e.g. 'YYYY-MM-DD HH:MM:SS').
    mode : {{'w', 'a'}}, default 'w'
        File mode to use (write or append). Append does not work with fsspec URLs.
    {storage_options}

    if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error'
        How to behave when trying to write to a sheet that already
        exists (append mode only).

        * error: raise a ValueError.
        * new: Create a new sheet, with a name determined by the engine.
        * replace: Delete the contents of the sheet before writing to it.
        * overlay: Write contents to the existing sheet without first removing,
          but possibly over top of, the existing contents.

        .. versionadded:: 1.3.0

        .. versionchanged:: 1.4.0

           Added ``overlay`` option

    engine_kwargs : dict, optional
        Keyword arguments to be passed into the engine. These will be passed to
        the following functions of the respective engines:

        * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)``
        * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)``
        * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)``
        * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)``

        .. versionadded:: 1.3.0

    Notes
    -----
    For compatibility with CSV writers, ExcelWriter serializes lists
    and dicts to strings before writing.

    Examples
    --------
    Default usage:

    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    To write to separate sheets in a single file:

    >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"])  # doctest: +SKIP
    >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
    ...     df1.to_excel(writer, sheet_name="Sheet1")  # doctest: +SKIP
    ...     df2.to_excel(writer, sheet_name="Sheet2")  # doctest: +SKIP

    You can set the date format or datetime format:

    >>> from datetime import date, datetime  # doctest: +SKIP
    >>> df = pd.DataFrame(
    ...     [
    ...         [date(2014, 1, 31), date(1999, 9, 24)],
    ...         [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)],
    ...     ],
    ...     index=["Date", "Datetime"],
    ...     columns=["X", "Y"],
    ... )  # doctest: +SKIP
    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     date_format="YYYY-MM-DD",
    ...     datetime_format="YYYY-MM-DD HH:MM:SS"
    ... ) as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    You can also append to an existing Excel file:

    >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer:
    ...     df.to_excel(writer, sheet_name="Sheet3")  # doctest: +SKIP

    Here, the `if_sheet_exists` parameter can be set to replace a sheet if it
    already exists:

    >>> with ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     mode="a",
    ...     engine="openpyxl",
    ...     if_sheet_exists="replace",
    ... ) as writer:
    ...     df.to_excel(writer, sheet_name="Sheet1")  # doctest: +SKIP

    You can also write multiple DataFrames to a single sheet. Note that the
    ``if_sheet_exists`` parameter needs to be set to ``overlay``:

    >>> with ExcelWriter("path_to_file.xlsx",
    ...     mode="a",
    ...     engine="openpyxl",
    ...     if_sheet_exists="overlay",
    ... ) as writer:
    ...     df1.to_excel(writer, sheet_name="Sheet1")
    ...     df2.to_excel(writer, sheet_name="Sheet1", startcol=3)  # doctest: +SKIP

    You can store Excel file in RAM:

    >>> import io
    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
    >>> buffer = io.BytesIO()
    >>> with pd.ExcelWriter(buffer) as writer:
    ...     df.to_excel(writer)

    You can pack Excel file into zip archive:

    >>> import zipfile  # doctest: +SKIP
    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf:
    ...     with zf.open("filename.xlsx", "w") as buffer:
    ...         with pd.ExcelWriter(buffer) as writer:
    ...             df.to_excel(writer)  # doctest: +SKIP

    You can specify additional arguments to the underlying engine:

    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     engine="xlsxwriter",
    ...     engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}}
    ... ) as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    In append mode, ``engine_kwargs`` are passed through to
    openpyxl's ``load_workbook``:

    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     engine="openpyxl",
    ...     mode="a",
    ...     engine_kwargs={{"keep_vba": True}}
    ... ) as writer:
    ...     df.to_excel(writer, sheet_name="Sheet2")  # doctest: +SKIP
    """

    # Defining an ExcelWriter implementation (see abstract methods for more...)

    # - Mandatory
    #   - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
    #     --> called to write additional DataFrames to disk
    #   - ``_supported_extensions`` (tuple of supported extensions), used to
    #     check that engine supports the given extension.
    #   - ``_engine`` - string that gives the engine name. Necessary to
    #     instantiate class directly and bypass ``ExcelWriterMeta`` engine
    #     lookup.
    #   - ``save(self)`` --> called to save file to disk
    # - Mostly mandatory (i.e. should at least exist)
    #   - book, cur_sheet, path

    # - Optional:
    #   - ``__init__(self, path, engine=None, **kwargs)`` --> always called
    #     with path as first argument.

    # You also need to register the class with ``register_writer()``.
    # Technically, ExcelWriter implementations don't need to subclass
    # ExcelWriter.

    # Engine name (e.g. "openpyxl"); set by each concrete subclass.
    _engine: str
    # Extensions (without the leading dot) the concrete writer supports.
    _supported_extensions: tuple[str, ...]

    def __new__(
        cls,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions | None = None,
        if_sheet_exists: ExcelWriterIfSheetExists | None = None,
        engine_kwargs: dict | None = None,
    ) -> Self:
        """
        Resolve the concrete writer class from ``engine`` (or the file
        extension / ``io.excel.<ext>.writer`` option) and instantiate it.
        """
        # only switch class if generic(ExcelWriter)
        if cls is ExcelWriter:
            if engine is None or (isinstance(engine, str) and engine == "auto"):
                # Infer the extension from a string path; non-path buffers
                # default to the xlsx writer.
                if isinstance(path, str):
                    ext = os.path.splitext(path)[-1][1:]
                else:
                    ext = "xlsx"

                try:
                    # Config option may itself be "auto", in which case the
                    # default engine for this extension is looked up.
                    engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
                    if engine == "auto":
                        engine = get_default_engine(ext, mode="writer")
                except KeyError as err:
                    raise ValueError(f"No engine for filetype: '{ext}'") from err

            # for mypy
            assert engine is not None
            # error: Incompatible types in assignment (expression has type
            # "type[ExcelWriter[Any]]", variable has type "type[Self]")
            cls = get_writer(engine)  # type: ignore[assignment]

        return object.__new__(cls)

    # declare external properties you can count on
    _path = None

    @property
    def supported_extensions(self) -> tuple[str, ...]:
        """Extensions that writer engine supports."""
        return self._supported_extensions

    @property
    def engine(self) -> str:
        """Name of engine."""
        return self._engine

    @property
    def sheets(self) -> dict[str, Any]:
        """Mapping of sheet names to sheet objects."""
        raise NotImplementedError

    @property
    def book(self) -> _WorkbookT:
        """
        Book instance. Class type will depend on the engine used.

        This attribute can be used to access engine-specific features.
        """
        raise NotImplementedError

    def _write_cells(
        self,
        cells,
        sheet_name: str | None = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: tuple[int, int] | None = None,
    ) -> None:
        """
        Write given formatted cells into an Excel sheet.

        Parameters
        ----------
        cells : generator
            cell of formatted data to save to Excel sheet
        sheet_name : str, default None
            Name of Excel sheet, if None, then use self.cur_sheet
        startrow : upper left cell row to dump data frame
        startcol : upper left cell column to dump data frame
        freeze_panes: int tuple of length 2
            contains the bottom-most row and right-most column to freeze
        """
        raise NotImplementedError

    def _save(self) -> None:
        """
        Save workbook to disk.
        """
        raise NotImplementedError

    def __init__(
        self,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions | None = None,
        if_sheet_exists: ExcelWriterIfSheetExists | None = None,
        engine_kwargs: dict[str, Any] | None = None,
    ) -> None:
        # validate that this engine can handle the extension
        if isinstance(path, str):
            ext = os.path.splitext(path)[-1]
            self.check_extension(ext)

        # use mode to open the file; Excel files are always binary
        if "b" not in mode:
            mode += "b"
        # use "a" for the user to append data to excel but internally use "r+" to let
        # the excel backend first read the existing file and then write any data to it
        mode = mode.replace("a", "r+")

        if if_sheet_exists not in (None, "error", "new", "replace", "overlay"):
            raise ValueError(
                f"'{if_sheet_exists}' is not valid for if_sheet_exists. "
                "Valid options are 'error', 'new', 'replace' and 'overlay'."
            )
        # if_sheet_exists is meaningful only in append mode ("r+" after the
        # replacement above).
        if if_sheet_exists and "r+" not in mode:
            raise ValueError("if_sheet_exists is only valid in append mode (mode='a')")
        if if_sheet_exists is None:
            if_sheet_exists = "error"
        self._if_sheet_exists = if_sheet_exists

        # cast ExcelWriter to avoid adding 'if self._handles is not None'
        self._handles = IOHandles(
            cast(IO[bytes], path), compression={"compression": None}
        )
        # Wrapping an existing ExcelWriter reuses its handle; anything else
        # (path, URL, buffer) is opened via get_handle.
        if not isinstance(path, ExcelWriter):
            self._handles = get_handle(
                path, mode, storage_options=storage_options, is_text=False
            )
        self._cur_sheet = None

        if date_format is None:
            self._date_format = "YYYY-MM-DD"
        else:
            self._date_format = date_format
        if datetime_format is None:
            self._datetime_format = "YYYY-MM-DD HH:MM:SS"
        else:
            self._datetime_format = datetime_format

        self._mode = mode

    @property
    def date_format(self) -> str:
        """
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
        """
        return self._date_format

    @property
    def datetime_format(self) -> str:
        """
        Format string for datetimes written into Excel files
        (e.g. 'YYYY-MM-DD HH:MM:SS').
        """
        return self._datetime_format

    @property
    def if_sheet_exists(self) -> str:
        """
        How to behave when writing to a sheet that already exists in append mode.
        """
        return self._if_sheet_exists

    def __fspath__(self) -> str:
        # Buffers without a ``name`` attribute yield an empty path string.
        return getattr(self._handles.handle, "name", "")

    def _get_sheet_name(self, sheet_name: str | None) -> str:
        """Return ``sheet_name``, falling back to the current sheet."""
        if sheet_name is None:
            sheet_name = self._cur_sheet
        if sheet_name is None:  # pragma: no cover
            raise ValueError("Must pass explicit sheet_name or set _cur_sheet property")
        return sheet_name

    def _value_with_fmt(
        self, val
    ) -> tuple[
        int | float | bool | str | datetime.datetime | datetime.date, str | None
    ]:
        """
        Convert numpy types to Python types for the Excel writers.

        Parameters
        ----------
        val : object
            Value to be written into cells

        Returns
        -------
        Tuple with the first element being the converted value and the second
        being an optional format
        """
        fmt = None

        if is_integer(val):
            val = int(val)
        elif is_float(val):
            val = float(val)
        elif is_bool(val):
            val = bool(val)
        elif isinstance(val, datetime.datetime):
            fmt = self._datetime_format
        elif isinstance(val, datetime.date):
            fmt = self._date_format
        elif isinstance(val, datetime.timedelta):
            # Excel stores durations as fractional days: 86400 seconds per day.
            val = val.total_seconds() / 86400
            fmt = "0"
        else:
            # Fallback: serialize anything else (lists, dicts, ...) as str.
            val = str(val)

        return val, fmt

    @classmethod
    def check_extension(cls, ext: str) -> Literal[True]:
        """
        checks that path's extension against the Writer's supported
        extensions. If it isn't supported, raises UnsupportedFiletypeError.
        """
        if ext.startswith("."):
            ext = ext[1:]
        # NOTE(review): this is substring containment, so e.g. "xls" would
        # match a writer supporting "xlsx" — presumably intentional; confirm.
        if not any(ext in extension for extension in cls._supported_extensions):
            # NOTE(review): ``cls.engine`` accessed on the class returns the
            # property object, not the engine string, so the message may not
            # render the engine name — ``cls._engine`` looks intended; confirm.
            raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
        return True

    # Allow use as a contextmanager
    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()

    def close(self) -> None:
        """synonym for save, to make it more file-like"""
        self._save()
        self._handles.close()
1359
1360
# Magic numbers identifying old-style .xls containers.
XLS_SIGNATURES = (
    b"\x09\x00\x04\x00\x07\x00\x10\x00",  # BIFF2
    b"\x09\x02\x06\x00\x00\x00\x10\x00",  # BIFF3
    b"\x09\x04\x06\x00\x00\x00\x10\x00",  # BIFF4
    b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",  # Compound File Binary
)
# All modern formats (xlsx/xlsb/ods) are ZIP archives.
ZIP_SIGNATURE = b"PK\x03\x04"
# Longest signature above — how many bytes we need to peek.
PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,)))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    content_or_path: FilePath | ReadBuffer[bytes],
    storage_options: StorageOptions | None = None,
) -> str | None:
    """
    Inspect the path or content of an excel file and get its format.

    Adopted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    content_or_path : str or file-like object
        Path to file or content of file to inspect. May be a URL.
    {storage_options}

    Returns
    -------
    str or None
        Format of file if it can be determined.

    Raises
    ------
    ValueError
        If resulting stream is empty.
    BadZipFile
        If resulting stream does not have an XLS signature and is not a valid zipfile.
    """
    # Raw bytes are wrapped so that get_handle sees a file-like object.
    if isinstance(content_or_path, bytes):
        content_or_path = BytesIO(content_or_path)

    with get_handle(
        content_or_path, "rb", storage_options=storage_options, is_text=False
    ) as handle:
        stream = handle.handle
        stream.seek(0)
        header = stream.read(PEEK_SIZE)
        if header is None:
            raise ValueError("stream is empty")
        assert isinstance(header, bytes)
        # Rewind so the ZIP reader below sees the file from the start.
        stream.seek(0)

        # Old-style binary .xls: any of the legacy signatures.
        if header.startswith(XLS_SIGNATURES):
            return "xls"
        # Not a ZIP archive either -> format cannot be determined.
        if not header.startswith(ZIP_SIGNATURE):
            return None

        with zipfile.ZipFile(stream) as archive:
            # Normalise separators and case: some third-party producers
            # use backslashes and/or lower-case member names.
            members = {
                member.replace("\\", "/").lower() for member in archive.namelist()
            }

        if "xl/workbook.xml" in members:
            return "xlsx"
        if "xl/workbook.bin" in members:
            return "xlsb"
        if "content.xml" in members:
            return "ods"
        # A ZIP archive, but not a recognised spreadsheet layout.
        return "zip"
1433
1434
class ExcelFile:
    """
    Class for parsing tabular Excel sheets into DataFrame objects.

    See read_excel for more documentation.

    Parameters
    ----------
    path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
        A file-like object, xlrd workbook or openpyxl workbook.
        If a string or path object, expected to be a path to a
        .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
    engine : str, default None
        If io is not a buffer or path, this must be set to identify io.
        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine``
        Engine compatibility :

        - ``xlrd`` supports old-style Excel files (.xls).
        - ``openpyxl`` supports newer Excel file formats.
        - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
        - ``pyxlsb`` supports Binary Excel files.
        - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb)
          and OpenDocument (.ods) file formats.

        .. versionchanged:: 1.2.0

           The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
           now only supports old-style ``.xls`` files.
           When ``engine=None``, the following logic will be
           used to determine the engine:

           - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
             then `odf <https://pypi.org/project/odfpy/>`_ will be used.
           - Otherwise if ``path_or_buffer`` is an xls format,
             ``xlrd`` will be used.
           - Otherwise if ``path_or_buffer`` is in xlsb format,
             `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.

           .. versionadded:: 1.3.0

           - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
             then ``openpyxl`` will be used.
           - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.

           .. warning::

            Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
            This is not supported, switch to using ``openpyxl`` instead.
    engine_kwargs : dict, optional
        Arbitrary keyword arguments passed to excel engine.

    Examples
    --------
    >>> file = pd.ExcelFile('myfile.xlsx')  # doctest: +SKIP
    >>> with pd.ExcelFile("myfile.xls") as xls:  # doctest: +SKIP
    ...     df1 = pd.read_excel(xls, "Sheet1")  # doctest: +SKIP
    """

    # Reader classes are imported inside the class body rather than at module
    # top level — presumably to defer/avoid circular imports; TODO confirm.
    from pandas.io.excel._calamine import CalamineReader
    from pandas.io.excel._odfreader import ODFReader
    from pandas.io.excel._openpyxl import OpenpyxlReader
    from pandas.io.excel._pyxlsb import PyxlsbReader
    from pandas.io.excel._xlrd import XlrdReader

    # Engine-name -> reader-class dispatch used by __init__.
    _engines: Mapping[str, Any] = {
        "xlrd": XlrdReader,
        "openpyxl": OpenpyxlReader,
        "odf": ODFReader,
        "pyxlsb": PyxlsbReader,
        "calamine": CalamineReader,
    }

    def __init__(
        self,
        path_or_buffer,
        engine: str | None = None,
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        if engine_kwargs is None:
            engine_kwargs = {}

        # Fail early on an engine name we have no reader for.
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # First argument can also be bytes, so create a buffer
        if isinstance(path_or_buffer, bytes):
            path_or_buffer = BytesIO(path_or_buffer)
            # Deprecated path: still works, but callers should wrap bytes
            # in BytesIO themselves.
            warnings.warn(
                "Passing bytes to 'read_excel' is deprecated and "
                "will be removed in a future version. To read from a "
                "byte string, wrap it in a `BytesIO` object.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if import_optional_dependency("xlrd", errors="ignore") is None:
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = Version(get_version(xlrd))

        if engine is None:
            # Only determine ext if it is needed
            ext: str | None
            # An already-open xlrd Book carries no bytes to sniff; it is,
            # by construction, an xls workbook.
            if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
                ext = "xls"
            else:
                # Sniff the format from the file signature.
                ext = inspect_excel_format(
                    content_or_path=path_or_buffer, storage_options=storage_options
                )
                if ext is None:
                    raise ValueError(
                        "Excel file format cannot be determined, you must specify "
                        "an engine manually."
                    )

            # Config option may itself be "auto", in which case the default
            # reader for this extension is looked up.
            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        # for mypy: engine is resolved by this point
        assert engine is not None
        self.engine = engine
        self.storage_options = storage_options

        # Instantiate the engine-specific reader that does the actual work.
        self._reader = self._engines[engine](
            self._io,
            storage_options=storage_options,
            engine_kwargs=engine_kwargs,
        )

    def __fspath__(self):
        return self._io

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names: SequenceNotStr[Hashable] | range | None = None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        converters=None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: str | dict[Hashable, str] | None = None,
        thousands: str | None = None,
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
        """
        Parse specified sheet(s) into a DataFrame.

        Equivalent to read_excel(ExcelFile, ...)  See the read_excel
        docstring for more info on accepted parameters.

        Returns
        -------
        DataFrame or dict of DataFrames
            DataFrame from the passed in Excel file.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
        >>> df.to_excel('myfile.xlsx')  # doctest: +SKIP
        >>> file = pd.ExcelFile('myfile.xlsx')  # doctest: +SKIP
        >>> file.parse()  # doctest: +SKIP
        """
        # Delegate wholesale to the engine-specific reader.
        return self._reader.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
            **kwds,
        )

    @property
    def book(self):
        """Engine-specific workbook object backing this file."""
        return self._reader.book

    @property
    def sheet_names(self):
        """Sheet names reported by the underlying reader."""
        return self._reader.sheet_names

    def close(self) -> None:
        """close io if necessary"""
        self._reader.close()

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()