Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/excel/_base.py: 25%

420 statements  

1from __future__ import annotations 

2 

3from collections.abc import ( 

4 Hashable, 

5 Iterable, 

6 Mapping, 

7 Sequence, 

8) 

9import datetime 

10from functools import partial 

11from io import BytesIO 

12import os 

13from textwrap import fill 

14from typing import ( 

15 IO, 

16 TYPE_CHECKING, 

17 Any, 

18 Callable, 

19 Generic, 

20 Literal, 

21 TypeVar, 

22 Union, 

23 cast, 

24 overload, 

25) 

26import warnings 

27import zipfile 

28 

29from pandas._config import config 

30 

31from pandas._libs import lib 

32from pandas._libs.parsers import STR_NA_VALUES 

33from pandas.compat._optional import ( 

34 get_version, 

35 import_optional_dependency, 

36) 

37from pandas.errors import EmptyDataError 

38from pandas.util._decorators import ( 

39 Appender, 

40 doc, 

41) 

42from pandas.util._exceptions import find_stack_level 

43from pandas.util._validators import check_dtype_backend 

44 

45from pandas.core.dtypes.common import ( 

46 is_bool, 

47 is_float, 

48 is_integer, 

49 is_list_like, 

50) 

51 

52from pandas.core.frame import DataFrame 

53from pandas.core.shared_docs import _shared_docs 

54from pandas.util.version import Version 

55 

56from pandas.io.common import ( 

57 IOHandles, 

58 get_handle, 

59 stringify_path, 

60 validate_header_arg, 

61) 

62from pandas.io.excel._util import ( 

63 fill_mi_header, 

64 get_default_engine, 

65 get_writer, 

66 maybe_convert_usecols, 

67 pop_header_name, 

68) 

69from pandas.io.parsers import TextParser 

70from pandas.io.parsers.readers import validate_integer 

71 

72if TYPE_CHECKING: 

73 from types import TracebackType 

74 

75 from pandas._typing import ( 

76 DtypeArg, 

77 DtypeBackend, 

78 ExcelWriterIfSheetExists, 

79 FilePath, 

80 IntStrT, 

81 ReadBuffer, 

82 Self, 

83 SequenceNotStr, 

84 StorageOptions, 

85 WriteExcelBuffer, 

86 ) 

87_read_excel_doc = ( 

88 """ 

89Read an Excel file into a ``pandas`` ``DataFrame``. 

90 

91Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions 

92read from a local filesystem or URL. Supports an option to read 

93a single sheet or a list of sheets. 

94 

95Parameters 

96---------- 

97io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object 

98 Any valid string path is acceptable. The string could be a URL. Valid 

99 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

100 expected. A local file could be: ``file://localhost/path/to/table.xlsx``. 

101 

102 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

103 

104 By file-like object, we refer to objects with a ``read()`` method, 

105 such as a file handle (e.g. via builtin ``open`` function) 

106 or ``StringIO``. 

107 

108 .. deprecated:: 2.1.0 

109 Passing byte strings is deprecated. To read from a 

110 byte string, wrap it in a ``BytesIO`` object. 

111sheet_name : str, int, list, or None, default 0 

112 Strings are used for sheet names. Integers are used in zero-indexed 

113 sheet positions (chart sheets do not count as a sheet position). 

114 Lists of strings/integers are used to request multiple sheets. 

115 Specify ``None`` to get all worksheets. 

116 

117 Available cases: 

118 

119 * Defaults to ``0``: 1st sheet as a `DataFrame` 

120 * ``1``: 2nd sheet as a `DataFrame` 

121 * ``"Sheet1"``: Load sheet with name "Sheet1" 

122 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" 

123 as a dict of `DataFrame` 

124 * ``None``: All worksheets. 

125 

126header : int, list of int, default 0 

127 Row (0-indexed) to use for the column labels of the parsed 

128 DataFrame. If a list of integers is passed those row positions will 

129 be combined into a ``MultiIndex``. Use None if there is no header. 

130names : array-like, default None 

131 List of column names to use. If file contains no header row, 

132 then you should explicitly pass header=None. 

133index_col : int, str, list of int, default None 

134 Column (0-indexed) to use as the row labels of the DataFrame. 

135 Pass None if there is no such column. If a list is passed, 

136 those columns will be combined into a ``MultiIndex``. If a 

137 subset of data is selected with ``usecols``, index_col 

138 is based on the subset. 

139 

140 Missing values will be forward filled to allow roundtripping with 

141 ``to_excel`` for ``merged_cells=True``. To avoid forward filling the 

142 missing values use ``set_index`` after reading the data instead of 

143 ``index_col``. 

144usecols : str, list-like, or callable, default None 

145 * If None, then parse all columns. 

146 * If str, then indicates comma separated list of Excel column letters 

147 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of 

148 both sides. 

149 * If list of int, then indicates list of column numbers to be parsed 

150 (0-indexed). 

151 * If list of string, then indicates list of column names to be parsed. 

152 * If callable, then evaluate each column name against it and parse the 

153 column if the callable returns ``True``. 

154 

155 Returns a subset of the columns according to behavior above. 

156dtype : Type name or dict of column -> type, default None 

157 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} 

158 Use ``object`` to preserve data as stored in Excel and not interpret dtype, 

159 which will necessarily result in ``object`` dtype. 

160 If converters are specified, they will be applied INSTEAD 

161 of dtype conversion. 

162 If you use ``None``, it will infer the dtype of each column based on the data. 

163engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None 

164 If io is not a buffer or path, this must be set to identify io. 

165 Engine compatibility : 

166 

167 - ``openpyxl`` supports newer Excel file formats. 

168 - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) 

169 and OpenDocument (.ods) file formats. 

170 - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). 

171 - ``pyxlsb`` supports Binary Excel files. 

172 - ``xlrd`` supports old-style Excel files (.xls). 

173 

174 When ``engine=None``, the following logic will be used to determine the engine: 

175 

176 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), 

177 then `odf <https://pypi.org/project/odfpy/>`_ will be used. 

178 - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. 

179 - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. 

180 - Otherwise ``openpyxl`` will be used. 

181converters : dict, default None 

182 Dict of functions for converting values in certain columns. Keys can 

183 either be integers or column labels, values are functions that take one 

184 input argument, the Excel cell content, and return the transformed 

185 content. 

186true_values : list, default None 

187 Values to consider as True. 

188false_values : list, default None 

189 Values to consider as False. 

190skiprows : list-like, int, or callable, optional 

191 Line numbers to skip (0-indexed) or number of lines to skip (int) at the 

192 start of the file. If callable, the callable function will be evaluated 

193 against the row indices, returning True if the row should be skipped and 

194 False otherwise. An example of a valid callable argument would be ``lambda 

195 x: x in [0, 2]``. 

196nrows : int, default None 

197 Number of rows to parse. 

198na_values : scalar, str, list-like, or dict, default None 

199 Additional strings to recognize as NA/NaN. If dict passed, specific 

200 per-column NA values. By default the following values are interpreted 

201 as NaN: '""" 

202 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") 

203 + """'. 

204keep_default_na : bool, default True 

205 Whether or not to include the default NaN values when parsing the data. 

206 Depending on whether ``na_values`` is passed in, the behavior is as follows: 

207 

208 * If ``keep_default_na`` is True, and ``na_values`` are specified, 

209 ``na_values`` is appended to the default NaN values used for parsing. 

210 * If ``keep_default_na`` is True, and ``na_values`` are not specified, only 

211 the default NaN values are used for parsing. 

212 * If ``keep_default_na`` is False, and ``na_values`` are specified, only 

213 the NaN values specified in ``na_values`` are used for parsing. 

214 * If ``keep_default_na`` is False, and ``na_values`` are not specified, no 

215 strings will be parsed as NaN. 

216 

217 Note that if `na_filter` is passed in as False, the ``keep_default_na`` and 

218 ``na_values`` parameters will be ignored. 

219na_filter : bool, default True 

220 Detect missing value markers (empty strings and the value of na_values). In 

221 data without any NAs, passing ``na_filter=False`` can improve the 

222 performance of reading a large file. 

223verbose : bool, default False 

224 Indicate number of NA values placed in non-numeric columns. 

225parse_dates : bool, list-like, or dict, default False 

226 The behavior is as follows: 

227 

228 * ``bool``. If True -> try parsing the index. 

229 * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 

230 each as a separate date column. 

231 * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 

232 a single date column. 

233 * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call 

234 result 'foo' 

235 

236 If a column or index contains an unparsable date, the entire column or 

237 index will be returned unaltered as an object data type. If you don't want to 

238 parse some cells as date just change their type in Excel to "Text". 

239 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. 

240 

241 Note: A fast-path exists for iso8601-formatted dates. 

242date_parser : function, optional 

243 Function to use for converting a sequence of string columns to an array of 

244 datetime instances. The default uses ``dateutil.parser.parser`` to do the 

245 conversion. Pandas will try to call `date_parser` in three different ways, 

246 advancing to the next if an exception occurs: 1) Pass one or more arrays 

247 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the 

248 string values from the columns defined by `parse_dates` into a single array 

249 and pass that; and 3) call `date_parser` once for each row using one or 

250 more strings (corresponding to the columns defined by `parse_dates`) as 

251 arguments. 

252 

253 .. deprecated:: 2.0.0 

254 Use ``date_format`` instead, or read in as ``object`` and then apply 

255 :func:`to_datetime` as-needed. 

256date_format : str or dict of column -> format, default ``None`` 

257 If used in conjunction with ``parse_dates``, will parse dates according to this 

258 format. For anything more complex, 

259 please read in as ``object`` and then apply :func:`to_datetime` as-needed. 

260 

261 .. versionadded:: 2.0.0 

262thousands : str, default None 

263 Thousands separator for parsing string columns to numeric. Note that 

264 this parameter is only necessary for columns stored as TEXT in Excel, 

265 any numeric columns will automatically be parsed, regardless of display 

266 format. 

267decimal : str, default '.' 

268 Character to recognize as decimal point for parsing string columns to numeric. 

269 Note that this parameter is only necessary for columns stored as TEXT in Excel, 

270 any numeric columns will automatically be parsed, regardless of display 

271 format (e.g. use ',' for European data). 

272 

273 .. versionadded:: 1.4.0 

274 

275comment : str, default None 

276 Comments out remainder of line. Pass a character or characters to this 

277 argument to indicate comments in the input file. Any data between the 

278 comment string and the end of the current line is ignored. 

279skipfooter : int, default 0 

280 Rows at the end to skip (0-indexed). 

281{storage_options} 

282 

283dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' 

284 Back-end data type applied to the resultant :class:`DataFrame` 

285 (still experimental). Behaviour is as follows: 

286 

287 * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` 

288 (default). 

289 * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` 

290 DataFrame. 

291 

292 .. versionadded:: 2.0 

293 

294engine_kwargs : dict, optional 

295 Arbitrary keyword arguments passed to excel engine. 

296 

297Returns 

298------- 

299DataFrame or dict of DataFrames 

300 DataFrame from the passed in Excel file. See notes in sheet_name 

301 argument for more information on when a dict of DataFrames is returned. 

302 

303See Also 

304-------- 

305DataFrame.to_excel : Write DataFrame to an Excel file. 

306DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. 

307read_csv : Read a comma-separated values (csv) file into DataFrame. 

308read_fwf : Read a table of fixed-width formatted lines into DataFrame. 

309 

310Notes 

311----- 

312For specific information on the methods used for each Excel engine, refer to the pandas 

313 :ref:`user guide <io.excel_reader>`. 

314 

315Examples 

316-------- 

317The file can be read using the file name as string or an open file object: 

318 

319>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP 

320 Name Value 

3210 string1 1 

3221 string2 2 

3232 #Comment 3 

324 

325>>> pd.read_excel(open('tmp.xlsx', 'rb'), 

326... sheet_name='Sheet3') # doctest: +SKIP 

327 Unnamed: 0 Name Value 

3280 0 string1 1 

3291 1 string2 2 

3302 2 #Comment 3 

331 

332Index and header can be specified via the `index_col` and `header` arguments 

333 

334>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP 

335 0 1 2 

3360 NaN Name Value 

3371 0.0 string1 1 

3382 1.0 string2 2 

3393 2.0 #Comment 3 

340 

341Column types are inferred but can be explicitly specified 

342 

343>>> pd.read_excel('tmp.xlsx', index_col=0, 

344... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP 

345 Name Value 

3460 string1 1.0 

3471 string2 2.0 

3482 #Comment 3.0 

349 

350True, False, and NA values, and thousands separators have defaults, 

351but can be explicitly specified, too. Supply the values you would like 

352as strings or lists of strings! 

353 

354>>> pd.read_excel('tmp.xlsx', index_col=0, 

355... na_values=['string1', 'string2']) # doctest: +SKIP 

356 Name Value 

3570 NaN 1 

3581 NaN 2 

3592 #Comment 3 

360 

361Comment lines in the excel input file can be skipped using the 

362``comment`` kwarg. 

363 

364>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP 

365 Name Value 

3660 string1 1.0 

3671 string2 2.0 

3682 None NaN 

369""" 

370) 
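# Hedged usage sketch (not part of the original module) for the ``usecols``
# string form documented above; the file name "report.xlsx" is hypothetical:
#
#     >>> pd.read_excel("report.xlsx", usecols="A:C")  # doctest: +SKIP
#
# "A:C" selects the first three Excel columns; ranges are inclusive of both ends.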

371 

372 

373@overload 

374def read_excel( 

375 io, 

376 # sheet name is str or int -> DataFrame 

377 sheet_name: str | int = ..., 

378 *, 

379 header: int | Sequence[int] | None = ..., 

380 names: SequenceNotStr[Hashable] | range | None = ..., 

381 index_col: int | str | Sequence[int] | None = ..., 

382 usecols: int 

383 | str 

384 | Sequence[int] 

385 | Sequence[str] 

386 | Callable[[str], bool] 

387 | None = ..., 

388 dtype: DtypeArg | None = ..., 

389 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., 

390 converters: dict[str, Callable] | dict[int, Callable] | None = ..., 

391 true_values: Iterable[Hashable] | None = ..., 

392 false_values: Iterable[Hashable] | None = ..., 

393 skiprows: Sequence[int] | int | Callable[[int], object] | None = ..., 

394 nrows: int | None = ..., 

395 na_values=..., 

396 keep_default_na: bool = ..., 

397 na_filter: bool = ..., 

398 verbose: bool = ..., 

399 parse_dates: list | dict | bool = ..., 

400 date_parser: Callable | lib.NoDefault = ..., 

401 date_format: dict[Hashable, str] | str | None = ..., 

402 thousands: str | None = ..., 

403 decimal: str = ..., 

404 comment: str | None = ..., 

405 skipfooter: int = ..., 

406 storage_options: StorageOptions = ..., 

407 dtype_backend: DtypeBackend | lib.NoDefault = ..., 

408) -> DataFrame: 

409 ... 

410 

411 

412@overload 

413def read_excel( 

414 io, 

415 # sheet name is list or None -> dict[IntStrT, DataFrame] 

416 sheet_name: list[IntStrT] | None, 

417 *, 

418 header: int | Sequence[int] | None = ..., 

419 names: SequenceNotStr[Hashable] | range | None = ..., 

420 index_col: int | str | Sequence[int] | None = ..., 

421 usecols: int 

422 | str 

423 | Sequence[int] 

424 | Sequence[str] 

425 | Callable[[str], bool] 

426 | None = ..., 

427 dtype: DtypeArg | None = ..., 

428 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., 

429 converters: dict[str, Callable] | dict[int, Callable] | None = ..., 

430 true_values: Iterable[Hashable] | None = ..., 

431 false_values: Iterable[Hashable] | None = ..., 

432 skiprows: Sequence[int] | int | Callable[[int], object] | None = ..., 

433 nrows: int | None = ..., 

434 na_values=..., 

435 keep_default_na: bool = ..., 

436 na_filter: bool = ..., 

437 verbose: bool = ..., 

438 parse_dates: list | dict | bool = ..., 

439 date_parser: Callable | lib.NoDefault = ..., 

440 date_format: dict[Hashable, str] | str | None = ..., 

441 thousands: str | None = ..., 

442 decimal: str = ..., 

443 comment: str | None = ..., 

444 skipfooter: int = ..., 

445 storage_options: StorageOptions = ..., 

446 dtype_backend: DtypeBackend | lib.NoDefault = ..., 

447) -> dict[IntStrT, DataFrame]: 

448 ... 

449 

450 

451@doc(storage_options=_shared_docs["storage_options"]) 

452@Appender(_read_excel_doc) 

453def read_excel( 

454 io, 

455 sheet_name: str | int | list[IntStrT] | None = 0, 

456 *, 

457 header: int | Sequence[int] | None = 0, 

458 names: SequenceNotStr[Hashable] | range | None = None, 

459 index_col: int | str | Sequence[int] | None = None, 

460 usecols: int 

461 | str 

462 | Sequence[int] 

463 | Sequence[str] 

464 | Callable[[str], bool] 

465 | None = None, 

466 dtype: DtypeArg | None = None, 

467 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, 

468 converters: dict[str, Callable] | dict[int, Callable] | None = None, 

469 true_values: Iterable[Hashable] | None = None, 

470 false_values: Iterable[Hashable] | None = None, 

471 skiprows: Sequence[int] | int | Callable[[int], object] | None = None, 

472 nrows: int | None = None, 

473 na_values=None, 

474 keep_default_na: bool = True, 

475 na_filter: bool = True, 

476 verbose: bool = False, 

477 parse_dates: list | dict | bool = False, 

478 date_parser: Callable | lib.NoDefault = lib.no_default, 

479 date_format: dict[Hashable, str] | str | None = None, 

480 thousands: str | None = None, 

481 decimal: str = ".", 

482 comment: str | None = None, 

483 skipfooter: int = 0, 

484 storage_options: StorageOptions | None = None, 

485 dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, 

486 engine_kwargs: dict | None = None, 

487) -> DataFrame | dict[IntStrT, DataFrame]: 

488 check_dtype_backend(dtype_backend) 

489 should_close = False 

490 if engine_kwargs is None: 

491 engine_kwargs = {} 

492 

493 if not isinstance(io, ExcelFile): 

494 should_close = True 

495 io = ExcelFile( 

496 io, 

497 storage_options=storage_options, 

498 engine=engine, 

499 engine_kwargs=engine_kwargs, 

500 ) 

501 elif engine and engine != io.engine: 

502 raise ValueError( 

503 "Engine should not be specified when passing " 

504 "an ExcelFile - ExcelFile already has the engine set" 

505 ) 

506 

507 try: 

508 data = io.parse( 

509 sheet_name=sheet_name, 

510 header=header, 

511 names=names, 

512 index_col=index_col, 

513 usecols=usecols, 

514 dtype=dtype, 

515 converters=converters, 

516 true_values=true_values, 

517 false_values=false_values, 

518 skiprows=skiprows, 

519 nrows=nrows, 

520 na_values=na_values, 

521 keep_default_na=keep_default_na, 

522 na_filter=na_filter, 

523 verbose=verbose, 

524 parse_dates=parse_dates, 

525 date_parser=date_parser, 

526 date_format=date_format, 

527 thousands=thousands, 

528 decimal=decimal, 

529 comment=comment, 

530 skipfooter=skipfooter, 

531 dtype_backend=dtype_backend, 

532 ) 

533 finally: 

534 # make sure to close opened file handles 

535 if should_close: 

536 io.close() 

537 return data 
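# Hedged example (not in the original source): as documented above, passing a
# list or ``None`` for ``sheet_name`` returns a dict keyed by sheet name or
# position rather than a single DataFrame. "report.xlsx" is hypothetical:
#
#     >>> frames = pd.read_excel("report.xlsx", sheet_name=None)  # doctest: +SKIP
#     >>> list(frames)  # doctest: +SKIP
#     ['Sheet1', 'Sheet2']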

538 

539 

540_WorkbookT = TypeVar("_WorkbookT") 

541 

542 

543class BaseExcelReader(Generic[_WorkbookT]): 

544 book: _WorkbookT 

545 

546 def __init__( 

547 self, 

548 filepath_or_buffer, 

549 storage_options: StorageOptions | None = None, 

550 engine_kwargs: dict | None = None, 

551 ) -> None: 

552 if engine_kwargs is None: 

553 engine_kwargs = {} 

554 

555 # First argument can also be bytes, so create a buffer 

556 if isinstance(filepath_or_buffer, bytes): 

557 filepath_or_buffer = BytesIO(filepath_or_buffer) 

558 

559 self.handles = IOHandles( 

560 handle=filepath_or_buffer, compression={"method": None} 

561 ) 

562 if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): 

563 self.handles = get_handle( 

564 filepath_or_buffer, "rb", storage_options=storage_options, is_text=False 

565 ) 

566 

567 if isinstance(self.handles.handle, self._workbook_class): 

568 self.book = self.handles.handle 

569 elif hasattr(self.handles.handle, "read"): 

570 # N.B. xlrd.Book has a read attribute too 

571 self.handles.handle.seek(0) 

572 try: 

573 self.book = self.load_workbook(self.handles.handle, engine_kwargs) 

574 except Exception: 

575 self.close() 

576 raise 

577 else: 

578 raise ValueError( 

579 "Must explicitly set engine if not passing in buffer or path for io." 

580 ) 

581 

582 @property 

583 def _workbook_class(self) -> type[_WorkbookT]: 

584 raise NotImplementedError 

585 

586 def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT: 

587 raise NotImplementedError 

588 

589 def close(self) -> None: 

590 if hasattr(self, "book"): 

591 if hasattr(self.book, "close"): 

592 # pyxlsb: opens a TemporaryFile 

593 # openpyxl: https://stackoverflow.com/questions/31416842/ 

594 # openpyxl-does-not-close-excel-workbook-in-read-only-mode 

595 self.book.close() 

596 elif hasattr(self.book, "release_resources"): 

597 # xlrd 

598 # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548 

599 self.book.release_resources() 

600 self.handles.close() 

601 

602 @property 

603 def sheet_names(self) -> list[str]: 

604 raise NotImplementedError 

605 

606 def get_sheet_by_name(self, name: str): 

607 raise NotImplementedError 

608 

609 def get_sheet_by_index(self, index: int): 

610 raise NotImplementedError 

611 

612 def get_sheet_data(self, sheet, rows: int | None = None): 

613 raise NotImplementedError 

614 

615 def raise_if_bad_sheet_by_index(self, index: int) -> None: 

616 n_sheets = len(self.sheet_names) 

617 if index >= n_sheets: 

618 raise ValueError( 

619 f"Worksheet index {index} is invalid, {n_sheets} worksheets found" 

620 ) 

621 

622 def raise_if_bad_sheet_by_name(self, name: str) -> None: 

623 if name not in self.sheet_names: 

624 raise ValueError(f"Worksheet named '{name}' not found") 

625 

626 def _check_skiprows_func( 

627 self, 

628 skiprows: Callable, 

629 rows_to_use: int, 

630 ) -> int: 

631 """ 

632 Determine how many file rows are required to obtain `nrows` data 

633 rows when `skiprows` is a function. 

634 

635 Parameters 

636 ---------- 

637 skiprows : function 

638 The function passed to read_excel by the user. 

639 rows_to_use : int 

640 The number of rows that will be needed for the header and 

641 the data. 

642 

643 Returns 

644 ------- 

645 int 

646 """ 

647 i = 0 

648 rows_used_so_far = 0 

649 while rows_used_so_far < rows_to_use: 

650 if not skiprows(i): 

651 rows_used_so_far += 1 

652 i += 1 

653 return i 
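# Worked example (illustrative only): with ``skiprows=lambda x: x % 2 == 1``
# (skip every odd file row) and ``rows_to_use=3``, the loop keeps rows 0, 2
# and 4 and stops with ``i == 5``, so five file rows must be read:
#
#     >>> reader._check_skiprows_func(lambda x: x % 2 == 1, 3)  # doctest: +SKIP
#     5
#
# (``reader`` is any concrete BaseExcelReader instance; hypothetical here.)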

654 

655 def _calc_rows( 

656 self, 

657 header: int | Sequence[int] | None, 

658 index_col: int | Sequence[int] | None, 

659 skiprows: Sequence[int] | int | Callable[[int], object] | None, 

660 nrows: int | None, 

661 ) -> int | None: 

662 """ 

663 If nrows specified, find the number of rows needed from the 

664 file, otherwise return None. 

665 

666 

667 Parameters 

668 ---------- 

669 header : int, list of int, or None 

670 See read_excel docstring. 

671 index_col : int, str, list of int, or None 

672 See read_excel docstring. 

673 skiprows : list-like, int, callable, or None 

674 See read_excel docstring. 

675 nrows : int or None 

676 See read_excel docstring. 

677 

678 Returns 

679 ------- 

680 int or None 

681 """ 

682 if nrows is None: 

683 return None 

684 if header is None: 

685 header_rows = 1 

686 elif is_integer(header): 

687 header = cast(int, header) 

688 header_rows = 1 + header 

689 else: 

690 header = cast(Sequence, header) 

691 header_rows = 1 + header[-1] 

692 # If there is a MultiIndex header and an index then there is also 

693 # a row containing just the index name(s) 

694 if is_list_like(header) and index_col is not None: 

695 header = cast(Sequence, header) 

696 if len(header) > 1: 

697 header_rows += 1 

698 if skiprows is None: 

699 return header_rows + nrows 

700 if is_integer(skiprows): 

701 skiprows = cast(int, skiprows) 

702 return header_rows + nrows + skiprows 

703 if is_list_like(skiprows): 

704 

705 def f(skiprows: Sequence, x: int) -> bool: 

706 return x in skiprows 

707 

708 skiprows = cast(Sequence, skiprows) 

709 return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows) 

710 if callable(skiprows): 

711 return self._check_skiprows_func( 

712 skiprows, 

713 header_rows + nrows, 

714 ) 

715 # else unexpected skiprows type: read_excel will not optimize 

716 # the number of rows read from file 

717 return None 
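# Worked example (illustrative only): ``header=[0, 1]``, ``index_col=0``,
# ``skiprows=None`` and ``nrows=10`` give ``header_rows = 1 + header[-1] = 2``,
# plus one extra row for the index names that follow a MultiIndex header,
# so ``3 + 10 = 13`` file rows are requested from the reader.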

718 

719 def parse( 

720 self, 

721 sheet_name: str | int | list[int] | list[str] | None = 0, 

722 header: int | Sequence[int] | None = 0, 

723 names: SequenceNotStr[Hashable] | range | None = None, 

724 index_col: int | Sequence[int] | None = None, 

725 usecols=None, 

726 dtype: DtypeArg | None = None, 

727 true_values: Iterable[Hashable] | None = None, 

728 false_values: Iterable[Hashable] | None = None, 

729 skiprows: Sequence[int] | int | Callable[[int], object] | None = None, 

730 nrows: int | None = None, 

731 na_values=None, 

732 verbose: bool = False, 

733 parse_dates: list | dict | bool = False, 

734 date_parser: Callable | lib.NoDefault = lib.no_default, 

735 date_format: dict[Hashable, str] | str | None = None, 

736 thousands: str | None = None, 

737 decimal: str = ".", 

738 comment: str | None = None, 

739 skipfooter: int = 0, 

740 dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, 

741 **kwds, 

742 ): 

743 validate_header_arg(header) 

744 validate_integer("nrows", nrows) 

745 

746 ret_dict = False 

747 

748 # Keep sheetname to maintain backwards compatibility. 

749 sheets: list[int] | list[str] 

750 if isinstance(sheet_name, list): 

751 sheets = sheet_name 

752 ret_dict = True 

753 elif sheet_name is None: 

754 sheets = self.sheet_names 

755 ret_dict = True 

756 elif isinstance(sheet_name, str): 

757 sheets = [sheet_name] 

758 else: 

759 sheets = [sheet_name] 

760 

761 # handle same-type duplicates. 

762 sheets = cast(Union[list[int], list[str]], list(dict.fromkeys(sheets).keys())) 

763 

764 output = {} 

765 

766 last_sheetname = None 

767 for asheetname in sheets: 

768 last_sheetname = asheetname 

769 if verbose: 

770 print(f"Reading sheet {asheetname}") 

771 

772 if isinstance(asheetname, str): 

773 sheet = self.get_sheet_by_name(asheetname) 

774 else: # assume an integer if not a string 

775 sheet = self.get_sheet_by_index(asheetname) 

776 

777 file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows) 

778 data = self.get_sheet_data(sheet, file_rows_needed) 

779 if hasattr(sheet, "close"): 

780 # pyxlsb opens two TemporaryFiles 

781 sheet.close() 

782 usecols = maybe_convert_usecols(usecols) 

783 

784 if not data: 

785 output[asheetname] = DataFrame() 

786 continue 

787 

788 is_list_header = False 

789 is_len_one_list_header = False 

790 if is_list_like(header): 

791 assert isinstance(header, Sequence) 

792 is_list_header = True 

793 if len(header) == 1: 

794 is_len_one_list_header = True 

795 

796 if is_len_one_list_header: 

797 header = cast(Sequence[int], header)[0] 

798 

799 # forward fill and pull out names for MultiIndex column 

800 header_names = None 

801 if header is not None and is_list_like(header): 

802 assert isinstance(header, Sequence) 

803 

804 header_names = [] 

805 control_row = [True] * len(data[0]) 

806 

807 for row in header: 

808 if is_integer(skiprows): 

809 assert isinstance(skiprows, int) 

810 row += skiprows 

811 

812 if row > len(data) - 1: 

813 raise ValueError( 

814 f"header index {row} exceeds maximum index " 

815 f"{len(data) - 1} of data.", 

816 ) 

817 

818 data[row], control_row = fill_mi_header(data[row], control_row) 

819 

820 if index_col is not None: 

821 header_name, _ = pop_header_name(data[row], index_col) 

822 header_names.append(header_name) 

823 

824 # If there is a MultiIndex header and an index then there is also 

825 # a row containing just the index name(s) 

826 has_index_names = False 

827 if is_list_header and not is_len_one_list_header and index_col is not None: 

828 index_col_list: Sequence[int] 

829 if isinstance(index_col, int): 

830 index_col_list = [index_col] 

831 else: 

832 assert isinstance(index_col, Sequence) 

833 index_col_list = index_col 

834 

835 # We have to handle mi without names. If any of the entries in the data 

836 # columns are not empty, this is a regular row 

837 assert isinstance(header, Sequence) 

838 if len(header) < len(data): 

839 potential_index_names = data[len(header)] 

840 potential_data = [ 

841 x 

842 for i, x in enumerate(potential_index_names) 

843 if not control_row[i] and i not in index_col_list 

844 ] 

845 has_index_names = all(x == "" or x is None for x in potential_data) 

846 

847 if is_list_like(index_col): 

848 # Forward fill values for MultiIndex index. 

849 if header is None: 

850 offset = 0 

851 elif isinstance(header, int): 

852 offset = 1 + header 

853 else: 

854 offset = 1 + max(header) 

855 

856 # GH34673: if MultiIndex names present and not defined in the header, 

857 # offset needs to be incremented so that forward filling starts 

858 # from the first MI value instead of the name 

859 if has_index_names: 

860 offset += 1 

861 

862 # Check if we have an empty dataset 

863 # before trying to collect data. 

864 if offset < len(data): 

865 assert isinstance(index_col, Sequence) 

866 

867 for col in index_col: 

868 last = data[offset][col] 

869 

870 for row in range(offset + 1, len(data)): 

871 if data[row][col] == "" or data[row][col] is None: 

872 data[row][col] = last 

873 else: 

874 last = data[row][col] 

875 

876 # GH 12292 : error when read one empty column from excel file 

877 try: 

878 parser = TextParser( 

879 data, 

880 names=names, 

881 header=header, 

882 index_col=index_col, 

883 has_index_names=has_index_names, 

884 dtype=dtype, 

885 true_values=true_values, 

886 false_values=false_values, 

887 skiprows=skiprows, 

888 nrows=nrows, 

889 na_values=na_values, 

890 skip_blank_lines=False, # GH 39808 

891 parse_dates=parse_dates, 

892 date_parser=date_parser, 

893 date_format=date_format, 

894 thousands=thousands, 

895 decimal=decimal, 

896 comment=comment, 

897 skipfooter=skipfooter, 

898 usecols=usecols, 

899 dtype_backend=dtype_backend, 

900 **kwds, 

901 ) 

902 

903 output[asheetname] = parser.read(nrows=nrows) 

904 

905 if header_names: 

906 output[asheetname].columns = output[asheetname].columns.set_names( 

907 header_names 

908 ) 

909 

910 except EmptyDataError: 

911 # No Data, return an empty DataFrame 

912 output[asheetname] = DataFrame() 

913 

914 except Exception as err: 

915 err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) 

916 raise err 

917 

918 if last_sheetname is None: 

919 raise ValueError("Sheet name is an empty list") 

920 

921 if ret_dict: 

922 return output 

923 else: 

924 return output[last_sheetname] 

925 

926 

927@doc(storage_options=_shared_docs["storage_options"]) 

928class ExcelWriter(Generic[_WorkbookT]): 

929 """ 

930 Class for writing DataFrame objects into Excel sheets. 

931 

932 Default is to use: 

933 

934 * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter 

935 is installed otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__ 

936 * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files 

937 

938 See ``DataFrame.to_excel`` for typical usage. 

939 

940 The writer should be used as a context manager. Otherwise, call `close()` to save 

941 and close any opened file handles. 

942 

943 Parameters 

944 ---------- 

945 path : str or typing.BinaryIO 

946 Path to xls or xlsx or ods file. 

947 engine : str (optional) 

948 Engine to use for writing. If None, defaults to 

949 ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword 

950 argument. 

951 date_format : str, default None 

952 Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). 

953 datetime_format : str, default None 

954 Format string for datetime objects written into Excel files. 

955 (e.g. 'YYYY-MM-DD HH:MM:SS'). 

956 mode : {{'w', 'a'}}, default 'w' 

957 File mode to use (write or append). Append does not work with fsspec URLs. 

958 {storage_options} 

959 

960 if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error' 

961 How to behave when trying to write to a sheet that already 

962 exists (append mode only). 

963 

964 * error: raise a ValueError. 

965 * new: Create a new sheet, with a name determined by the engine. 

966 * replace: Delete the contents of the sheet before writing to it. 

967 * overlay: Write contents to the existing sheet without first removing, 

968 but possibly over top of, the existing contents. 

969 

970 .. versionadded:: 1.3.0 

971 

972 .. versionchanged:: 1.4.0 

973 

974 Added ``overlay`` option 

975 

976 engine_kwargs : dict, optional 

977 Keyword arguments to be passed into the engine. These will be passed to 

978 the following functions of the respective engines: 

979 

980 * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)`` 

981 * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)`` 

982 * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)`` 

983 * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` 

984 

985 .. versionadded:: 1.3.0 

986 

987 Notes 

988 ----- 

989 For compatibility with CSV writers, ExcelWriter serializes lists 

990 and dicts to strings before writing. 

991 

992 Examples 

993 -------- 

994 Default usage: 

995 

996 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP 

997 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer: 

998 ... df.to_excel(writer) # doctest: +SKIP 

999 

1000 To write to separate sheets in a single file: 

1001 

1002 >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) # doctest: +SKIP 

1003 >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP 

1004 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer: 

1005 ... df1.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP 

1006 ... df2.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP 

1007 

1008 You can set the date format or datetime format: 

1009 

1010 >>> from datetime import date, datetime # doctest: +SKIP 

1011 >>> df = pd.DataFrame( 

1012 ... [ 

1013 ... [date(2014, 1, 31), date(1999, 9, 24)], 

1014 ... [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], 

1015 ... ], 

1016 ... index=["Date", "Datetime"], 

1017 ... columns=["X", "Y"], 

1018 ... ) # doctest: +SKIP 

1019 >>> with pd.ExcelWriter( 

1020 ... "path_to_file.xlsx", 

1021 ... date_format="YYYY-MM-DD", 

1022 ... datetime_format="YYYY-MM-DD HH:MM:SS" 

1023 ... ) as writer: 

1024 ... df.to_excel(writer) # doctest: +SKIP 

1025 

1026 You can also append to an existing Excel file: 

1027 

1028 >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer: 

1029 ... df.to_excel(writer, sheet_name="Sheet3") # doctest: +SKIP 

1030 

1031 Here, the `if_sheet_exists` parameter can be set to replace a sheet if it 

1032 already exists: 

1033 

1034 >>> with ExcelWriter( 

1035 ... "path_to_file.xlsx", 

1036 ... mode="a", 

1037 ... engine="openpyxl", 

1038 ... if_sheet_exists="replace", 

1039 ... ) as writer: 

1040 ... df.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP 

1041 

1042 You can also write multiple DataFrames to a single sheet. Note that the 

1043 ``if_sheet_exists`` parameter needs to be set to ``overlay``: 

1044 

1045 >>> with ExcelWriter("path_to_file.xlsx", 

1046 ... mode="a", 

1047 ... engine="openpyxl", 

1048 ... if_sheet_exists="overlay", 

1049 ... ) as writer: 

1050 ... df1.to_excel(writer, sheet_name="Sheet1") 

1051 ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) # doctest: +SKIP 

1052 

1053 You can store the Excel file in RAM: 

1054 

1055 >>> import io 

1056 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) 

1057 >>> buffer = io.BytesIO() 

1058 >>> with pd.ExcelWriter(buffer) as writer: 

1059 ... df.to_excel(writer) 

1060 

1061 You can pack the Excel file into a zip archive: 

1062 

1063 >>> import zipfile # doctest: +SKIP 

1064 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP 

1065 >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf: 

1066 ... with zf.open("filename.xlsx", "w") as buffer: 

1067 ... with pd.ExcelWriter(buffer) as writer: 

1068 ... df.to_excel(writer) # doctest: +SKIP 

1069 

1070 You can specify additional arguments to the underlying engine: 

1071 

1072 >>> with pd.ExcelWriter( 

1073 ... "path_to_file.xlsx", 

1074 ... engine="xlsxwriter", 

1075 ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}} 

1076 ... ) as writer: 

1077 ... df.to_excel(writer) # doctest: +SKIP 

1078 

1079 In append mode, ``engine_kwargs`` are passed through to 

1080 openpyxl's ``load_workbook``: 

1081 

1082 >>> with pd.ExcelWriter( 

1083 ... "path_to_file.xlsx", 

1084 ... engine="openpyxl", 

1085 ... mode="a", 

1086 ... engine_kwargs={{"keep_vba": True}} 

1087 ... ) as writer: 

1088 ... df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP 

1089 """ 

1090 

1091 # Defining an ExcelWriter implementation (see abstract methods for more...) 

1092 

1093 # - Mandatory 

1094 # - ``_write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)`` 

1095 # --> called to write additional DataFrames to disk 

1096 # - ``_supported_extensions`` (tuple of supported extensions), used to 

1097 # check that engine supports the given extension. 

1098 # - ``_engine`` - string that gives the engine name. Necessary to 

1099 # instantiate class directly and bypass ``ExcelWriterMeta`` engine 

1100 # lookup. 

1101 # - ``_save(self)`` --> called to save file to disk 

1102 # - Mostly mandatory (i.e. should at least exist) 

1103 # - book, cur_sheet, path 

1104 

1105 # - Optional: 

1106 # - ``__init__(self, path, engine=None, **kwargs)`` --> always called 

1107 # with path as first argument. 

1108 

1109 # You also need to register the class with ``register_writer()``. 

1110 # Technically, ExcelWriter implementations don't need to subclass 

1111 # ExcelWriter. 
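# A hedged sketch (class and engine names are hypothetical, kept as comments
# only) of the contract described above; a real implementation would also be
# passed to ``register_writer()``:
#
#     class MyWriter(ExcelWriter):
#         _engine = "mywriter"
#         _supported_extensions = (".xlsx",)
#
#         def _write_cells(self, cells, sheet_name=None, startrow=0,
#                          startcol=0, freeze_panes=None) -> None:
#             ...  # translate each formatted cell into the backend's API
#
#         def _save(self) -> None:
#             ...  # flush the workbook to self._handles.handle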

1112 

1113 _engine: str 

1114 _supported_extensions: tuple[str, ...] 

1115 

1116 def __new__( 

1117 cls, 

1118 path: FilePath | WriteExcelBuffer | ExcelWriter, 

1119 engine: str | None = None, 

1120 date_format: str | None = None, 

1121 datetime_format: str | None = None, 

1122 mode: str = "w", 

1123 storage_options: StorageOptions | None = None, 

1124 if_sheet_exists: ExcelWriterIfSheetExists | None = None, 

1125 engine_kwargs: dict | None = None, 

1126 ) -> Self: 

1127 # only switch class if generic(ExcelWriter) 

1128 if cls is ExcelWriter: 

1129 if engine is None or (isinstance(engine, str) and engine == "auto"): 

1130 if isinstance(path, str): 

1131 ext = os.path.splitext(path)[-1][1:] 

1132 else: 

1133 ext = "xlsx" 

1134 

1135 try: 

1136 engine = config.get_option(f"io.excel.{ext}.writer", silent=True) 

1137 if engine == "auto": 

1138 engine = get_default_engine(ext, mode="writer") 

1139 except KeyError as err: 

1140 raise ValueError(f"No engine for filetype: '{ext}'") from err 

1141 

1142 # for mypy 

1143 assert engine is not None 

1144 # error: Incompatible types in assignment (expression has type 

1145 # "type[ExcelWriter[Any]]", variable has type "type[Self]") 

1146 cls = get_writer(engine) # type: ignore[assignment] 

1147 

1148 return object.__new__(cls) 

1149 

1150 # declare external properties you can count on 

1151 _path = None 

1152 

1153 @property 

1154 def supported_extensions(self) -> tuple[str, ...]: 

1155 """Extensions that writer engine supports.""" 

1156 return self._supported_extensions 

1157 

1158 @property 

1159 def engine(self) -> str: 

1160 """Name of engine.""" 

1161 return self._engine 

1162 

1163 @property 

1164 def sheets(self) -> dict[str, Any]: 

1165 """Mapping of sheet names to sheet objects.""" 

1166 raise NotImplementedError 

1167 

1168 @property 

1169 def book(self) -> _WorkbookT: 

1170 """ 

1171 Book instance. Class type will depend on the engine used. 

1172 

1173 This attribute can be used to access engine-specific features. 

1174 """ 

1175 raise NotImplementedError 

1176 

1177 def _write_cells( 

1178 self, 

1179 cells, 

1180 sheet_name: str | None = None, 

1181 startrow: int = 0, 

1182 startcol: int = 0, 

1183 freeze_panes: tuple[int, int] | None = None, 

1184 ) -> None: 

1185 """ 

1186 Write given formatted cells into an Excel sheet. 

1187 

1188 Parameters 

1189 ---------- 

1190 cells : generator 

1191 cells of formatted data to save to the Excel sheet 

1192 sheet_name : str, default None 

1193 Name of Excel sheet, if None, then use self.cur_sheet 

1194 startrow : upper left cell row to dump data frame 

1195 startcol : upper left cell column to dump data frame 

1196 freeze_panes : int tuple of length 2 

1197 contains the bottom-most row and right-most column to freeze 

1198 """ 

1199 raise NotImplementedError 

1200 

1201 def _save(self) -> None: 

1202 """ 

1203 Save workbook to disk. 

1204 """ 

1205 raise NotImplementedError 

1206 

1207 def __init__( 

1208 self, 

1209 path: FilePath | WriteExcelBuffer | ExcelWriter, 

1210 engine: str | None = None, 

1211 date_format: str | None = None, 

1212 datetime_format: str | None = None, 

1213 mode: str = "w", 

1214 storage_options: StorageOptions | None = None, 

1215 if_sheet_exists: ExcelWriterIfSheetExists | None = None, 

1216 engine_kwargs: dict[str, Any] | None = None, 

1217 ) -> None: 

1218 # validate that this engine can handle the extension 

1219 if isinstance(path, str): 

1220 ext = os.path.splitext(path)[-1] 

1221 self.check_extension(ext) 

1222 

1223 # use mode to open the file 

1224 if "b" not in mode: 

1225 mode += "b" 

1226 # use "a" for the user to append data to excel but internally use "r+" to let 

1227 # the excel backend first read the existing file and then write any data to it 

1228 mode = mode.replace("a", "r+") 

1229 

1230 if if_sheet_exists not in (None, "error", "new", "replace", "overlay"): 

1231 raise ValueError( 

1232 f"'{if_sheet_exists}' is not valid for if_sheet_exists. " 

1233 "Valid options are 'error', 'new', 'replace' and 'overlay'." 

1234 ) 

1235 if if_sheet_exists and "r+" not in mode: 

1236 raise ValueError("if_sheet_exists is only valid in append mode (mode='a')") 

1237 if if_sheet_exists is None: 

1238 if_sheet_exists = "error" 

1239 self._if_sheet_exists = if_sheet_exists 

1240 

1241 # cast ExcelWriter to avoid adding 'if self._handles is not None' 

1242 self._handles = IOHandles( 

1243 cast(IO[bytes], path), compression={"compression": None} 

1244 ) 

1245 if not isinstance(path, ExcelWriter): 

1246 self._handles = get_handle( 

1247 path, mode, storage_options=storage_options, is_text=False 

1248 ) 

1249 self._cur_sheet = None 

1250 

1251 if date_format is None: 

1252 self._date_format = "YYYY-MM-DD" 

1253 else: 

1254 self._date_format = date_format 

1255 if datetime_format is None: 

1256 self._datetime_format = "YYYY-MM-DD HH:MM:SS" 

1257 else: 

1258 self._datetime_format = datetime_format 

1259 

1260 self._mode = mode 

1261 

1262 @property 

1263 def date_format(self) -> str: 

1264 """ 

1265 Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). 

1266 """ 

1267 return self._date_format 

1268 

1269 @property 

1270 def datetime_format(self) -> str: 

1271 """ 

1272 Format string for datetimes written into Excel files (e.g. 'YYYY-MM-DD HH:MM:SS'). 

1273 """ 

1274 return self._datetime_format 

1275 

1276 @property 

1277 def if_sheet_exists(self) -> str: 

1278 """ 

1279 How to behave when writing to a sheet that already exists in append mode. 

1280 """ 

1281 return self._if_sheet_exists 

1282 

1283 def __fspath__(self) -> str: 

1284 return getattr(self._handles.handle, "name", "") 

1285 

1286 def _get_sheet_name(self, sheet_name: str | None) -> str: 

1287 if sheet_name is None: 

1288 sheet_name = self._cur_sheet 

1289 if sheet_name is None: # pragma: no cover 

1290 raise ValueError("Must pass explicit sheet_name or set _cur_sheet property") 

1291 return sheet_name 

1292 

1293 def _value_with_fmt( 

1294 self, val 

1295 ) -> tuple[ 

1296 int | float | bool | str | datetime.datetime | datetime.date, str | None 

1297 ]: 

1298 """ 

1299 Convert numpy types to Python types for the Excel writers. 

1300 

1301 Parameters 

1302 ---------- 

1303 val : object 

1304 Value to be written into cells 

1305 

1306 Returns 

1307 ------- 

1308 Tuple with the first element being the converted value and the second 

1309 being an optional format 

1310 """ 

1311 fmt = None 

1312 

1313 if is_integer(val): 

1314 val = int(val) 

1315 elif is_float(val): 

1316 val = float(val) 

1317 elif is_bool(val): 

1318 val = bool(val) 

1319 elif isinstance(val, datetime.datetime): 

1320 fmt = self._datetime_format 

1321 elif isinstance(val, datetime.date): 

1322 fmt = self._date_format 

1323 elif isinstance(val, datetime.timedelta): 

1324 val = val.total_seconds() / 86400 

1325 fmt = "0" 

1326 else: 

1327 val = str(val) 

1328 

1329 return val, fmt 
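# Worked example (illustrative only): a ``datetime.timedelta`` is written as a
# fractional number of days with the "0" number format:
#
#     >>> writer._value_with_fmt(datetime.timedelta(hours=36))  # doctest: +SKIP
#     (1.5, '0')
#
# (``writer`` is any concrete ExcelWriter instance; hypothetical here.)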

1330 

1331 @classmethod 

1332 def check_extension(cls, ext: str) -> Literal[True]: 

1333 """ 

1334 Check the path's extension against the writer's supported 

1335 extensions. If it isn't supported, raise a ``ValueError``. 

1336 """ 

1337 if ext.startswith("."): 

1338 ext = ext[1:] 

1339 if not any(ext in extension for extension in cls._supported_extensions): 

1340 raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'") 

1341 return True 

1342 

1343 # Allow use as a contextmanager 

1344 def __enter__(self) -> Self: 

1345 return self 

1346 

1347 def __exit__( 

1348 self, 

1349 exc_type: type[BaseException] | None, 

1350 exc_value: BaseException | None, 

1351 traceback: TracebackType | None, 

1352 ) -> None: 

1353 self.close() 

1354 

1355 def close(self) -> None: 

1356 """synonym for save, to make it more file-like""" 

1357 self._save() 

1358 self._handles.close() 

1359 

1360 

1361XLS_SIGNATURES = ( 

1362 b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2 

1363 b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3 

1364 b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4 

1365 b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary 

1366) 

1367ZIP_SIGNATURE = b"PK\x03\x04" 

1368PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,))) 

1369 

1370 

1371@doc(storage_options=_shared_docs["storage_options"]) 

1372def inspect_excel_format( 

1373 content_or_path: FilePath | ReadBuffer[bytes], 

1374 storage_options: StorageOptions | None = None, 

1375) -> str | None: 

1376 """ 

1377 Inspect the path or content of an excel file and get its format. 

1378 

1379 Adopted from xlrd: https://github.com/python-excel/xlrd. 

1380 

1381 Parameters 

1382 ---------- 

1383 content_or_path : str or file-like object 

1384 Path to file or content of file to inspect. May be a URL. 

1385 {storage_options} 

1386 

1387 Returns 

1388 ------- 

1389 str or None 

1390 Format of file if it can be determined. 

1391 

1392 Raises 

1393 ------ 

1394 ValueError 

1395 If resulting stream is empty. 

1396 BadZipFile 

1397 If resulting stream does not have an XLS signature and is not a valid zipfile. 

1398 """ 

1399 if isinstance(content_or_path, bytes): 

1400 content_or_path = BytesIO(content_or_path) 

1401 

1402 with get_handle( 

1403 content_or_path, "rb", storage_options=storage_options, is_text=False 

1404 ) as handle: 

1405 stream = handle.handle 

1406 stream.seek(0) 

1407 buf = stream.read(PEEK_SIZE) 

1408 if buf is None: 

1409 raise ValueError("stream is empty") 

1410 assert isinstance(buf, bytes) 

1411 peek = buf 

1412 stream.seek(0) 

1413 

1414 if any(peek.startswith(sig) for sig in XLS_SIGNATURES): 

1415 return "xls" 

1416 elif not peek.startswith(ZIP_SIGNATURE): 

1417 return None 

1418 

1419 with zipfile.ZipFile(stream) as zf: 

1420 # Workaround for some third party files that use forward slashes and 

1421 # lower case names. 

1422 component_names = [ 

1423 name.replace("\\", "/").lower() for name in zf.namelist() 

1424 ] 

1425 

1426 if "xl/workbook.xml" in component_names: 

1427 return "xlsx" 

1428 if "xl/workbook.bin" in component_names: 

1429 return "xlsb" 

1430 if "content.xml" in component_names: 

1431 return "ods" 

1432 return "zip" 
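# Hedged example (illustrative only): because detection is signature-based, an
# in-memory workbook written through ExcelWriter can be inspected directly:
#
#     >>> buf = BytesIO()
#     >>> with ExcelWriter(buf) as writer:  # doctest: +SKIP
#     ...     DataFrame({"a": [1]}).to_excel(writer)
#     >>> inspect_excel_format(buf)  # doctest: +SKIP
#     'xlsx'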

1433 

1434 

1435class ExcelFile: 

1436 """ 

1437 Class for parsing tabular Excel sheets into DataFrame objects. 

1438 

1439 See read_excel for more documentation. 

1440 

1441 Parameters 

1442 ---------- 

1443 path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath), 

1444 a file-like object, xlrd workbook or openpyxl workbook. 

1445 If a string or path object, expected to be a path to a 

1446 .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. 

1447 engine : str, default None 

1448 If io is not a buffer or path, this must be set to identify io. 

1449 Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine`` 

1450 Engine compatibility : 

1451 

1452 - ``xlrd`` supports old-style Excel files (.xls). 

1453 - ``openpyxl`` supports newer Excel file formats. 

1454 - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). 

1455 - ``pyxlsb`` supports Binary Excel files. 

1456 - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) 

1457 and OpenDocument (.ods) file formats. 

1458 

1459 .. versionchanged:: 1.2.0 

1460 

1461 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ 

1462 now only supports old-style ``.xls`` files. 

1463 When ``engine=None``, the following logic will be 

1464 used to determine the engine: 

1465 

1466 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), 

1467 then `odf <https://pypi.org/project/odfpy/>`_ will be used. 

1468 - Otherwise if ``path_or_buffer`` is an xls format, 

1469 ``xlrd`` will be used. 

1470 - Otherwise if ``path_or_buffer`` is in xlsb format, 

1471 `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used. 

1472 

1473 .. versionadded:: 1.3.0 

1474 

1475 - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, 

1476 then ``openpyxl`` will be used. 

1477 - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. 

1478 

1479 .. warning:: 

1480 

1481 Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. 

1482 This is not supported, switch to using ``openpyxl`` instead. 

1483 engine_kwargs : dict, optional 

1484 Arbitrary keyword arguments passed to excel engine. 

1485 

1486 Examples 

1487 -------- 

1488 >>> file = pd.ExcelFile('myfile.xlsx') # doctest: +SKIP 

1489 >>> with pd.ExcelFile("myfile.xls") as xls: # doctest: +SKIP 

1490 ... df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP 

1491 """ 

1492 

1493 from pandas.io.excel._calamine import CalamineReader 

1494 from pandas.io.excel._odfreader import ODFReader 

1495 from pandas.io.excel._openpyxl import OpenpyxlReader 

1496 from pandas.io.excel._pyxlsb import PyxlsbReader 

1497 from pandas.io.excel._xlrd import XlrdReader 

1498 

1499 _engines: Mapping[str, Any] = { 

1500 "xlrd": XlrdReader, 

1501 "openpyxl": OpenpyxlReader, 

1502 "odf": ODFReader, 

1503 "pyxlsb": PyxlsbReader, 

1504 "calamine": CalamineReader, 

1505 } 

1506 

1507 def __init__( 

1508 self, 

1509 path_or_buffer, 

1510 engine: str | None = None, 

1511 storage_options: StorageOptions | None = None, 

1512 engine_kwargs: dict | None = None, 

1513 ) -> None: 

1514 if engine_kwargs is None: 

1515 engine_kwargs = {} 

1516 

1517 if engine is not None and engine not in self._engines: 

1518 raise ValueError(f"Unknown engine: {engine}") 

1519 

1520 # First argument can also be bytes, so create a buffer 

1521 if isinstance(path_or_buffer, bytes): 

1522 path_or_buffer = BytesIO(path_or_buffer) 

1523 warnings.warn( 

1524 "Passing bytes to 'read_excel' is deprecated and " 

1525 "will be removed in a future version. To read from a " 

1526 "byte string, wrap it in a `BytesIO` object.", 

1527 FutureWarning, 

1528 stacklevel=find_stack_level(), 

1529 ) 

1530 

1531 # Could be a str, ExcelFile, Book, etc. 

1532 self.io = path_or_buffer 

1533 # Always a string 

1534 self._io = stringify_path(path_or_buffer) 

1535 

1536 # Determine xlrd version if installed 

1537 if import_optional_dependency("xlrd", errors="ignore") is None: 

1538 xlrd_version = None 

1539 else: 

1540 import xlrd 

1541 

1542 xlrd_version = Version(get_version(xlrd)) 

1543 

1544 if engine is None: 

1545 # Only determine ext if it is needed 

1546 ext: str | None 

1547 if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): 

1548 ext = "xls" 

1549 else: 

1550 ext = inspect_excel_format( 

1551 content_or_path=path_or_buffer, storage_options=storage_options 

1552 ) 

1553 if ext is None: 

1554 raise ValueError( 

1555 "Excel file format cannot be determined, you must specify " 

1556 "an engine manually." 

1557 ) 

1558 

1559 engine = config.get_option(f"io.excel.{ext}.reader", silent=True) 

1560 if engine == "auto": 

1561 engine = get_default_engine(ext, mode="reader") 

1562 

1563 assert engine is not None 

1564 self.engine = engine 

1565 self.storage_options = storage_options 

1566 

1567 self._reader = self._engines[engine]( 

1568 self._io, 

1569 storage_options=storage_options, 

1570 engine_kwargs=engine_kwargs, 

1571 ) 

1572 

1573 def __fspath__(self): 

1574 return self._io 

1575 

1576 def parse( 

1577 self, 

1578 sheet_name: str | int | list[int] | list[str] | None = 0, 

1579 header: int | Sequence[int] | None = 0, 

1580 names: SequenceNotStr[Hashable] | range | None = None, 

1581 index_col: int | Sequence[int] | None = None, 

1582 usecols=None, 

1583 converters=None, 

1584 true_values: Iterable[Hashable] | None = None, 

1585 false_values: Iterable[Hashable] | None = None, 

1586 skiprows: Sequence[int] | int | Callable[[int], object] | None = None, 

1587 nrows: int | None = None, 

1588 na_values=None, 

1589 parse_dates: list | dict | bool = False, 

1590 date_parser: Callable | lib.NoDefault = lib.no_default, 

1591 date_format: str | dict[Hashable, str] | None = None, 

1592 thousands: str | None = None, 

1593 comment: str | None = None, 

1594 skipfooter: int = 0, 

1595 dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, 

1596 **kwds, 

1597 ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: 

1598 """ 

1599 Parse specified sheet(s) into a DataFrame. 

1600 

1601 Equivalent to read_excel(ExcelFile, ...). See the read_excel 

1602 docstring for more info on accepted parameters. 

1603 

1604 Returns 

1605 ------- 

1606 DataFrame or dict of DataFrames 

1607 DataFrame from the passed in Excel file. 

1608 

1609 Examples 

1610 -------- 

1611 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) 

1612 >>> df.to_excel('myfile.xlsx') # doctest: +SKIP 

1613 >>> file = pd.ExcelFile('myfile.xlsx') # doctest: +SKIP 

1614 >>> file.parse() # doctest: +SKIP 

1615 """ 

1616 return self._reader.parse( 

1617 sheet_name=sheet_name, 

1618 header=header, 

1619 names=names, 

1620 index_col=index_col, 

1621 usecols=usecols, 

1622 converters=converters, 

1623 true_values=true_values, 

1624 false_values=false_values, 

1625 skiprows=skiprows, 

1626 nrows=nrows, 

1627 na_values=na_values, 

1628 parse_dates=parse_dates, 

1629 date_parser=date_parser, 

1630 date_format=date_format, 

1631 thousands=thousands, 

1632 comment=comment, 

1633 skipfooter=skipfooter, 

1634 dtype_backend=dtype_backend, 

1635 **kwds, 

1636 ) 

1637 

1638 @property 

1639 def book(self): 

1640 return self._reader.book 

1641 

1642 @property 

1643 def sheet_names(self): 

1644 return self._reader.sheet_names 

1645 

1646 def close(self) -> None: 

1647 """close io if necessary""" 

1648 self._reader.close() 

1649 

1650 def __enter__(self) -> Self: 

1651 return self 

1652 

1653 def __exit__( 

1654 self, 

1655 exc_type: type[BaseException] | None, 

1656 exc_value: BaseException | None, 

1657 traceback: TracebackType | None, 

1658 ) -> None: 

1659 self.close()