Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/excel/_base.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

413 statements  

1from __future__ import annotations 

2 

3import abc 

4import datetime 

5from functools import partial 

6from io import BytesIO 

7import os 

8from textwrap import fill 

9from types import TracebackType 

10from typing import ( 

11 IO, 

12 Any, 

13 Callable, 

14 Hashable, 

15 Iterable, 

16 List, 

17 Literal, 

18 Mapping, 

19 Sequence, 

20 Union, 

21 cast, 

22 overload, 

23) 

24import zipfile 

25 

26from pandas._config import config 

27 

28from pandas._libs import lib 

29from pandas._libs.parsers import STR_NA_VALUES 

30from pandas._typing import ( 

31 DtypeArg, 

32 DtypeBackend, 

33 FilePath, 

34 IntStrT, 

35 ReadBuffer, 

36 StorageOptions, 

37 WriteExcelBuffer, 

38) 

39from pandas.compat._optional import ( 

40 get_version, 

41 import_optional_dependency, 

42) 

43from pandas.errors import EmptyDataError 

44from pandas.util._decorators import ( 

45 Appender, 

46 doc, 

47) 

48from pandas.util._validators import check_dtype_backend 

49 

50from pandas.core.dtypes.common import ( 

51 is_bool, 

52 is_float, 

53 is_integer, 

54 is_list_like, 

55) 

56 

57from pandas.core.frame import DataFrame 

58from pandas.core.shared_docs import _shared_docs 

59from pandas.util.version import Version 

60 

61from pandas.io.common import ( 

62 IOHandles, 

63 get_handle, 

64 stringify_path, 

65 validate_header_arg, 

66) 

67from pandas.io.excel._util import ( 

68 fill_mi_header, 

69 get_default_engine, 

70 get_writer, 

71 maybe_convert_usecols, 

72 pop_header_name, 

73) 

74from pandas.io.parsers import TextParser 

75from pandas.io.parsers.readers import validate_integer 

76 

77_read_excel_doc = ( 

78 """ 

79Read an Excel file into a pandas DataFrame. 

80 

81Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions 

82read from a local filesystem or URL. Supports an option to read 

83a single sheet or a list of sheets. 

84 

85Parameters 

86---------- 

87io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object 

88 Any valid string path is acceptable. The string could be a URL. Valid 

89 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

90 expected. A local file could be: ``file://localhost/path/to/table.xlsx``. 

91 

92 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

93 

94 By file-like object, we refer to objects with a ``read()`` method, 

95 such as a file handle (e.g. via builtin ``open`` function) 

96 or ``StringIO``. 

97sheet_name : str, int, list, or None, default 0 

98 Strings are used for sheet names. Integers are used in zero-indexed 

99 sheet positions (chart sheets do not count as a sheet position). 

100 Lists of strings/integers are used to request multiple sheets. 

101 Specify None to get all worksheets. 

102 

103 Available cases: 

104 

105 * Defaults to ``0``: 1st sheet as a `DataFrame` 

106 * ``1``: 2nd sheet as a `DataFrame` 

107 * ``"Sheet1"``: Load sheet with name "Sheet1" 

108 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" 

109 as a dict of `DataFrame` 

110 * None: All worksheets. 

111 

112header : int, list of int, default 0 

113 Row (0-indexed) to use for the column labels of the parsed 

114 DataFrame. If a list of integers is passed those row positions will 

115 be combined into a ``MultiIndex``. Use None if there is no header. 

116names : array-like, default None 

117 List of column names to use. If file contains no header row, 

118 then you should explicitly pass header=None. 

119index_col : int, list of int, default None 

120 Column (0-indexed) to use as the row labels of the DataFrame. 

121 Pass None if there is no such column. If a list is passed, 

122 those columns will be combined into a ``MultiIndex``. If a 

123 subset of data is selected with ``usecols``, index_col 

124 is based on the subset. 

125 

126 Missing values will be forward filled to allow roundtripping with 

127 ``to_excel`` for ``merged_cells=True``. To avoid forward filling the 

128 missing values use ``set_index`` after reading the data instead of 

129 ``index_col``. 

130usecols : str, list-like, or callable, default None 

131 * If None, then parse all columns. 

132 * If str, then indicates comma separated list of Excel column letters 

133 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of 

134 both sides. 

135 * If list of int, then indicates list of column numbers to be parsed 

136 (0-indexed). 

137 * If list of string, then indicates list of column names to be parsed. 

138 * If callable, then evaluate each column name against it and parse the 

139 column if the callable returns ``True``. 

140 

141 Returns a subset of the columns according to behavior above. 

142dtype : Type name or dict of column -> type, default None 

143 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} 

144 Use `object` to preserve data as stored in Excel and not interpret dtype. 

145 If converters are specified, they will be applied INSTEAD 

146 of dtype conversion. 

147engine : str, default None 

148 If io is not a buffer or path, this must be set to identify io. 

149 Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". 

150 Engine compatibility : 

151 

152 - "xlrd" supports old-style Excel files (.xls). 

153 - "openpyxl" supports newer Excel file formats. 

154 - "odf" supports OpenDocument file formats (.odf, .ods, .odt). 

155 - "pyxlsb" supports Binary Excel files. 

156 

157 .. versionchanged:: 1.2.0 

158 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ 

159 now only supports old-style ``.xls`` files. 

160 When ``engine=None``, the following logic will be 

161 used to determine the engine: 

162 

163 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), 

164 then `odf <https://pypi.org/project/odfpy/>`_ will be used. 

165 - Otherwise if ``path_or_buffer`` is an xls format, 

166 ``xlrd`` will be used. 

167 - Otherwise if ``path_or_buffer`` is in xlsb format, 

168 ``pyxlsb`` will be used. 

169 

170 .. versionadded:: 1.3.0 

171 - Otherwise ``openpyxl`` will be used. 

172 

173 .. versionchanged:: 1.3.0 

174 

175converters : dict, default None 

176 Dict of functions for converting values in certain columns. Keys can 

177 either be integers or column labels, values are functions that take one 

178 input argument, the Excel cell content, and return the transformed 

179 content. 

180true_values : list, default None 

181 Values to consider as True. 

182false_values : list, default None 

183 Values to consider as False. 

184skiprows : list-like, int, or callable, optional 

185 Line numbers to skip (0-indexed) or number of lines to skip (int) at the 

186 start of the file. If callable, the callable function will be evaluated 

187 against the row indices, returning True if the row should be skipped and 

188 False otherwise. An example of a valid callable argument would be ``lambda 

189 x: x in [0, 2]``. 

190nrows : int, default None 

191 Number of rows to parse. 

192na_values : scalar, str, list-like, or dict, default None 

193 Additional strings to recognize as NA/NaN. If dict passed, specific 

194 per-column NA values. By default the following values are interpreted 

195 as NaN: '""" 

196 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") 

197 + """'. 

198keep_default_na : bool, default True 

199 Whether or not to include the default NaN values when parsing the data. 

200 Depending on whether `na_values` is passed in, the behavior is as follows: 

201 

202 * If `keep_default_na` is True, and `na_values` are specified, `na_values` 

203 is appended to the default NaN values used for parsing. 

204 * If `keep_default_na` is True, and `na_values` are not specified, only 

205 the default NaN values are used for parsing. 

206 * If `keep_default_na` is False, and `na_values` are specified, only 

207 the NaN values specified `na_values` are used for parsing. 

208 * If `keep_default_na` is False, and `na_values` are not specified, no 

209 strings will be parsed as NaN. 

210 

211 Note that if `na_filter` is passed in as False, the `keep_default_na` and 

212 `na_values` parameters will be ignored. 

213na_filter : bool, default True 

214 Detect missing value markers (empty strings and the value of na_values). In 

215 data without any NAs, passing na_filter=False can improve the performance 

216 of reading a large file. 

217verbose : bool, default False 

218 Indicate number of NA values placed in non-numeric columns. 

219parse_dates : bool, list-like, or dict, default False 

220 The behavior is as follows: 

221 

222 * bool. If True -> try parsing the index. 

223 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 

224 each as a separate date column. 

225 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 

226 a single date column. 

227 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call 

228 result 'foo' 

229 

230 If a column or index contains an unparsable date, the entire column or 

 231 index will be returned unaltered as an object data type. If you don't want to 

232 parse some cells as date just change their type in Excel to "Text". 

233 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. 

234 

235 Note: A fast-path exists for iso8601-formatted dates. 

236date_parser : function, optional 

237 Function to use for converting a sequence of string columns to an array of 

238 datetime instances. The default uses ``dateutil.parser.parser`` to do the 

239 conversion. Pandas will try to call `date_parser` in three different ways, 

240 advancing to the next if an exception occurs: 1) Pass one or more arrays 

241 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the 

242 string values from the columns defined by `parse_dates` into a single array 

243 and pass that; and 3) call `date_parser` once for each row using one or 

244 more strings (corresponding to the columns defined by `parse_dates`) as 

245 arguments. 

246 

247 .. deprecated:: 2.0.0 

248 Use ``date_format`` instead, or read in as ``object`` and then apply 

249 :func:`to_datetime` as-needed. 

250date_format : str or dict of column -> format, default ``None`` 

251 If used in conjunction with ``parse_dates``, will parse dates according to this 

252 format. For anything more complex, 

253 please read in as ``object`` and then apply :func:`to_datetime` as-needed. 

254 

255 .. versionadded:: 2.0.0 

256thousands : str, default None 

257 Thousands separator for parsing string columns to numeric. Note that 

258 this parameter is only necessary for columns stored as TEXT in Excel, 

259 any numeric columns will automatically be parsed, regardless of display 

260 format. 

261decimal : str, default '.' 

262 Character to recognize as decimal point for parsing string columns to numeric. 

263 Note that this parameter is only necessary for columns stored as TEXT in Excel, 

264 any numeric columns will automatically be parsed, regardless of display 

265 format.(e.g. use ',' for European data). 

266 

267 .. versionadded:: 1.4.0 

268 

269comment : str, default None 

270 Comments out remainder of line. Pass a character or characters to this 

271 argument to indicate comments in the input file. Any data between the 

272 comment string and the end of the current line is ignored. 

273skipfooter : int, default 0 

274 Rows at the end to skip (0-indexed). 

275{storage_options} 

276 

277 .. versionadded:: 1.2.0 

278 

279dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames 

280 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy 

281 arrays, nullable dtypes are used for all dtypes that have a nullable 

282 implementation when "numpy_nullable" is set, pyarrow is used for all 

283 dtypes if "pyarrow" is set. 

284 

 285 The dtype_backends are still experimental. 

286 

287 .. versionadded:: 2.0 

288 

289Returns 

290------- 

291DataFrame or dict of DataFrames 

292 DataFrame from the passed in Excel file. See notes in sheet_name 

293 argument for more information on when a dict of DataFrames is returned. 

294 

295See Also 

296-------- 

297DataFrame.to_excel : Write DataFrame to an Excel file. 

298DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. 

299read_csv : Read a comma-separated values (csv) file into DataFrame. 

300read_fwf : Read a table of fixed-width formatted lines into DataFrame. 

301 

302Examples 

303-------- 

304The file can be read using the file name as string or an open file object: 

305 

306>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP 

307 Name Value 

3080 string1 1 

3091 string2 2 

3102 #Comment 3 

311 

312>>> pd.read_excel(open('tmp.xlsx', 'rb'), 

313... sheet_name='Sheet3') # doctest: +SKIP 

314 Unnamed: 0 Name Value 

3150 0 string1 1 

3161 1 string2 2 

3172 2 #Comment 3 

318 

319Index and header can be specified via the `index_col` and `header` arguments 

320 

321>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP 

322 0 1 2 

3230 NaN Name Value 

3241 0.0 string1 1 

3252 1.0 string2 2 

3263 2.0 #Comment 3 

327 

328Column types are inferred but can be explicitly specified 

329 

330>>> pd.read_excel('tmp.xlsx', index_col=0, 

331... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP 

332 Name Value 

3330 string1 1.0 

3341 string2 2.0 

3352 #Comment 3.0 

336 

337True, False, and NA values, and thousands separators have defaults, 

338but can be explicitly specified, too. Supply the values you would like 

339as strings or lists of strings! 

340 

341>>> pd.read_excel('tmp.xlsx', index_col=0, 

342... na_values=['string1', 'string2']) # doctest: +SKIP 

343 Name Value 

3440 NaN 1 

3451 NaN 2 

3462 #Comment 3 

347 

348Comment lines in the excel input file can be skipped using the `comment` kwarg 

349 

350>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP 

351 Name Value 

3520 string1 1.0 

3531 string2 2.0 

3542 None NaN 

355""" 

356) 

357 

358 

@overload
def read_excel(
    io,
    # sheet name is str or int -> DataFrame
    sheet_name: str | int = ...,
    *,
    header: int | Sequence[int] | None = ...,
    names: list[str] | None = ...,
    index_col: int | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    # Typing-only overload: a scalar ``sheet_name`` selects a single sheet,
    # so the return type narrows to a single DataFrame.
    ...

396 

397 

@overload
def read_excel(
    io,
    # sheet name is list or None -> dict[IntStrT, DataFrame]
    sheet_name: list[IntStrT] | None,
    *,
    header: int | Sequence[int] | None = ...,
    names: list[str] | None = ...,
    index_col: int | Sequence[int] | None = ...,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = ...,
    dtype: DtypeArg | None = ...,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
    converters: dict[str, Callable] | dict[int, Callable] | None = ...,
    true_values: Iterable[Hashable] | None = ...,
    false_values: Iterable[Hashable] | None = ...,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    parse_dates: list | dict | bool = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: dict[Hashable, str] | str | None = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    comment: str | None = ...,
    skipfooter: int = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> dict[IntStrT, DataFrame]:
    # Typing-only overload: a list of sheet names (or None for all sheets)
    # yields a mapping of sheet identifier -> DataFrame.
    ...

435 

436 

@doc(storage_options=_shared_docs["storage_options"])
@Appender(_read_excel_doc)
def read_excel(
    io,
    sheet_name: str | int | list[IntStrT] | None = 0,
    *,
    header: int | Sequence[int] | None = 0,
    names: list[str] | None = None,
    index_col: int | Sequence[int] | None = None,
    usecols: int
    | str
    | Sequence[int]
    | Sequence[str]
    | Callable[[str], bool]
    | None = None,
    dtype: DtypeArg | None = None,
    engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None,
    converters: dict[str, Callable] | dict[int, Callable] | None = None,
    true_values: Iterable[Hashable] | None = None,
    false_values: Iterable[Hashable] | None = None,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
    nrows: int | None = None,
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    parse_dates: list | dict | bool = False,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: dict[Hashable, str] | str | None = None,
    thousands: str | None = None,
    decimal: str = ".",
    comment: str | None = None,
    skipfooter: int = 0,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | dict[IntStrT, DataFrame]:
    # Full parameter documentation is appended from ``_read_excel_doc`` via
    # the ``@Appender`` decorator above; this wrapper only handles ExcelFile
    # lifecycle management and delegates the actual parsing to ``io.parse``.
    check_dtype_backend(dtype_backend)

    should_close = False
    if not isinstance(io, ExcelFile):
        # We created the ExcelFile here, so we own it and must close it below.
        should_close = True
        io = ExcelFile(io, storage_options=storage_options, engine=engine)
    elif engine and engine != io.engine:
        # A pre-built ExcelFile already carries its engine; a conflicting
        # explicit ``engine`` argument is a user error.
        raise ValueError(
            "Engine should not be specified when passing "
            "an ExcelFile - ExcelFile already has the engine set"
        )

    try:
        data = io.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            dtype=dtype,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            keep_default_na=keep_default_na,
            na_filter=na_filter,
            verbose=verbose,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            decimal=decimal,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
        )
    finally:
        # make sure to close opened file handles, even if parsing raised
        if should_close:
            io.close()
    return data

516 

517 

class BaseExcelReader(metaclass=abc.ABCMeta):
    """
    Abstract base class for engine-specific Excel readers.

    Concrete subclasses (one per engine: xlrd, openpyxl, odf, pyxlsb)
    implement workbook loading and per-sheet cell extraction; this base
    class owns the IO handle lifecycle and the engine-independent parsing
    logic in :meth:`parse`.
    """

    def __init__(
        self, filepath_or_buffer, storage_options: StorageOptions = None
    ) -> None:
        # First argument can also be bytes, so create a buffer
        if isinstance(filepath_or_buffer, bytes):
            filepath_or_buffer = BytesIO(filepath_or_buffer)

        # Default: wrap the caller-supplied object without opening anything;
        # replaced below if we need to open a real handle ourselves.
        self.handles = IOHandles(
            handle=filepath_or_buffer, compression={"method": None}
        )
        if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            self.handles = get_handle(
                filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
            )

        if isinstance(self.handles.handle, self._workbook_class):
            # Already an engine workbook object; use it directly.
            self.book = self.handles.handle
        elif hasattr(self.handles.handle, "read"):
            # N.B. xlrd.Book has a read attribute too
            self.handles.handle.seek(0)
            try:
                self.book = self.load_workbook(self.handles.handle)
            except Exception:
                # Don't leak the handle we opened if workbook loading fails.
                self.close()
                raise
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )

    @property
    @abc.abstractmethod
    def _workbook_class(self):
        # The engine's workbook type (e.g. ``openpyxl.Workbook``).
        pass

    @abc.abstractmethod
    def load_workbook(self, filepath_or_buffer):
        # Open the engine workbook from a path or binary buffer.
        pass

    def close(self) -> None:
        """Release the workbook's resources and close any opened handles."""
        if hasattr(self, "book"):
            if hasattr(self.book, "close"):
                # pyxlsb: opens a TemporaryFile
                # openpyxl: https://stackoverflow.com/questions/31416842/
                # openpyxl-does-not-close-excel-workbook-in-read-only-mode
                self.book.close()
            elif hasattr(self.book, "release_resources"):
                # xlrd
                # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548
                self.book.release_resources()
        self.handles.close()

    @property
    @abc.abstractmethod
    def sheet_names(self) -> list[str]:
        # Ordered list of worksheet names in the workbook.
        pass

    @abc.abstractmethod
    def get_sheet_by_name(self, name: str):
        # Return the engine sheet object with the given name.
        pass

    @abc.abstractmethod
    def get_sheet_by_index(self, index: int):
        # Return the engine sheet object at the given zero-based position.
        pass

    @abc.abstractmethod
    def get_sheet_data(self, sheet, rows: int | None = None):
        # Extract cell values from ``sheet`` as a list of rows, reading at
        # most ``rows`` file rows when given.
        pass

    def raise_if_bad_sheet_by_index(self, index: int) -> None:
        """Raise ValueError if ``index`` is out of range for this workbook."""
        n_sheets = len(self.sheet_names)
        if index >= n_sheets:
            raise ValueError(
                f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
            )

    def raise_if_bad_sheet_by_name(self, name: str) -> None:
        """Raise ValueError if no worksheet named ``name`` exists."""
        if name not in self.sheet_names:
            raise ValueError(f"Worksheet named '{name}' not found")

    def _check_skiprows_func(
        self,
        skiprows: Callable,
        rows_to_use: int,
    ) -> int:
        """
        Determine how many file rows are required to obtain `nrows` data
        rows when `skiprows` is a function.

        Parameters
        ----------
        skiprows : function
            The function passed to read_excel by the user.
        rows_to_use : int
            The number of rows that will be needed for the header and
            the data.

        Returns
        -------
        int
        """
        i = 0
        rows_used_so_far = 0
        # Walk forward through row indices, counting only rows the callable
        # does not skip, until enough usable rows are accumulated.
        while rows_used_so_far < rows_to_use:
            if not skiprows(i):
                rows_used_so_far += 1
            i += 1
        return i

    def _calc_rows(
        self,
        header: int | Sequence[int] | None,
        index_col: int | Sequence[int] | None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None,
        nrows: int | None,
    ) -> int | None:
        """
        If nrows specified, find the number of rows needed from the
        file, otherwise return None.


        Parameters
        ----------
        header : int, list of int, or None
            See read_excel docstring.
        index_col : int, list of int, or None
            See read_excel docstring.
        skiprows : list-like, int, callable, or None
            See read_excel docstring.
        nrows : int or None
            See read_excel docstring.

        Returns
        -------
        int or None
        """
        if nrows is None:
            return None
        if header is None:
            header_rows = 1
        elif is_integer(header):
            header = cast(int, header)
            header_rows = 1 + header
        else:
            header = cast(Sequence, header)
            header_rows = 1 + header[-1]
        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        if is_list_like(header) and index_col is not None:
            header = cast(Sequence, header)
            if len(header) > 1:
                header_rows += 1
        if skiprows is None:
            return header_rows + nrows
        if is_integer(skiprows):
            skiprows = cast(int, skiprows)
            return header_rows + nrows + skiprows
        if is_list_like(skiprows):

            def f(skiprows: Sequence, x: int) -> bool:
                # Treat the list-like as a membership-test skip function.
                return x in skiprows

            skiprows = cast(Sequence, skiprows)
            return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
        if callable(skiprows):
            return self._check_skiprows_func(
                skiprows,
                header_rows + nrows,
            )
        # else unexpected skiprows type: read_excel will not optimize
        # the number of rows read from file
        return None

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names=None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        dtype: DtypeArg | None = None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        verbose: bool = False,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: dict[Hashable, str] | str | None = None,
        thousands: str | None = None,
        decimal: str = ".",
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ):
        """
        Parse one or more worksheets into DataFrames.

        Returns a single DataFrame when ``sheet_name`` is a scalar, or a
        dict mapping sheet identifier -> DataFrame when it is a list or
        None.  Remaining keyword arguments (``**kwds``) are forwarded to
        ``TextParser``.  See the ``read_excel`` docstring for parameter
        details.
        """
        validate_header_arg(header)
        validate_integer("nrows", nrows)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        sheets: list[int] | list[str]
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            # None means "all worksheets" -> always return a dict.
            sheets = self.sheet_names
            ret_dict = True
        elif isinstance(sheet_name, str):
            sheets = [sheet_name]
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = cast(Union[List[int], List[str]], list(dict.fromkeys(sheets).keys()))

        output = {}

        last_sheetname = None
        for asheetname in sheets:
            last_sheetname = asheetname
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            # Read only as many file rows as needed when nrows is given.
            file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
            data = self.get_sheet_data(sheet, file_rows_needed)
            if hasattr(sheet, "close"):
                # pyxlsb opens two TemporaryFiles
                sheet.close()
            usecols = maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            is_list_header = False
            is_len_one_list_header = False
            if is_list_like(header):
                assert isinstance(header, Sequence)
                is_list_header = True
                if len(header) == 1:
                    is_len_one_list_header = True

            if is_len_one_list_header:
                # A one-element header list behaves like a scalar header.
                header = cast(Sequence[int], header)[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                assert isinstance(header, Sequence)

                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        assert isinstance(skiprows, int)
                        row += skiprows

                    if row > len(data) - 1:
                        raise ValueError(
                            f"header index {row} exceeds maximum index "
                            f"{len(data) - 1} of data.",
                        )

                    data[row], control_row = fill_mi_header(data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            # If there is a MultiIndex header and an index then there is also
            # a row containing just the index name(s)
            has_index_names = False
            if is_list_header and not is_len_one_list_header and index_col is not None:
                index_col_list: Sequence[int]
                if isinstance(index_col, int):
                    index_col_list = [index_col]
                else:
                    assert isinstance(index_col, Sequence)
                    index_col_list = index_col

                # We have to handle mi without names. If any of the entries in the data
                # columns are not empty, this is a regular row
                assert isinstance(header, Sequence)
                if len(header) < len(data):
                    potential_index_names = data[len(header)]
                    potential_data = [
                        x
                        for i, x in enumerate(potential_index_names)
                        if not control_row[i] and i not in index_col_list
                    ]
                    has_index_names = all(x == "" or x is None for x in potential_data)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if header is None:
                    offset = 0
                elif isinstance(header, int):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # GH34673: if MultiIndex names present and not defined in the header,
                # offset needs to be incremented so that forward filling starts
                # from the first MI value instead of the name
                if has_index_names:
                    offset += 1

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    assert isinstance(index_col, Sequence)

                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    skip_blank_lines=False,  # GH 39808
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    date_format=date_format,
                    thousands=thousands,
                    decimal=decimal,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    dtype_backend=dtype_backend,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if header_names:
                    output[asheetname].columns = output[asheetname].columns.set_names(
                        header_names
                    )

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

            except Exception as err:
                # Annotate any parsing error with the sheet it came from.
                err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
                raise err

        if last_sheetname is None:
            raise ValueError("Sheet name is an empty list")

        if ret_dict:
            return output
        else:
            return output[last_sheetname]

898 

899 

@doc(storage_options=_shared_docs["storage_options"])
class ExcelWriter(metaclass=abc.ABCMeta):
    """
    Class for writing DataFrame objects into excel sheets.

    Default is to use:

    * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter
      is installed otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__
    * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files

    See ``DataFrame.to_excel`` for typical usage.

    The writer should be used as a context manager. Otherwise, call `close()` to save
    and close any opened file handles.

    Parameters
    ----------
    path : str or typing.BinaryIO
        Path to xls or xlsx or ods file.
    engine : str (optional)
        Engine to use for writing. If None, defaults to
        ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
        argument.
    date_format : str, default None
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
    datetime_format : str, default None
        Format string for datetime objects written into Excel files.
        (e.g. 'YYYY-MM-DD HH:MM:SS').
    mode : {{'w', 'a'}}, default 'w'
        File mode to use (write or append). Append does not work with fsspec URLs.
    {storage_options}

        .. versionadded:: 1.2.0

    if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error'
        How to behave when trying to write to a sheet that already
        exists (append mode only).

        * error: raise a ValueError.
        * new: Create a new sheet, with a name determined by the engine.
        * replace: Delete the contents of the sheet before writing to it.
        * overlay: Write contents to the existing sheet without removing the old
          contents.

        .. versionadded:: 1.3.0

        .. versionchanged:: 1.4.0

           Added ``overlay`` option

    engine_kwargs : dict, optional
        Keyword arguments to be passed into the engine. These will be passed to
        the following functions of the respective engines:

        * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)``
        * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)``
        * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)``
        * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)``

        .. versionadded:: 1.3.0

    Notes
    -----
    For compatibility with CSV writers, ExcelWriter serializes lists
    and dicts to strings before writing.

    Examples
    --------
    Default usage:

    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    To write to separate sheets in a single file:

    >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"])  # doctest: +SKIP
    >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
    ...     df1.to_excel(writer, sheet_name="Sheet1")  # doctest: +SKIP
    ...     df2.to_excel(writer, sheet_name="Sheet2")  # doctest: +SKIP

    You can set the date format or datetime format:

    >>> from datetime import date, datetime  # doctest: +SKIP
    >>> df = pd.DataFrame(
    ...     [
    ...         [date(2014, 1, 31), date(1999, 9, 24)],
    ...         [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)],
    ...     ],
    ...     index=["Date", "Datetime"],
    ...     columns=["X", "Y"],
    ... )  # doctest: +SKIP
    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     date_format="YYYY-MM-DD",
    ...     datetime_format="YYYY-MM-DD HH:MM:SS"
    ... ) as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    You can also append to an existing Excel file:

    >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer:
    ...     df.to_excel(writer, sheet_name="Sheet3")  # doctest: +SKIP

    Here, the `if_sheet_exists` parameter can be set to replace a sheet if it
    already exists:

    >>> with ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     mode="a",
    ...     engine="openpyxl",
    ...     if_sheet_exists="replace",
    ... ) as writer:
    ...     df.to_excel(writer, sheet_name="Sheet1")  # doctest: +SKIP

    You can also write multiple DataFrames to a single sheet. Note that the
    ``if_sheet_exists`` parameter needs to be set to ``overlay``:

    >>> with ExcelWriter("path_to_file.xlsx",
    ...     mode="a",
    ...     engine="openpyxl",
    ...     if_sheet_exists="overlay",
    ... ) as writer:
    ...     df1.to_excel(writer, sheet_name="Sheet1")
    ...     df2.to_excel(writer, sheet_name="Sheet1", startcol=3)  # doctest: +SKIP

    You can store Excel file in RAM:

    >>> import io
    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
    >>> buffer = io.BytesIO()
    >>> with pd.ExcelWriter(buffer) as writer:
    ...     df.to_excel(writer)

    You can pack Excel file into zip archive:

    >>> import zipfile  # doctest: +SKIP
    >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])  # doctest: +SKIP
    >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf:
    ...     with zf.open("filename.xlsx", "w") as buffer:
    ...         with pd.ExcelWriter(buffer) as writer:
    ...             df.to_excel(writer)  # doctest: +SKIP

    You can specify additional arguments to the underlying engine:

    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     engine="xlsxwriter",
    ...     engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}}
    ... ) as writer:
    ...     df.to_excel(writer)  # doctest: +SKIP

    In append mode, ``engine_kwargs`` are passed through to
    openpyxl's ``load_workbook``:

    >>> with pd.ExcelWriter(
    ...     "path_to_file.xlsx",
    ...     engine="openpyxl",
    ...     mode="a",
    ...     engine_kwargs={{"keep_vba": True}}
    ... ) as writer:
    ...     df.to_excel(writer, sheet_name="Sheet2")  # doctest: +SKIP
    """

    # Defining an ExcelWriter implementation (see abstract methods for more...)

    # - Mandatory
    #   - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
    #     --> called to write additional DataFrames to disk
    #   - ``_supported_extensions`` (tuple of supported extensions), used to
    #     check that engine supports the given extension.
    #   - ``_engine`` - string that gives the engine name. Necessary to
    #     instantiate class directly and bypass ``ExcelWriterMeta`` engine
    #     lookup.
    #   - ``save(self)`` --> called to save file to disk
    # - Mostly mandatory (i.e. should at least exist)
    #   - book, cur_sheet, path

    # - Optional:
    #   - ``__init__(self, path, engine=None, **kwargs)`` --> always called
    #     with path as first argument.

    # You also need to register the class with ``register_writer()``.
    # Technically, ExcelWriter implementations don't need to subclass
    # ExcelWriter.

    # Name of the engine (set by concrete subclasses).
    _engine: str
    # File extensions (without dot) that the concrete engine can write.
    _supported_extensions: tuple[str, ...]

    def __new__(
        cls: type[ExcelWriter],
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions = None,
        if_sheet_exists: Literal["error", "new", "replace", "overlay"] | None = None,
        engine_kwargs: dict | None = None,
    ) -> ExcelWriter:
        # Dispatch to the concrete engine class when the user instantiated the
        # generic ``ExcelWriter``; direct subclass instantiation is untouched.
        # only switch class if generic(ExcelWriter)
        if cls is ExcelWriter:
            if engine is None or (isinstance(engine, str) and engine == "auto"):
                if isinstance(path, str):
                    ext = os.path.splitext(path)[-1][1:]
                else:
                    # Non-string targets (buffers) default to the xlsx writer.
                    ext = "xlsx"

                try:
                    # Resolve the configured writer for this extension; "auto"
                    # falls through to the installed-package default.
                    engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
                    if engine == "auto":
                        engine = get_default_engine(ext, mode="writer")
                except KeyError as err:
                    # No ``io.excel.<ext>.writer`` option exists for this extension.
                    raise ValueError(f"No engine for filetype: '{ext}'") from err

            # for mypy
            assert engine is not None
            cls = get_writer(engine)

        return object.__new__(cls)

    # declare external properties you can count on
    _path = None

    @property
    def supported_extensions(self) -> tuple[str, ...]:
        """Extensions that writer engine supports."""
        return self._supported_extensions

    @property
    def engine(self) -> str:
        """Name of engine."""
        return self._engine

    @property
    @abc.abstractmethod
    def sheets(self) -> dict[str, Any]:
        """Mapping of sheet names to sheet objects."""

    @property
    @abc.abstractmethod
    def book(self):
        """
        Book instance. Class type will depend on the engine used.

        This attribute can be used to access engine-specific features.
        """

    @abc.abstractmethod
    def _write_cells(
        self,
        cells,
        sheet_name: str | None = None,
        startrow: int = 0,
        startcol: int = 0,
        freeze_panes: tuple[int, int] | None = None,
    ) -> None:
        """
        Write given formatted cells into an Excel sheet.

        Parameters
        ----------
        cells : generator
            cell of formatted data to save to Excel sheet
        sheet_name : str, default None
            Name of Excel sheet, if None, then use self.cur_sheet
        startrow : upper left cell row to dump data frame
        startcol : upper left cell column to dump data frame
        freeze_panes: int tuple of length 2
            contains the bottom-most row and right-most column to freeze
        """

    @abc.abstractmethod
    def _save(self) -> None:
        """
        Save workbook to disk.
        """

    def __init__(
        self,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = None,
        date_format: str | None = None,
        datetime_format: str | None = None,
        mode: str = "w",
        storage_options: StorageOptions = None,
        if_sheet_exists: str | None = None,
        engine_kwargs: dict[str, Any] | None = None,
    ) -> None:
        # validate that this engine can handle the extension
        if isinstance(path, str):
            ext = os.path.splitext(path)[-1]
            self.check_extension(ext)

        # use mode to open the file; Excel output is always binary
        if "b" not in mode:
            mode += "b"
        # use "a" for the user to append data to excel but internally use "r+" to let
        # the excel backend first read the existing file and then write any data to it
        mode = mode.replace("a", "r+")

        if if_sheet_exists not in (None, "error", "new", "replace", "overlay"):
            raise ValueError(
                f"'{if_sheet_exists}' is not valid for if_sheet_exists. "
                "Valid options are 'error', 'new', 'replace' and 'overlay'."
            )
        # if_sheet_exists only makes sense once an existing file is re-read,
        # i.e. in append ("r+") mode.
        if if_sheet_exists and "r+" not in mode:
            raise ValueError("if_sheet_exists is only valid in append mode (mode='a')")
        if if_sheet_exists is None:
            if_sheet_exists = "error"
        self._if_sheet_exists = if_sheet_exists

        # cast ExcelWriter to avoid adding 'if self._handles is not None'
        self._handles = IOHandles(
            cast(IO[bytes], path), compression={"compression": None}
        )
        if not isinstance(path, ExcelWriter):
            # Open a real handle (path, URL or buffer); the placeholder above
            # is kept only when wrapping another ExcelWriter.
            self._handles = get_handle(
                path, mode, storage_options=storage_options, is_text=False
            )
        self._cur_sheet = None

        if date_format is None:
            self._date_format = "YYYY-MM-DD"
        else:
            self._date_format = date_format
        if datetime_format is None:
            self._datetime_format = "YYYY-MM-DD HH:MM:SS"
        else:
            self._datetime_format = datetime_format

        self._mode = mode

    @property
    def date_format(self) -> str:
        """
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
        """
        return self._date_format

    @property
    def datetime_format(self) -> str:
        """
        Format string for datetime objects written into Excel files
        (e.g. 'YYYY-MM-DD HH:MM:SS').
        """
        return self._datetime_format

    @property
    def if_sheet_exists(self) -> str:
        """
        How to behave when writing to a sheet that already exists in append mode.
        """
        return self._if_sheet_exists

    def __fspath__(self) -> str:
        # os.fspath support: buffers without a ``name`` yield "".
        return getattr(self._handles.handle, "name", "")

    def _get_sheet_name(self, sheet_name: str | None) -> str:
        # Resolve an explicit sheet name, falling back to the current sheet.
        if sheet_name is None:
            sheet_name = self._cur_sheet
        if sheet_name is None:  # pragma: no cover
            raise ValueError("Must pass explicit sheet_name or set _cur_sheet property")
        return sheet_name

    def _value_with_fmt(self, val) -> tuple[object, str | None]:
        """
        Convert numpy types to Python types for the Excel writers.

        Parameters
        ----------
        val : object
            Value to be written into cells

        Returns
        -------
        Tuple with the first element being the converted value and the second
        being an optional format
        """
        fmt = None

        if is_integer(val):
            val = int(val)
        elif is_float(val):
            val = float(val)
        elif is_bool(val):
            val = bool(val)
        elif isinstance(val, datetime.datetime):
            fmt = self._datetime_format
        elif isinstance(val, datetime.date):
            fmt = self._date_format
        elif isinstance(val, datetime.timedelta):
            # Excel represents durations as a fraction of a day (86400 s).
            val = val.total_seconds() / 86400
            fmt = "0"
        else:
            # Anything unrecognized is serialized as a string.
            val = str(val)

        return val, fmt

    @classmethod
    def check_extension(cls, ext: str) -> Literal[True]:
        """
        checks that path's extension against the Writer's supported
        extensions. If it isn't supported, raises ValueError.
        """
        if ext.startswith("."):
            ext = ext[1:]
        if not any(ext in extension for extension in cls._supported_extensions):
            raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
        return True

    # Allow use as a contextmanager
    def __enter__(self) -> ExcelWriter:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        # Saving happens on exit, even when an exception was raised inside
        # the ``with`` block.
        self.close()

    def close(self) -> None:
        """synonym for save, to make it more file-like"""
        self._save()
        self._handles.close()

1328 

1329 

# Magic-byte signatures used to sniff legacy ``.xls`` files: the first three
# are BIFF2/3/4 stream headers, the last is the OLE2 / Compound File Binary
# header (used by newer xls variants).
XLS_SIGNATURES = (
    b"\x09\x00\x04\x00\x07\x00\x10\x00",  # BIFF2
    b"\x09\x02\x06\x00\x00\x00\x10\x00",  # BIFF3
    b"\x09\x04\x06\x00\x00\x00\x10\x00",  # BIFF4
    b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",  # Compound File Binary
)
# Zip-container formats (xlsx, xlsb, ods) start with the zip local-file header.
ZIP_SIGNATURE = b"PK\x03\x04"
# Number of leading bytes sufficient to test every signature above.
PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,)))

1338 

1339 

@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    content_or_path: FilePath | ReadBuffer[bytes],
    storage_options: StorageOptions = None,
) -> str | None:
    """
    Inspect the path or content of an excel file and get its format.

    Adopted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    content_or_path : str or file-like object
        Path to file or content of file to inspect. May be a URL.
    {storage_options}

    Returns
    -------
    str or None
        Format of file if it can be determined.

    Raises
    ------
    ValueError
        If resulting stream is empty.
    BadZipFile
        If resulting stream does not have an XLS signature and is not a valid zipfile.
    """
    # Raw bytes are wrapped so get_handle sees a binary stream.
    if isinstance(content_or_path, bytes):
        content_or_path = BytesIO(content_or_path)

    with get_handle(
        content_or_path, "rb", storage_options=storage_options, is_text=False
    ) as handles:
        stream = handles.handle
        stream.seek(0)
        header = stream.read(PEEK_SIZE)
        if header is None:
            raise ValueError("stream is empty")
        assert isinstance(header, bytes)
        # Rewind so the zipfile check below reads from the beginning.
        stream.seek(0)

        # Legacy binary xls: match any of the known signatures.
        for signature in XLS_SIGNATURES:
            if header.startswith(signature):
                return "xls"
        # Not a zip container either -> format unknown.
        if not header.startswith(ZIP_SIGNATURE):
            return None

        with zipfile.ZipFile(stream) as zf:
            # Workaround for some third party files that use forward slashes and
            # lower case names.
            members = {name.replace("\\", "/").lower() for name in zf.namelist()}

            if "xl/workbook.xml" in members:
                return "xlsx"
            if "xl/workbook.bin" in members:
                return "xlsb"
            if "content.xml" in members:
                return "ods"
            # A zip archive, but not one of the recognized spreadsheet layouts.
            return "zip"

1402 

1403 

class ExcelFile:
    """
    Class for parsing tabular Excel sheets into DataFrame objects.

    See read_excel for more documentation.

    Parameters
    ----------
    path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
        A file-like object, xlrd workbook or openpyxl workbook.
        If a string or path object, expected to be a path to a
        .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
    engine : str, default None
        If io is not a buffer or path, this must be set to identify io.
        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
        Engine compatibility :

        - ``xlrd`` supports old-style Excel files (.xls).
        - ``openpyxl`` supports newer Excel file formats.
        - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
        - ``pyxlsb`` supports Binary Excel files.

        .. versionchanged:: 1.2.0

           The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
           now only supports old-style ``.xls`` files.
           When ``engine=None``, the following logic will be
           used to determine the engine:

           - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
             then `odf <https://pypi.org/project/odfpy/>`_ will be used.
           - Otherwise if ``path_or_buffer`` is an xls format,
             ``xlrd`` will be used.
           - Otherwise if ``path_or_buffer`` is in xlsb format,
             `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.

           .. versionadded:: 1.3.0

           - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
             then ``openpyxl`` will be used.
           - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.

           .. warning::

              Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
              This is not supported, switch to using ``openpyxl`` instead.
    """

    # Engine reader implementations, bound at class-definition time.
    # NOTE(review): imports live in the class body rather than at module
    # top-level — presumably to defer loading the engine modules; confirm.
    from pandas.io.excel._odfreader import ODFReader
    from pandas.io.excel._openpyxl import OpenpyxlReader
    from pandas.io.excel._pyxlsb import PyxlsbReader
    from pandas.io.excel._xlrd import XlrdReader

    # Maps engine name -> reader class implementing the parsing backend.
    _engines: Mapping[str, Any] = {
        "xlrd": XlrdReader,
        "openpyxl": OpenpyxlReader,
        "odf": ODFReader,
        "pyxlsb": PyxlsbReader,
    }

    def __init__(
        self,
        path_or_buffer,
        engine: str | None = None,
        storage_options: StorageOptions = None,
    ) -> None:
        # Reject engine names that have no registered reader.
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # First argument can also be bytes, so create a buffer
        if isinstance(path_or_buffer, bytes):
            path_or_buffer = BytesIO(path_or_buffer)

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if import_optional_dependency("xlrd", errors="ignore") is None:
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = Version(get_version(xlrd))

        if engine is None:
            # Only determine ext if it is needed
            ext: str | None
            if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
                # An already-opened xlrd workbook is by definition xls format.
                ext = "xls"
            else:
                # Sniff the format from the stream's magic bytes.
                ext = inspect_excel_format(
                    content_or_path=path_or_buffer, storage_options=storage_options
                )
                if ext is None:
                    raise ValueError(
                        "Excel file format cannot be determined, you must specify "
                        "an engine manually."
                    )

            # Resolve the configured reader for this extension; "auto" falls
            # through to the installed-package default.
            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        # for mypy: every branch above either set engine or raised
        assert engine is not None
        self.engine = engine
        self.storage_options = storage_options

        # Instantiate the engine-specific reader backend.
        self._reader = self._engines[engine](self._io, storage_options=storage_options)

    def __fspath__(self):
        # os.fspath support: return the stringified path.
        return self._io

    def parse(
        self,
        sheet_name: str | int | list[int] | list[str] | None = 0,
        header: int | Sequence[int] | None = 0,
        names=None,
        index_col: int | Sequence[int] | None = None,
        usecols=None,
        converters=None,
        true_values: Iterable[Hashable] | None = None,
        false_values: Iterable[Hashable] | None = None,
        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
        nrows: int | None = None,
        na_values=None,
        parse_dates: list | dict | bool = False,
        date_parser: Callable | lib.NoDefault = lib.no_default,
        date_format: str | dict[Hashable, str] | None = None,
        thousands: str | None = None,
        comment: str | None = None,
        skipfooter: int = 0,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        **kwds,
    ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
        """
        Parse specified sheet(s) into a DataFrame.

        Equivalent to read_excel(ExcelFile, ...)  See the read_excel
        docstring for more info on accepted parameters.

        Returns
        -------
        DataFrame or dict of DataFrames
            DataFrame from the passed in Excel file.
        """
        # Thin delegation: all parsing work happens in the engine reader.
        return self._reader.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            parse_dates=parse_dates,
            date_parser=date_parser,
            date_format=date_format,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            dtype_backend=dtype_backend,
            **kwds,
        )

    @property
    def book(self):
        """Workbook object of the underlying reader (engine-specific type)."""
        return self._reader.book

    @property
    def sheet_names(self):
        """Sheet names reported by the underlying reader."""
        return self._reader.sheet_names

    def close(self) -> None:
        """close io if necessary"""
        self._reader.close()

    def __enter__(self) -> ExcelFile:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        # Always release the reader's file handle on exit.
        self.close()