1from __future__ import annotations
2
3from datetime import time
4import math
5from typing import TYPE_CHECKING
6
7import numpy as np
8
9from pandas.compat._optional import import_optional_dependency
10from pandas.util._decorators import doc
11
12from pandas.core.shared_docs import _shared_docs
13
14from pandas.io.excel._base import BaseExcelReader
15
16if TYPE_CHECKING:
17 from xlrd import Book
18
19 from pandas._typing import (
20 Scalar,
21 StorageOptions,
22 )
23
24
25class XlrdReader(BaseExcelReader["Book"]):
26 @doc(storage_options=_shared_docs["storage_options"])
27 def __init__(
28 self,
29 filepath_or_buffer,
30 storage_options: StorageOptions | None = None,
31 engine_kwargs: dict | None = None,
32 ) -> None:
33 """
34 Reader using xlrd engine.
35
36 Parameters
37 ----------
38 filepath_or_buffer : str, path object or Workbook
39 Object to be parsed.
40 {storage_options}
41 engine_kwargs : dict, optional
42 Arbitrary keyword arguments passed to excel engine.
43 """
44 err_msg = "Install xlrd >= 2.0.1 for xls Excel support"
45 import_optional_dependency("xlrd", extra=err_msg)
46 super().__init__(
47 filepath_or_buffer,
48 storage_options=storage_options,
49 engine_kwargs=engine_kwargs,
50 )
51
52 @property
53 def _workbook_class(self) -> type[Book]:
54 from xlrd import Book
55
56 return Book
57
58 def load_workbook(self, filepath_or_buffer, engine_kwargs) -> Book:
59 from xlrd import open_workbook
60
61 if hasattr(filepath_or_buffer, "read"):
62 data = filepath_or_buffer.read()
63 return open_workbook(file_contents=data, **engine_kwargs)
64 else:
65 return open_workbook(filepath_or_buffer, **engine_kwargs)
66
67 @property
68 def sheet_names(self):
69 return self.book.sheet_names()
70
71 def get_sheet_by_name(self, name):
72 self.raise_if_bad_sheet_by_name(name)
73 return self.book.sheet_by_name(name)
74
75 def get_sheet_by_index(self, index):
76 self.raise_if_bad_sheet_by_index(index)
77 return self.book.sheet_by_index(index)
78
79 def get_sheet_data(
80 self, sheet, file_rows_needed: int | None = None
81 ) -> list[list[Scalar]]:
82 from xlrd import (
83 XL_CELL_BOOLEAN,
84 XL_CELL_DATE,
85 XL_CELL_ERROR,
86 XL_CELL_NUMBER,
87 xldate,
88 )
89
90 epoch1904 = self.book.datemode
91
92 def _parse_cell(cell_contents, cell_typ):
93 """
94 converts the contents of the cell into a pandas appropriate object
95 """
96 if cell_typ == XL_CELL_DATE:
97 # Use the newer xlrd datetime handling.
98 try:
99 cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904)
100 except OverflowError:
101 return cell_contents
102
103 # Excel doesn't distinguish between dates and time,
104 # so we treat dates on the epoch as times only.
105 # Also, Excel supports 1900 and 1904 epochs.
106 year = (cell_contents.timetuple())[0:3]
107 if (not epoch1904 and year == (1899, 12, 31)) or (
108 epoch1904 and year == (1904, 1, 1)
109 ):
110 cell_contents = time(
111 cell_contents.hour,
112 cell_contents.minute,
113 cell_contents.second,
114 cell_contents.microsecond,
115 )
116
117 elif cell_typ == XL_CELL_ERROR:
118 cell_contents = np.nan
119 elif cell_typ == XL_CELL_BOOLEAN:
120 cell_contents = bool(cell_contents)
121 elif cell_typ == XL_CELL_NUMBER:
122 # GH5394 - Excel 'numbers' are always floats
123 # it's a minimal perf hit and less surprising
124 if math.isfinite(cell_contents):
125 # GH54564 - don't attempt to convert NaN/Inf
126 val = int(cell_contents)
127 if val == cell_contents:
128 cell_contents = val
129 return cell_contents
130
131 data = []
132
133 nrows = sheet.nrows
134 if file_rows_needed is not None:
135 nrows = min(nrows, file_rows_needed)
136 for i in range(nrows):
137 row = [
138 _parse_cell(value, typ)
139 for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
140 ]
141 data.append(row)
142
143 return data