1# pyright: reportMissingImports=false
2from __future__ import annotations
3
4from typing import TYPE_CHECKING
5
6from pandas.compat._optional import import_optional_dependency
7from pandas.util._decorators import doc
8
9from pandas.core.shared_docs import _shared_docs
10
11from pandas.io.excel._base import BaseExcelReader
12
13if TYPE_CHECKING:
14 from pyxlsb import Workbook
15
16 from pandas._typing import (
17 FilePath,
18 ReadBuffer,
19 Scalar,
20 StorageOptions,
21 )
22
23
24class PyxlsbReader(BaseExcelReader["Workbook"]):
25 @doc(storage_options=_shared_docs["storage_options"])
26 def __init__(
27 self,
28 filepath_or_buffer: FilePath | ReadBuffer[bytes],
29 storage_options: StorageOptions | None = None,
30 engine_kwargs: dict | None = None,
31 ) -> None:
32 """
33 Reader using pyxlsb engine.
34
35 Parameters
36 ----------
37 filepath_or_buffer : str, path object, or Workbook
38 Object to be parsed.
39 {storage_options}
40 engine_kwargs : dict, optional
41 Arbitrary keyword arguments passed to excel engine.
42 """
43 import_optional_dependency("pyxlsb")
44 # This will call load_workbook on the filepath or buffer
45 # And set the result to the book-attribute
46 super().__init__(
47 filepath_or_buffer,
48 storage_options=storage_options,
49 engine_kwargs=engine_kwargs,
50 )
51
52 @property
53 def _workbook_class(self) -> type[Workbook]:
54 from pyxlsb import Workbook
55
56 return Workbook
57
58 def load_workbook(
59 self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
60 ) -> Workbook:
61 from pyxlsb import open_workbook
62
63 # TODO: hack in buffer capability
64 # This might need some modifications to the Pyxlsb library
65 # Actual work for opening it is in xlsbpackage.py, line 20-ish
66
67 return open_workbook(filepath_or_buffer, **engine_kwargs)
68
69 @property
70 def sheet_names(self) -> list[str]:
71 return self.book.sheets
72
73 def get_sheet_by_name(self, name: str):
74 self.raise_if_bad_sheet_by_name(name)
75 return self.book.get_sheet(name)
76
77 def get_sheet_by_index(self, index: int):
78 self.raise_if_bad_sheet_by_index(index)
79 # pyxlsb sheets are indexed from 1 onwards
80 # There's a fix for this in the source, but the pypi package doesn't have it
81 return self.book.get_sheet(index + 1)
82
83 def _convert_cell(self, cell) -> Scalar:
84 # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
85 # This means that there is no way to read datetime types from an xlsb file yet
86 if cell.v is None:
87 return "" # Prevents non-named columns from not showing up as Unnamed: i
88 if isinstance(cell.v, float):
89 val = int(cell.v)
90 if val == cell.v:
91 return val
92 else:
93 return float(cell.v)
94
95 return cell.v
96
97 def get_sheet_data(
98 self,
99 sheet,
100 file_rows_needed: int | None = None,
101 ) -> list[list[Scalar]]:
102 data: list[list[Scalar]] = []
103 previous_row_number = -1
104 # When sparse=True the rows can have different lengths and empty rows are
105 # not returned. The cells are namedtuples of row, col, value (r, c, v).
106 for row in sheet.rows(sparse=True):
107 row_number = row[0].r
108 converted_row = [self._convert_cell(cell) for cell in row]
109 while converted_row and converted_row[-1] == "":
110 # trim trailing empty elements
111 converted_row.pop()
112 if converted_row:
113 data.extend([[]] * (row_number - previous_row_number - 1))
114 data.append(converted_row)
115 previous_row_number = row_number
116 if file_rows_needed is not None and len(data) >= file_rows_needed:
117 break
118 if data:
119 # extend rows to max_width
120 max_width = max(len(data_row) for data_row in data)
121 if min(len(data_row) for data_row in data) < max_width:
122 empty_cell: list[Scalar] = [""]
123 data = [
124 data_row + (max_width - len(data_row)) * empty_cell
125 for data_row in data
126 ]
127 return data