Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/excel/_pyxlsb.py: 32%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

59 statements  

1# pyright: reportMissingImports=false 

2from __future__ import annotations 

3 

4from typing import TYPE_CHECKING 

5 

6from pandas.compat._optional import import_optional_dependency 

7from pandas.util._decorators import doc 

8 

9from pandas.core.shared_docs import _shared_docs 

10 

11from pandas.io.excel._base import BaseExcelReader 

12 

13if TYPE_CHECKING: 

14 from pyxlsb import Workbook 

15 

16 from pandas._typing import ( 

17 FilePath, 

18 ReadBuffer, 

19 Scalar, 

20 StorageOptions, 

21 ) 

22 

23 

24class PyxlsbReader(BaseExcelReader["Workbook"]): 

25 @doc(storage_options=_shared_docs["storage_options"]) 

26 def __init__( 

27 self, 

28 filepath_or_buffer: FilePath | ReadBuffer[bytes], 

29 storage_options: StorageOptions | None = None, 

30 engine_kwargs: dict | None = None, 

31 ) -> None: 

32 """ 

33 Reader using pyxlsb engine. 

34 

35 Parameters 

36 ---------- 

37 filepath_or_buffer : str, path object, or Workbook 

38 Object to be parsed. 

39 {storage_options} 

40 engine_kwargs : dict, optional 

41 Arbitrary keyword arguments passed to excel engine. 

42 """ 

43 import_optional_dependency("pyxlsb") 

44 # This will call load_workbook on the filepath or buffer 

45 # And set the result to the book-attribute 

46 super().__init__( 

47 filepath_or_buffer, 

48 storage_options=storage_options, 

49 engine_kwargs=engine_kwargs, 

50 ) 

51 

52 @property 

53 def _workbook_class(self) -> type[Workbook]: 

54 from pyxlsb import Workbook 

55 

56 return Workbook 

57 

58 def load_workbook( 

59 self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs 

60 ) -> Workbook: 

61 from pyxlsb import open_workbook 

62 

63 # TODO: hack in buffer capability 

64 # This might need some modifications to the Pyxlsb library 

65 # Actual work for opening it is in xlsbpackage.py, line 20-ish 

66 

67 return open_workbook(filepath_or_buffer, **engine_kwargs) 

68 

69 @property 

70 def sheet_names(self) -> list[str]: 

71 return self.book.sheets 

72 

73 def get_sheet_by_name(self, name: str): 

74 self.raise_if_bad_sheet_by_name(name) 

75 return self.book.get_sheet(name) 

76 

77 def get_sheet_by_index(self, index: int): 

78 self.raise_if_bad_sheet_by_index(index) 

79 # pyxlsb sheets are indexed from 1 onwards 

80 # There's a fix for this in the source, but the pypi package doesn't have it 

81 return self.book.get_sheet(index + 1) 

82 

83 def _convert_cell(self, cell) -> Scalar: 

84 # TODO: there is no way to distinguish between floats and datetimes in pyxlsb 

85 # This means that there is no way to read datetime types from an xlsb file yet 

86 if cell.v is None: 

87 return "" # Prevents non-named columns from not showing up as Unnamed: i 

88 if isinstance(cell.v, float): 

89 val = int(cell.v) 

90 if val == cell.v: 

91 return val 

92 else: 

93 return float(cell.v) 

94 

95 return cell.v 

96 

97 def get_sheet_data( 

98 self, 

99 sheet, 

100 file_rows_needed: int | None = None, 

101 ) -> list[list[Scalar]]: 

102 data: list[list[Scalar]] = [] 

103 previous_row_number = -1 

104 # When sparse=True the rows can have different lengths and empty rows are 

105 # not returned. The cells are namedtuples of row, col, value (r, c, v). 

106 for row in sheet.rows(sparse=True): 

107 row_number = row[0].r 

108 converted_row = [self._convert_cell(cell) for cell in row] 

109 while converted_row and converted_row[-1] == "": 

110 # trim trailing empty elements 

111 converted_row.pop() 

112 if converted_row: 

113 data.extend([[]] * (row_number - previous_row_number - 1)) 

114 data.append(converted_row) 

115 previous_row_number = row_number 

116 if file_rows_needed is not None and len(data) >= file_rows_needed: 

117 break 

118 if data: 

119 # extend rows to max_width 

120 max_width = max(len(data_row) for data_row in data) 

121 if min(len(data_row) for data_row in data) < max_width: 

122 empty_cell: list[Scalar] = [""] 

123 data = [ 

124 data_row + (max_width - len(data_row)) * empty_cell 

125 for data_row in data 

126 ] 

127 return data