Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/openpyxl/reader/excel.py: 21%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

208 statements  

1# Copyright (c) 2010-2024 openpyxl 

2 

3 

4"""Read an xlsx file into Python""" 

5 

6# Python stdlib imports 

7from zipfile import ZipFile, ZIP_DEFLATED 

8from io import BytesIO 

9import os.path 

10import warnings 

11 

12from openpyxl.pivot.table import TableDefinition 

13 

14# Allow blanket setting of KEEP_VBA for testing 

15try: 

16 from ..tests import KEEP_VBA 

17except ImportError: 

18 KEEP_VBA = False 

19 

20# package imports 

21from openpyxl.utils.exceptions import InvalidFileException 

22from openpyxl.xml.constants import ( 

23 ARC_CORE, 

24 ARC_CUSTOM, 

25 ARC_CONTENT_TYPES, 

26 ARC_WORKBOOK, 

27 ARC_THEME, 

28 COMMENTS_NS, 

29 SHARED_STRINGS, 

30 XLTM, 

31 XLTX, 

32 XLSM, 

33 XLSX, 

34) 

35from openpyxl.cell import MergedCell 

36from openpyxl.comments.comment_sheet import CommentSheet 

37 

38from .strings import read_string_table, read_rich_text 

39from .workbook import WorkbookParser 

40from openpyxl.styles.stylesheet import apply_stylesheet 

41 

42from openpyxl.packaging.core import DocumentProperties 

43from openpyxl.packaging.custom import CustomPropertyList 

44from openpyxl.packaging.manifest import Manifest, Override 

45 

46from openpyxl.packaging.relationship import ( 

47 RelationshipList, 

48 get_dependents, 

49 get_rels_path, 

50) 

51 

52from openpyxl.worksheet._read_only import ReadOnlyWorksheet 

53from openpyxl.worksheet._reader import WorksheetReader 

54from openpyxl.chartsheet import Chartsheet 

55from openpyxl.worksheet.table import Table 

56from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing 

57 

58from openpyxl.xml.functions import fromstring 

59 

60from .drawings import find_images 

61 

62 

63SUPPORTED_FORMATS = ('.xlsx', '.xlsm', '.xltx', '.xltm') 

64 

65 

66def _validate_archive(filename): 

67 """ 

68 Does a first check whether filename is a string or a file-like 

69 object. If it is a string representing a filename, a check is done 

70 for supported formats by checking the given file-extension. If the 

71 file-extension is not in SUPPORTED_FORMATS an InvalidFileException 

72 will raised. Otherwise the filename (resp. file-like object) will 

73 forwarded to zipfile.ZipFile returning a ZipFile-Instance. 

74 """ 

75 is_file_like = hasattr(filename, 'read') 

76 if not is_file_like: 

77 file_format = os.path.splitext(filename)[-1].lower() 

78 if file_format not in SUPPORTED_FORMATS: 

79 if file_format == '.xls': 

80 msg = ('openpyxl does not support the old .xls file format, ' 

81 'please use xlrd to read this file, or convert it to ' 

82 'the more recent .xlsx file format.') 

83 elif file_format == '.xlsb': 

84 msg = ('openpyxl does not support binary format .xlsb, ' 

85 'please convert this file to .xlsx format if you want ' 

86 'to open it with openpyxl') 

87 else: 

88 msg = ('openpyxl does not support %s file format, ' 

89 'please check you can open ' 

90 'it with Excel first. ' 

91 'Supported formats are: %s') % (file_format, 

92 ','.join(SUPPORTED_FORMATS)) 

93 raise InvalidFileException(msg) 

94 

95 archive = ZipFile(filename, 'r') 

96 return archive 

97 

98 

99def _find_workbook_part(package): 

100 workbook_types = [XLTM, XLTX, XLSM, XLSX] 

101 for ct in workbook_types: 

102 part = package.find(ct) 

103 if part: 

104 return part 

105 

106 # some applications reassign the default for application/xml 

107 defaults = {p.ContentType for p in package.Default} 

108 workbook_type = defaults & set(workbook_types) 

109 if workbook_type: 

110 return Override("/" + ARC_WORKBOOK, workbook_type.pop()) 

111 

112 raise IOError("File contains no valid workbook part") 

113 

114 

115class ExcelReader: 

116 

117 """ 

118 Read an Excel package and dispatch the contents to the relevant modules 

119 """ 

120 

121 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA, 

122 data_only=False, keep_links=True, rich_text=False): 

123 self.archive = _validate_archive(fn) 

124 self.valid_files = self.archive.namelist() 

125 self.read_only = read_only 

126 self.keep_vba = keep_vba 

127 self.data_only = data_only 

128 self.keep_links = keep_links 

129 self.rich_text = rich_text 

130 self.shared_strings = [] 

131 

132 

133 def read_manifest(self): 

134 src = self.archive.read(ARC_CONTENT_TYPES) 

135 root = fromstring(src) 

136 self.package = Manifest.from_tree(root) 

137 

138 

139 def read_strings(self): 

140 ct = self.package.find(SHARED_STRINGS) 

141 reader = read_string_table 

142 if self.rich_text: 

143 reader = read_rich_text 

144 if ct is not None: 

145 strings_path = ct.PartName[1:] 

146 with self.archive.open(strings_path,) as src: 

147 self.shared_strings = reader(src) 

148 

149 

150 def read_workbook(self): 

151 wb_part = _find_workbook_part(self.package) 

152 self.parser = WorkbookParser(self.archive, wb_part.PartName[1:], keep_links=self.keep_links) 

153 self.parser.parse() 

154 wb = self.parser.wb 

155 wb._sheets = [] 

156 wb._data_only = self.data_only 

157 wb._read_only = self.read_only 

158 wb.template = wb_part.ContentType in (XLTX, XLTM) 

159 

160 # If are going to preserve the vba then attach a copy of the archive to the 

161 # workbook so that is available for the save. 

162 if self.keep_vba: 

163 wb.vba_archive = ZipFile(BytesIO(), 'a', ZIP_DEFLATED) 

164 for name in self.valid_files: 

165 wb.vba_archive.writestr(name, self.archive.read(name)) 

166 

167 if self.read_only: 

168 wb._archive = self.archive 

169 

170 self.wb = wb 

171 

172 

173 def read_properties(self): 

174 if ARC_CORE in self.valid_files: 

175 src = fromstring(self.archive.read(ARC_CORE)) 

176 self.wb.properties = DocumentProperties.from_tree(src) 

177 

178 

179 def read_custom(self): 

180 if ARC_CUSTOM in self.valid_files: 

181 src = fromstring(self.archive.read(ARC_CUSTOM)) 

182 self.wb.custom_doc_props = CustomPropertyList.from_tree(src) 

183 

184 

185 def read_theme(self): 

186 if ARC_THEME in self.valid_files: 

187 self.wb.loaded_theme = self.archive.read(ARC_THEME) 

188 

189 

190 def read_chartsheet(self, sheet, rel): 

191 sheet_path = rel.target 

192 rels_path = get_rels_path(sheet_path) 

193 rels = [] 

194 if rels_path in self.valid_files: 

195 rels = get_dependents(self.archive, rels_path) 

196 

197 with self.archive.open(sheet_path, "r") as src: 

198 xml = src.read() 

199 node = fromstring(xml) 

200 cs = Chartsheet.from_tree(node) 

201 cs._parent = self.wb 

202 cs.title = sheet.name 

203 self.wb._add_sheet(cs) 

204 

205 drawings = rels.find(SpreadsheetDrawing._rel_type) 

206 for rel in drawings: 

207 charts, images = find_images(self.archive, rel.target) 

208 for c in charts: 

209 cs.add_chart(c) 

210 

211 

212 def read_worksheets(self): 

213 comment_warning = """Cell '{0}':{1} is part of a merged range but has a comment which will be removed because merged cells cannot contain any data.""" 

214 for sheet, rel in self.parser.find_sheets(): 

215 if rel.target not in self.valid_files: 

216 continue 

217 

218 if "chartsheet" in rel.Type: 

219 self.read_chartsheet(sheet, rel) 

220 continue 

221 

222 rels_path = get_rels_path(rel.target) 

223 rels = RelationshipList() 

224 if rels_path in self.valid_files: 

225 rels = get_dependents(self.archive, rels_path) 

226 

227 if self.read_only: 

228 ws = ReadOnlyWorksheet(self.wb, sheet.name, rel.target, self.shared_strings) 

229 ws.sheet_state = sheet.state 

230 self.wb._sheets.append(ws) 

231 continue 

232 else: 

233 fh = self.archive.open(rel.target) 

234 ws = self.wb.create_sheet(sheet.name) 

235 ws._rels = rels 

236 ws_parser = WorksheetReader(ws, fh, self.shared_strings, self.data_only, self.rich_text) 

237 ws_parser.bind_all() 

238 fh.close() 

239 

240 # assign any comments to cells 

241 for r in rels.find(COMMENTS_NS): 

242 src = self.archive.read(r.target) 

243 comment_sheet = CommentSheet.from_tree(fromstring(src)) 

244 for ref, comment in comment_sheet.comments: 

245 try: 

246 ws[ref].comment = comment 

247 except AttributeError: 

248 c = ws[ref] 

249 if isinstance(c, MergedCell): 

250 warnings.warn(comment_warning.format(ws.title, c.coordinate)) 

251 continue 

252 

253 # preserve link to VML file if VBA 

254 if self.wb.vba_archive and ws.legacy_drawing: 

255 ws.legacy_drawing = rels.get(ws.legacy_drawing).target 

256 else: 

257 ws.legacy_drawing = None 

258 

259 for t in ws_parser.tables: 

260 src = self.archive.read(t) 

261 xml = fromstring(src) 

262 table = Table.from_tree(xml) 

263 ws.add_table(table) 

264 

265 drawings = rels.find(SpreadsheetDrawing._rel_type) 

266 for rel in drawings: 

267 charts, images = find_images(self.archive, rel.target) 

268 for c in charts: 

269 ws.add_chart(c, c.anchor) 

270 for im in images: 

271 ws.add_image(im, im.anchor) 

272 

273 pivot_rel = rels.find(TableDefinition.rel_type) 

274 pivot_caches = self.parser.pivot_caches 

275 for r in pivot_rel: 

276 pivot_path = r.Target 

277 src = self.archive.read(pivot_path) 

278 tree = fromstring(src) 

279 pivot = TableDefinition.from_tree(tree) 

280 pivot.cache = pivot_caches[pivot.cacheId] 

281 ws.add_pivot(pivot) 

282 

283 ws.sheet_state = sheet.state 

284 

285 

286 def read(self): 

287 action = "read manifest" 

288 try: 

289 self.read_manifest() 

290 action = "read strings" 

291 self.read_strings() 

292 action = "read workbook" 

293 self.read_workbook() 

294 action = "read properties" 

295 self.read_properties() 

296 action = "read custom properties" 

297 self.read_custom() 

298 action = "read theme" 

299 self.read_theme() 

300 action = "read stylesheet" 

301 apply_stylesheet(self.archive, self.wb) 

302 action = "read worksheets" 

303 self.read_worksheets() 

304 action = "assign names" 

305 self.parser.assign_names() 

306 if not self.read_only: 

307 self.archive.close() 

308 except ValueError as e: 

309 raise ValueError( 

310 f"Unable to read workbook: could not {action} from {self.archive.filename}.\n" 

311 "This is most probably because the workbook source files contain some invalid XML.\n" 

312 "Please see the exception for more details." 

313 ) from e 

314 

315 

316def load_workbook(filename, read_only=False, keep_vba=KEEP_VBA, 

317 data_only=False, keep_links=True, rich_text=False): 

318 """Open the given filename and return the workbook 

319 

320 :param filename: the path to open or a file-like object 

321 :type filename: string or a file-like object open in binary mode c.f., :class:`zipfile.ZipFile` 

322 

323 :param read_only: optimised for reading, content cannot be edited 

324 :type read_only: bool 

325 

326 :param keep_vba: preserve vba content (this does NOT mean you can use it) 

327 :type keep_vba: bool 

328 

329 :param data_only: controls whether cells with formulae have either the formula (default) or the value stored the last time Excel read the sheet 

330 :type data_only: bool 

331 

332 :param keep_links: whether links to external workbooks should be preserved. The default is True 

333 :type keep_links: bool 

334 

335 :param rich_text: if set to True openpyxl will preserve any rich text formatting in cells. The default is False 

336 :type rich_text: bool 

337 

338 :rtype: :class:`openpyxl.workbook.Workbook` 

339 

340 .. note:: 

341 

342 When using lazy load, all worksheets will be :class:`openpyxl.worksheet.iter_worksheet.IterableWorksheet` 

343 and the returned workbook will be read-only. 

344 

345 """ 

346 reader = ExcelReader(filename, read_only, keep_vba, 

347 data_only, keep_links, rich_text) 

348 reader.read() 

349 return reader.wb