Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/openpyxl/worksheet/_reader.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

315 statements  

1# Copyright (c) 2010-2024 openpyxl 

2 

3"""Reader for a single worksheet.""" 

4from copy import copy 

5from warnings import warn 

6 

7# compatibility imports 

8from openpyxl.xml.functions import iterparse 

9 

10# package imports 

11from openpyxl.cell import Cell, MergedCell 

12from openpyxl.cell.text import Text 

13from openpyxl.worksheet.dimensions import ( 

14 ColumnDimension, 

15 RowDimension, 

16 SheetFormatProperties, 

17) 

18 

19from openpyxl.xml.constants import ( 

20 SHEET_MAIN_NS, 

21 EXT_TYPES, 

22) 

23from openpyxl.formatting.formatting import ConditionalFormatting 

24from openpyxl.formula.translate import Translator 

25from openpyxl.utils import ( 

26 get_column_letter, 

27 coordinate_to_tuple, 

28 ) 

29from openpyxl.utils.datetime import from_excel, from_ISO8601, WINDOWS_EPOCH 

30from openpyxl.descriptors.excel import ExtensionList 

31from openpyxl.cell.rich_text import CellRichText 

32 

33from .formula import DataTableFormula, ArrayFormula 

34from .filters import AutoFilter 

35from .header_footer import HeaderFooter 

36from .hyperlink import HyperlinkList 

37from .merge import MergeCells 

38from .page import PageMargins, PrintOptions, PrintPageSetup 

39from .pagebreak import RowBreak, ColBreak 

40from .protection import SheetProtection 

41from .scenario import ScenarioList 

42from .views import SheetViewList 

43from .datavalidation import DataValidationList 

44from .table import TablePartList 

45from .properties import WorksheetProperties 

46from .dimensions import SheetDimension 

47from .related import Related 

48 

49 

50CELL_TAG = '{%s}c' % SHEET_MAIN_NS 

51VALUE_TAG = '{%s}v' % SHEET_MAIN_NS 

52FORMULA_TAG = '{%s}f' % SHEET_MAIN_NS 

53MERGE_TAG = '{%s}mergeCells' % SHEET_MAIN_NS 

54INLINE_STRING = "{%s}is" % SHEET_MAIN_NS 

55COL_TAG = '{%s}col' % SHEET_MAIN_NS 

56ROW_TAG = '{%s}row' % SHEET_MAIN_NS 

57CF_TAG = '{%s}conditionalFormatting' % SHEET_MAIN_NS 

58LEGACY_TAG = '{%s}legacyDrawing' % SHEET_MAIN_NS 

59PROT_TAG = '{%s}sheetProtection' % SHEET_MAIN_NS 

60EXT_TAG = "{%s}extLst" % SHEET_MAIN_NS 

61HYPERLINK_TAG = "{%s}hyperlinks" % SHEET_MAIN_NS 

62TABLE_TAG = "{%s}tableParts" % SHEET_MAIN_NS 

63PRINT_TAG = '{%s}printOptions' % SHEET_MAIN_NS 

64MARGINS_TAG = '{%s}pageMargins' % SHEET_MAIN_NS 

65PAGE_TAG = '{%s}pageSetup' % SHEET_MAIN_NS 

66HEADER_TAG = '{%s}headerFooter' % SHEET_MAIN_NS 

67FILTER_TAG = '{%s}autoFilter' % SHEET_MAIN_NS 

68VALIDATION_TAG = '{%s}dataValidations' % SHEET_MAIN_NS 

69PROPERTIES_TAG = '{%s}sheetPr' % SHEET_MAIN_NS 

70VIEWS_TAG = '{%s}sheetViews' % SHEET_MAIN_NS 

71FORMAT_TAG = '{%s}sheetFormatPr' % SHEET_MAIN_NS 

72ROW_BREAK_TAG = '{%s}rowBreaks' % SHEET_MAIN_NS 

73COL_BREAK_TAG = '{%s}colBreaks' % SHEET_MAIN_NS 

74SCENARIOS_TAG = '{%s}scenarios' % SHEET_MAIN_NS 

75DATA_TAG = '{%s}sheetData' % SHEET_MAIN_NS 

76DIMENSION_TAG = '{%s}dimension' % SHEET_MAIN_NS 

77CUSTOM_VIEWS_TAG = '{%s}customSheetViews' % SHEET_MAIN_NS 

78 

79 

80def _cast_number(value): 

81 "Convert numbers as string to an int or float" 

82 if "." in value or "E" in value or "e" in value: 

83 return float(value) 

84 return int(value) 

85 

86 

87def parse_richtext_string(element): 

88 """ 

89 Parse inline string and preserve rich text formatting 

90 """ 

91 value = CellRichText.from_tree(element) or "" 

92 if len(value) == 1 and isinstance(value[0], str): 

93 value = value[0] 

94 return value 

95 

96 

97class WorkSheetParser: 

98 

99 def __init__(self, src, shared_strings, data_only=False, 

100 epoch=WINDOWS_EPOCH, date_formats=set(), 

101 timedelta_formats=set(), rich_text=False): 

102 self.min_row = self.min_col = None 

103 self.epoch = epoch 

104 self.source = src 

105 self.shared_strings = shared_strings 

106 self.data_only = data_only 

107 self.shared_formulae = {} 

108 self.row_counter = self.col_counter = 0 

109 self.tables = TablePartList() 

110 self.date_formats = date_formats 

111 self.timedelta_formats = timedelta_formats 

112 self.row_dimensions = {} 

113 self.column_dimensions = {} 

114 self.number_formats = [] 

115 self.keep_vba = False 

116 self.hyperlinks = HyperlinkList() 

117 self.formatting = [] 

118 self.legacy_drawing = None 

119 self.merged_cells = None 

120 self.row_breaks = RowBreak() 

121 self.col_breaks = ColBreak() 

122 self.rich_text = rich_text 

123 

124 

125 def parse(self): 

126 dispatcher = { 

127 COL_TAG: self.parse_column_dimensions, 

128 PROT_TAG: self.parse_sheet_protection, 

129 EXT_TAG: self.parse_extensions, 

130 CF_TAG: self.parse_formatting, 

131 LEGACY_TAG: self.parse_legacy, 

132 ROW_BREAK_TAG: self.parse_row_breaks, 

133 COL_BREAK_TAG: self.parse_col_breaks, 

134 CUSTOM_VIEWS_TAG: self.parse_custom_views, 

135 } 

136 

137 properties = { 

138 PRINT_TAG: ('print_options', PrintOptions), 

139 MARGINS_TAG: ('page_margins', PageMargins), 

140 PAGE_TAG: ('page_setup', PrintPageSetup), 

141 HEADER_TAG: ('HeaderFooter', HeaderFooter), 

142 FILTER_TAG: ('auto_filter', AutoFilter), 

143 VALIDATION_TAG: ('data_validations', DataValidationList), 

144 PROPERTIES_TAG: ('sheet_properties', WorksheetProperties), 

145 VIEWS_TAG: ('views', SheetViewList), 

146 FORMAT_TAG: ('sheet_format', SheetFormatProperties), 

147 SCENARIOS_TAG: ('scenarios', ScenarioList), 

148 TABLE_TAG: ('tables', TablePartList), 

149 HYPERLINK_TAG: ('hyperlinks', HyperlinkList), 

150 MERGE_TAG: ('merged_cells', MergeCells), 

151 

152 } 

153 

154 it = iterparse(self.source) # add a finaliser to close the source when this becomes possible 

155 

156 for _, element in it: 

157 tag_name = element.tag 

158 if tag_name in dispatcher: 

159 dispatcher[tag_name](element) 

160 element.clear() 

161 elif tag_name in properties: 

162 prop = properties[tag_name] 

163 obj = prop[1].from_tree(element) 

164 setattr(self, prop[0], obj) 

165 element.clear() 

166 elif tag_name == ROW_TAG: 

167 row = self.parse_row(element) 

168 element.clear() 

169 yield row 

170 

171 

172 def parse_dimensions(self): 

173 """ 

174 Get worksheet dimensions if they are provided. 

175 """ 

176 it = iterparse(self.source) 

177 

178 for _event, element in it: 

179 if element.tag == DIMENSION_TAG: 

180 dim = SheetDimension.from_tree(element) 

181 return dim.boundaries 

182 

183 elif element.tag == DATA_TAG: 

184 # Dimensions missing 

185 break 

186 element.clear() 

187 

188 

189 def parse_cell(self, element): 

190 data_type = element.get('t', 'n') 

191 coordinate = element.get('r') 

192 style_id = element.get('s', 0) 

193 if style_id: 

194 style_id = int(style_id) 

195 

196 if data_type == "inlineStr": 

197 value = None 

198 else: 

199 value = element.findtext(VALUE_TAG, None) or None 

200 

201 if coordinate: 

202 row, column = coordinate_to_tuple(coordinate) 

203 self.col_counter = column 

204 else: 

205 self.col_counter += 1 

206 row, column = self.row_counter, self.col_counter 

207 

208 if not self.data_only and element.find(FORMULA_TAG) is not None: 

209 data_type = 'f' 

210 value = self.parse_formula(element) 

211 

212 elif value is not None: 

213 if data_type == 'n': 

214 value = _cast_number(value) 

215 if style_id in self.date_formats: 

216 data_type = 'd' 

217 try: 

218 value = from_excel( 

219 value, self.epoch, timedelta=style_id in self.timedelta_formats 

220 ) 

221 except (OverflowError, ValueError): 

222 msg = f"""Cell {coordinate} is marked as a date but the serial value {value} is outside the limits for dates. The cell will be treated as an error.""" 

223 warn(msg) 

224 data_type = "e" 

225 value = "#VALUE!" 

226 elif data_type == 's': 

227 value = self.shared_strings[int(value)] 

228 elif data_type == 'b': 

229 value = bool(int(value)) 

230 elif data_type == "str": 

231 data_type = "s" 

232 elif data_type == 'd': 

233 value = from_ISO8601(value) 

234 

235 elif data_type == 'inlineStr': 

236 child = element.find(INLINE_STRING) 

237 if child is not None: 

238 data_type = 's' 

239 if self.rich_text: 

240 value = parse_richtext_string(child) 

241 else: 

242 value = Text.from_tree(child).content 

243 

244 return {'row':row, 'column':column, 'value':value, 'data_type':data_type, 'style_id':style_id} 

245 

246 

247 def parse_formula(self, element): 

248 """ 

249 possible formulae types: shared, array, datatable 

250 """ 

251 formula = element.find(FORMULA_TAG) 

252 formula_type = formula.get('t') 

253 coordinate = element.get('r') 

254 value = "=" 

255 if formula.text is not None: 

256 value += formula.text 

257 

258 if formula_type == "array": 

259 value = ArrayFormula(ref=formula.get('ref'), text=value) 

260 

261 elif formula_type == "shared": 

262 idx = formula.get('si') 

263 if idx in self.shared_formulae: 

264 trans = self.shared_formulae[idx] 

265 value = trans.translate_formula(coordinate) 

266 elif value != "=": 

267 self.shared_formulae[idx] = Translator(value, coordinate) 

268 

269 elif formula_type == "dataTable": 

270 value = DataTableFormula(**formula.attrib) 

271 

272 return value 

273 

274 

275 def parse_column_dimensions(self, col): 

276 attrs = dict(col.attrib) 

277 column = get_column_letter(int(attrs['min'])) 

278 attrs['index'] = column 

279 self.column_dimensions[column] = attrs 

280 

281 

282 def parse_row(self, row): 

283 attrs = dict(row.attrib) 

284 

285 if "r" in attrs: 

286 try: 

287 self.row_counter = int(attrs['r']) 

288 except ValueError: 

289 val = float(attrs['r']) 

290 if val.is_integer(): 

291 self.row_counter = int(val) 

292 else: 

293 raise ValueError(f"{attrs['r']} is not a valid row number") 

294 else: 

295 self.row_counter += 1 

296 self.col_counter = 0 

297 

298 keys = {k for k in attrs if not k.startswith('{')} 

299 if keys - {'r', 'spans'}: 

300 # don't create dimension objects unless they have relevant information 

301 self.row_dimensions[str(self.row_counter)] = attrs 

302 

303 cells = [self.parse_cell(el) for el in row] 

304 return self.row_counter, cells 

305 

306 

307 def parse_formatting(self, element): 

308 try: 

309 cf = ConditionalFormatting.from_tree(element) 

310 self.formatting.append(cf) 

311 except TypeError as e: 

312 msg = f"Failed to load a conditional formatting rule. It will be discarded. Cause: {e}" 

313 warn(msg) 

314 

315 

316 def parse_sheet_protection(self, element): 

317 protection = SheetProtection.from_tree(element) 

318 password = element.get("password") 

319 if password is not None: 

320 protection.set_password(password, True) 

321 self.protection = protection 

322 

323 

324 def parse_extensions(self, element): 

325 extLst = ExtensionList.from_tree(element) 

326 for e in extLst.ext: 

327 ext_type = EXT_TYPES.get(e.uri.upper(), "Unknown") 

328 msg = "{0} extension is not supported and will be removed".format(ext_type) 

329 warn(msg) 

330 

331 

332 def parse_legacy(self, element): 

333 obj = Related.from_tree(element) 

334 self.legacy_drawing = obj.id 

335 

336 

337 def parse_row_breaks(self, element): 

338 brk = RowBreak.from_tree(element) 

339 self.row_breaks = brk 

340 

341 

342 def parse_col_breaks(self, element): 

343 brk = ColBreak.from_tree(element) 

344 self.col_breaks = brk 

345 

346 

347 def parse_custom_views(self, element): 

348 # clear page_breaks to avoid duplication which Excel doesn't like 

349 # basically they're ignored in custom views 

350 self.row_breaks = RowBreak() 

351 self.col_breaks = ColBreak() 

352 

353 

354class WorksheetReader: 

355 """ 

356 Create a parser and apply it to a workbook 

357 """ 

358 

359 def __init__(self, ws, xml_source, shared_strings, data_only, rich_text): 

360 self.ws = ws 

361 self.parser = WorkSheetParser(xml_source, shared_strings, 

362 data_only, ws.parent.epoch, ws.parent._date_formats, 

363 ws.parent._timedelta_formats, rich_text) 

364 self.tables = [] 

365 

366 

367 def bind_cells(self): 

368 for idx, row in self.parser.parse(): 

369 for cell in row: 

370 style = self.ws.parent._cell_styles[cell['style_id']] 

371 c = Cell(self.ws, row=cell['row'], column=cell['column'], style_array=style) 

372 c._value = cell['value'] 

373 c.data_type = cell['data_type'] 

374 self.ws._cells[(cell['row'], cell['column'])] = c 

375 

376 if self.ws._cells: 

377 self.ws._current_row = self.ws.max_row # use cells not row dimensions 

378 

379 

380 def bind_formatting(self): 

381 for cf in self.parser.formatting: 

382 for rule in cf.rules: 

383 if rule.dxfId is not None: 

384 rule.dxf = self.ws.parent._differential_styles[rule.dxfId] 

385 self.ws.conditional_formatting[cf] = rule 

386 

387 

388 def bind_tables(self): 

389 for t in self.parser.tables.tablePart: 

390 rel = self.ws._rels.get(t.id) 

391 self.tables.append(rel.Target) 

392 

393 

394 def bind_merged_cells(self): 

395 from openpyxl.worksheet.cell_range import MultiCellRange 

396 from openpyxl.worksheet.merge import MergedCellRange 

397 if not self.parser.merged_cells: 

398 return 

399 

400 ranges = [] 

401 for cr in self.parser.merged_cells.mergeCell: 

402 mcr = MergedCellRange(self.ws, cr.ref) 

403 self.ws._clean_merge_range(mcr) 

404 ranges.append(mcr) 

405 self.ws.merged_cells = MultiCellRange(ranges) 

406 

407 

408 def bind_hyperlinks(self): 

409 for link in self.parser.hyperlinks.hyperlink: 

410 if link.id: 

411 rel = self.ws._rels.get(link.id) 

412 link.target = rel.Target 

413 if ":" in link.ref: 

414 # range of cells 

415 for row in self.ws[link.ref]: 

416 for cell in row: 

417 try: 

418 cell.hyperlink = copy(link) 

419 except AttributeError: 

420 pass 

421 else: 

422 cell = self.ws[link.ref] 

423 if isinstance(cell, MergedCell): 

424 cell = self.normalize_merged_cell_link(cell.coordinate) 

425 cell.hyperlink = link 

426 

427 def normalize_merged_cell_link(self, coord): 

428 """ 

429 Returns the appropriate cell to which a hyperlink, which references a merged cell at the specified coordinates, 

430 should be bound. 

431 """ 

432 for rng in self.ws.merged_cells: 

433 if coord in rng: 

434 return self.ws.cell(*rng.top[0]) 

435 

436 def bind_col_dimensions(self): 

437 for col, cd in self.parser.column_dimensions.items(): 

438 if 'style' in cd: 

439 key = int(cd['style']) 

440 cd['style'] = self.ws.parent._cell_styles[key] 

441 self.ws.column_dimensions[col] = ColumnDimension(self.ws, **cd) 

442 

443 

444 def bind_row_dimensions(self): 

445 for row, rd in self.parser.row_dimensions.items(): 

446 if 's' in rd: 

447 key = int(rd['s']) 

448 rd['s'] = self.ws.parent._cell_styles[key] 

449 self.ws.row_dimensions[int(row)] = RowDimension(self.ws, **rd) 

450 

451 

452 def bind_properties(self): 

453 for k in ('print_options', 'page_margins', 'page_setup', 

454 'HeaderFooter', 'auto_filter', 'data_validations', 

455 'sheet_properties', 'views', 'sheet_format', 

456 'row_breaks', 'col_breaks', 'scenarios', 'legacy_drawing', 

457 'protection', 

458 ): 

459 v = getattr(self.parser, k, None) 

460 if v is not None: 

461 setattr(self.ws, k, v) 

462 

463 

464 def bind_all(self): 

465 self.bind_cells() 

466 self.bind_merged_cells() 

467 self.bind_hyperlinks() 

468 self.bind_formatting() 

469 self.bind_col_dimensions() 

470 self.bind_row_dimensions() 

471 self.bind_tables() 

472 self.bind_properties()