1# Copyright (c) 2010-2024 openpyxl
2
3"""Reader for a single worksheet."""
4from copy import copy
5from warnings import warn
6
7# compatibility imports
8from openpyxl.xml.functions import iterparse
9
10# package imports
11from openpyxl.cell import Cell, MergedCell
12from openpyxl.cell.text import Text
13from openpyxl.worksheet.dimensions import (
14 ColumnDimension,
15 RowDimension,
16 SheetFormatProperties,
17)
18
19from openpyxl.xml.constants import (
20 SHEET_MAIN_NS,
21 EXT_TYPES,
22)
23from openpyxl.formatting.formatting import ConditionalFormatting
24from openpyxl.formula.translate import Translator
25from openpyxl.utils import (
26 get_column_letter,
27 coordinate_to_tuple,
28 )
29from openpyxl.utils.datetime import from_excel, from_ISO8601, WINDOWS_EPOCH
30from openpyxl.descriptors.excel import ExtensionList
31from openpyxl.cell.rich_text import CellRichText
32
33from .formula import DataTableFormula, ArrayFormula
34from .filters import AutoFilter
35from .header_footer import HeaderFooter
36from .hyperlink import HyperlinkList
37from .merge import MergeCells
38from .page import PageMargins, PrintOptions, PrintPageSetup
39from .pagebreak import RowBreak, ColBreak
40from .protection import SheetProtection
41from .scenario import ScenarioList
42from .views import SheetViewList
43from .datavalidation import DataValidationList
44from .table import TablePartList
45from .properties import WorksheetProperties
46from .dimensions import SheetDimension
47from .related import Related
48
49
50CELL_TAG = '{%s}c' % SHEET_MAIN_NS
51VALUE_TAG = '{%s}v' % SHEET_MAIN_NS
52FORMULA_TAG = '{%s}f' % SHEET_MAIN_NS
53MERGE_TAG = '{%s}mergeCells' % SHEET_MAIN_NS
54INLINE_STRING = "{%s}is" % SHEET_MAIN_NS
55COL_TAG = '{%s}col' % SHEET_MAIN_NS
56ROW_TAG = '{%s}row' % SHEET_MAIN_NS
57CF_TAG = '{%s}conditionalFormatting' % SHEET_MAIN_NS
58LEGACY_TAG = '{%s}legacyDrawing' % SHEET_MAIN_NS
59PROT_TAG = '{%s}sheetProtection' % SHEET_MAIN_NS
60EXT_TAG = "{%s}extLst" % SHEET_MAIN_NS
61HYPERLINK_TAG = "{%s}hyperlinks" % SHEET_MAIN_NS
62TABLE_TAG = "{%s}tableParts" % SHEET_MAIN_NS
63PRINT_TAG = '{%s}printOptions' % SHEET_MAIN_NS
64MARGINS_TAG = '{%s}pageMargins' % SHEET_MAIN_NS
65PAGE_TAG = '{%s}pageSetup' % SHEET_MAIN_NS
66HEADER_TAG = '{%s}headerFooter' % SHEET_MAIN_NS
67FILTER_TAG = '{%s}autoFilter' % SHEET_MAIN_NS
68VALIDATION_TAG = '{%s}dataValidations' % SHEET_MAIN_NS
69PROPERTIES_TAG = '{%s}sheetPr' % SHEET_MAIN_NS
70VIEWS_TAG = '{%s}sheetViews' % SHEET_MAIN_NS
71FORMAT_TAG = '{%s}sheetFormatPr' % SHEET_MAIN_NS
72ROW_BREAK_TAG = '{%s}rowBreaks' % SHEET_MAIN_NS
73COL_BREAK_TAG = '{%s}colBreaks' % SHEET_MAIN_NS
74SCENARIOS_TAG = '{%s}scenarios' % SHEET_MAIN_NS
75DATA_TAG = '{%s}sheetData' % SHEET_MAIN_NS
76DIMENSION_TAG = '{%s}dimension' % SHEET_MAIN_NS
77CUSTOM_VIEWS_TAG = '{%s}customSheetViews' % SHEET_MAIN_NS
78
79
80def _cast_number(value):
81 "Convert numbers as string to an int or float"
82 if "." in value or "E" in value or "e" in value:
83 return float(value)
84 return int(value)
85
86
87def parse_richtext_string(element):
88 """
89 Parse inline string and preserve rich text formatting
90 """
91 value = CellRichText.from_tree(element) or ""
92 if len(value) == 1 and isinstance(value[0], str):
93 value = value[0]
94 return value
95
96
97class WorkSheetParser:
98
99 def __init__(self, src, shared_strings, data_only=False,
100 epoch=WINDOWS_EPOCH, date_formats=set(),
101 timedelta_formats=set(), rich_text=False):
102 self.min_row = self.min_col = None
103 self.epoch = epoch
104 self.source = src
105 self.shared_strings = shared_strings
106 self.data_only = data_only
107 self.shared_formulae = {}
108 self.row_counter = self.col_counter = 0
109 self.tables = TablePartList()
110 self.date_formats = date_formats
111 self.timedelta_formats = timedelta_formats
112 self.row_dimensions = {}
113 self.column_dimensions = {}
114 self.number_formats = []
115 self.keep_vba = False
116 self.hyperlinks = HyperlinkList()
117 self.formatting = []
118 self.legacy_drawing = None
119 self.merged_cells = None
120 self.row_breaks = RowBreak()
121 self.col_breaks = ColBreak()
122 self.rich_text = rich_text
123
124
125 def parse(self):
126 dispatcher = {
127 COL_TAG: self.parse_column_dimensions,
128 PROT_TAG: self.parse_sheet_protection,
129 EXT_TAG: self.parse_extensions,
130 CF_TAG: self.parse_formatting,
131 LEGACY_TAG: self.parse_legacy,
132 ROW_BREAK_TAG: self.parse_row_breaks,
133 COL_BREAK_TAG: self.parse_col_breaks,
134 CUSTOM_VIEWS_TAG: self.parse_custom_views,
135 }
136
137 properties = {
138 PRINT_TAG: ('print_options', PrintOptions),
139 MARGINS_TAG: ('page_margins', PageMargins),
140 PAGE_TAG: ('page_setup', PrintPageSetup),
141 HEADER_TAG: ('HeaderFooter', HeaderFooter),
142 FILTER_TAG: ('auto_filter', AutoFilter),
143 VALIDATION_TAG: ('data_validations', DataValidationList),
144 PROPERTIES_TAG: ('sheet_properties', WorksheetProperties),
145 VIEWS_TAG: ('views', SheetViewList),
146 FORMAT_TAG: ('sheet_format', SheetFormatProperties),
147 SCENARIOS_TAG: ('scenarios', ScenarioList),
148 TABLE_TAG: ('tables', TablePartList),
149 HYPERLINK_TAG: ('hyperlinks', HyperlinkList),
150 MERGE_TAG: ('merged_cells', MergeCells),
151
152 }
153
154 it = iterparse(self.source) # add a finaliser to close the source when this becomes possible
155
156 for _, element in it:
157 tag_name = element.tag
158 if tag_name in dispatcher:
159 dispatcher[tag_name](element)
160 element.clear()
161 elif tag_name in properties:
162 prop = properties[tag_name]
163 obj = prop[1].from_tree(element)
164 setattr(self, prop[0], obj)
165 element.clear()
166 elif tag_name == ROW_TAG:
167 row = self.parse_row(element)
168 element.clear()
169 yield row
170
171
172 def parse_dimensions(self):
173 """
174 Get worksheet dimensions if they are provided.
175 """
176 it = iterparse(self.source)
177
178 for _event, element in it:
179 if element.tag == DIMENSION_TAG:
180 dim = SheetDimension.from_tree(element)
181 return dim.boundaries
182
183 elif element.tag == DATA_TAG:
184 # Dimensions missing
185 break
186 element.clear()
187
188
189 def parse_cell(self, element):
190 data_type = element.get('t', 'n')
191 coordinate = element.get('r')
192 style_id = element.get('s', 0)
193 if style_id:
194 style_id = int(style_id)
195
196 if data_type == "inlineStr":
197 value = None
198 else:
199 value = element.findtext(VALUE_TAG, None) or None
200
201 if coordinate:
202 row, column = coordinate_to_tuple(coordinate)
203 self.col_counter = column
204 else:
205 self.col_counter += 1
206 row, column = self.row_counter, self.col_counter
207
208 if not self.data_only and element.find(FORMULA_TAG) is not None:
209 data_type = 'f'
210 value = self.parse_formula(element)
211
212 elif value is not None:
213 if data_type == 'n':
214 value = _cast_number(value)
215 if style_id in self.date_formats:
216 data_type = 'd'
217 try:
218 value = from_excel(
219 value, self.epoch, timedelta=style_id in self.timedelta_formats
220 )
221 except (OverflowError, ValueError):
222 msg = f"""Cell {coordinate} is marked as a date but the serial value {value} is outside the limits for dates. The cell will be treated as an error."""
223 warn(msg)
224 data_type = "e"
225 value = "#VALUE!"
226 elif data_type == 's':
227 value = self.shared_strings[int(value)]
228 elif data_type == 'b':
229 value = bool(int(value))
230 elif data_type == "str":
231 data_type = "s"
232 elif data_type == 'd':
233 value = from_ISO8601(value)
234
235 elif data_type == 'inlineStr':
236 child = element.find(INLINE_STRING)
237 if child is not None:
238 data_type = 's'
239 if self.rich_text:
240 value = parse_richtext_string(child)
241 else:
242 value = Text.from_tree(child).content
243
244 return {'row':row, 'column':column, 'value':value, 'data_type':data_type, 'style_id':style_id}
245
246
247 def parse_formula(self, element):
248 """
249 possible formulae types: shared, array, datatable
250 """
251 formula = element.find(FORMULA_TAG)
252 formula_type = formula.get('t')
253 coordinate = element.get('r')
254 value = "="
255 if formula.text is not None:
256 value += formula.text
257
258 if formula_type == "array":
259 value = ArrayFormula(ref=formula.get('ref'), text=value)
260
261 elif formula_type == "shared":
262 idx = formula.get('si')
263 if idx in self.shared_formulae:
264 trans = self.shared_formulae[idx]
265 value = trans.translate_formula(coordinate)
266 elif value != "=":
267 self.shared_formulae[idx] = Translator(value, coordinate)
268
269 elif formula_type == "dataTable":
270 value = DataTableFormula(**formula.attrib)
271
272 return value
273
274
275 def parse_column_dimensions(self, col):
276 attrs = dict(col.attrib)
277 column = get_column_letter(int(attrs['min']))
278 attrs['index'] = column
279 self.column_dimensions[column] = attrs
280
281
282 def parse_row(self, row):
283 attrs = dict(row.attrib)
284
285 if "r" in attrs:
286 try:
287 self.row_counter = int(attrs['r'])
288 except ValueError:
289 val = float(attrs['r'])
290 if val.is_integer():
291 self.row_counter = int(val)
292 else:
293 raise ValueError(f"{attrs['r']} is not a valid row number")
294 else:
295 self.row_counter += 1
296 self.col_counter = 0
297
298 keys = {k for k in attrs if not k.startswith('{')}
299 if keys - {'r', 'spans'}:
300 # don't create dimension objects unless they have relevant information
301 self.row_dimensions[str(self.row_counter)] = attrs
302
303 cells = [self.parse_cell(el) for el in row]
304 return self.row_counter, cells
305
306
307 def parse_formatting(self, element):
308 try:
309 cf = ConditionalFormatting.from_tree(element)
310 self.formatting.append(cf)
311 except TypeError as e:
312 msg = f"Failed to load a conditional formatting rule. It will be discarded. Cause: {e}"
313 warn(msg)
314
315
316 def parse_sheet_protection(self, element):
317 protection = SheetProtection.from_tree(element)
318 password = element.get("password")
319 if password is not None:
320 protection.set_password(password, True)
321 self.protection = protection
322
323
324 def parse_extensions(self, element):
325 extLst = ExtensionList.from_tree(element)
326 for e in extLst.ext:
327 ext_type = EXT_TYPES.get(e.uri.upper(), "Unknown")
328 msg = "{0} extension is not supported and will be removed".format(ext_type)
329 warn(msg)
330
331
332 def parse_legacy(self, element):
333 obj = Related.from_tree(element)
334 self.legacy_drawing = obj.id
335
336
337 def parse_row_breaks(self, element):
338 brk = RowBreak.from_tree(element)
339 self.row_breaks = brk
340
341
342 def parse_col_breaks(self, element):
343 brk = ColBreak.from_tree(element)
344 self.col_breaks = brk
345
346
347 def parse_custom_views(self, element):
348 # clear page_breaks to avoid duplication which Excel doesn't like
349 # basically they're ignored in custom views
350 self.row_breaks = RowBreak()
351 self.col_breaks = ColBreak()
352
353
354class WorksheetReader:
355 """
356 Create a parser and apply it to a workbook
357 """
358
359 def __init__(self, ws, xml_source, shared_strings, data_only, rich_text):
360 self.ws = ws
361 self.parser = WorkSheetParser(xml_source, shared_strings,
362 data_only, ws.parent.epoch, ws.parent._date_formats,
363 ws.parent._timedelta_formats, rich_text)
364 self.tables = []
365
366
367 def bind_cells(self):
368 for idx, row in self.parser.parse():
369 for cell in row:
370 style = self.ws.parent._cell_styles[cell['style_id']]
371 c = Cell(self.ws, row=cell['row'], column=cell['column'], style_array=style)
372 c._value = cell['value']
373 c.data_type = cell['data_type']
374 self.ws._cells[(cell['row'], cell['column'])] = c
375
376 if self.ws._cells:
377 self.ws._current_row = self.ws.max_row # use cells not row dimensions
378
379
380 def bind_formatting(self):
381 for cf in self.parser.formatting:
382 for rule in cf.rules:
383 if rule.dxfId is not None:
384 rule.dxf = self.ws.parent._differential_styles[rule.dxfId]
385 self.ws.conditional_formatting[cf] = rule
386
387
388 def bind_tables(self):
389 for t in self.parser.tables.tablePart:
390 rel = self.ws._rels.get(t.id)
391 self.tables.append(rel.Target)
392
393
394 def bind_merged_cells(self):
395 from openpyxl.worksheet.cell_range import MultiCellRange
396 from openpyxl.worksheet.merge import MergedCellRange
397 if not self.parser.merged_cells:
398 return
399
400 ranges = []
401 for cr in self.parser.merged_cells.mergeCell:
402 mcr = MergedCellRange(self.ws, cr.ref)
403 self.ws._clean_merge_range(mcr)
404 ranges.append(mcr)
405 self.ws.merged_cells = MultiCellRange(ranges)
406
407
408 def bind_hyperlinks(self):
409 for link in self.parser.hyperlinks.hyperlink:
410 if link.id:
411 rel = self.ws._rels.get(link.id)
412 link.target = rel.Target
413 if ":" in link.ref:
414 # range of cells
415 for row in self.ws[link.ref]:
416 for cell in row:
417 try:
418 cell.hyperlink = copy(link)
419 except AttributeError:
420 pass
421 else:
422 cell = self.ws[link.ref]
423 if isinstance(cell, MergedCell):
424 cell = self.normalize_merged_cell_link(cell.coordinate)
425 cell.hyperlink = link
426
427 def normalize_merged_cell_link(self, coord):
428 """
429 Returns the appropriate cell to which a hyperlink, which references a merged cell at the specified coordinates,
430 should be bound.
431 """
432 for rng in self.ws.merged_cells:
433 if coord in rng:
434 return self.ws.cell(*rng.top[0])
435
436 def bind_col_dimensions(self):
437 for col, cd in self.parser.column_dimensions.items():
438 if 'style' in cd:
439 key = int(cd['style'])
440 cd['style'] = self.ws.parent._cell_styles[key]
441 self.ws.column_dimensions[col] = ColumnDimension(self.ws, **cd)
442
443
444 def bind_row_dimensions(self):
445 for row, rd in self.parser.row_dimensions.items():
446 if 's' in rd:
447 key = int(rd['s'])
448 rd['s'] = self.ws.parent._cell_styles[key]
449 self.ws.row_dimensions[int(row)] = RowDimension(self.ws, **rd)
450
451
452 def bind_properties(self):
453 for k in ('print_options', 'page_margins', 'page_setup',
454 'HeaderFooter', 'auto_filter', 'data_validations',
455 'sheet_properties', 'views', 'sheet_format',
456 'row_breaks', 'col_breaks', 'scenarios', 'legacy_drawing',
457 'protection',
458 ):
459 v = getattr(self.parser, k, None)
460 if v is not None:
461 setattr(self.ws, k, v)
462
463
464 def bind_all(self):
465 self.bind_cells()
466 self.bind_merged_cells()
467 self.bind_hyperlinks()
468 self.bind_formatting()
469 self.bind_col_dimensions()
470 self.bind_row_dimensions()
471 self.bind_tables()
472 self.bind_properties()