1# Copyright (c) 2010-2024 openpyxl
2
3from warnings import warn
4
5from openpyxl.xml.functions import fromstring
6
7from openpyxl.packaging.relationship import (
8 get_dependents,
9 get_rels_path,
10 get_rel,
11)
12from openpyxl.packaging.workbook import WorkbookPackage
13from openpyxl.workbook import Workbook
14from openpyxl.workbook.defined_name import DefinedNameList
15from openpyxl.workbook.external_link.external import read_external_link
16from openpyxl.pivot.cache import CacheDefinition
17from openpyxl.pivot.record import RecordList
18from openpyxl.worksheet.print_settings import PrintTitles, PrintArea
19
20from openpyxl.utils.datetime import CALENDAR_MAC_1904
21
22
23class WorkbookParser:
24
25 _rels = None
26
27 def __init__(self, archive, workbook_part_name, keep_links=True):
28 self.archive = archive
29 self.workbook_part_name = workbook_part_name
30 self.defined_names = DefinedNameList()
31 self.wb = Workbook()
32 self.keep_links = keep_links
33 self.sheets = []
34
35
36 @property
37 def rels(self):
38 if self._rels is None:
39 self._rels = get_dependents(self.archive, get_rels_path(self.workbook_part_name)).to_dict()
40 return self._rels
41
42
43 def parse(self):
44 src = self.archive.read(self.workbook_part_name)
45 node = fromstring(src)
46 package = WorkbookPackage.from_tree(node)
47 if package.properties.date1904:
48 self.wb.epoch = CALENDAR_MAC_1904
49
50 self.wb.code_name = package.properties.codeName
51 self.wb.active = package.active
52 self.wb.views = package.bookViews
53 self.sheets = package.sheets
54 self.wb.calculation = package.calcPr
55 self.caches = package.pivotCaches
56
57 # external links contain cached worksheets and can be very big
58 if not self.keep_links:
59 package.externalReferences = []
60
61 for ext_ref in package.externalReferences:
62 rel = self.rels.get(ext_ref.id)
63 self.wb._external_links.append(
64 read_external_link(self.archive, rel.Target)
65 )
66
67 if package.definedNames:
68 self.defined_names = package.definedNames
69
70 self.wb.security = package.workbookProtection
71
72
73 def find_sheets(self):
74 """
75 Find all sheets in the workbook and return the link to the source file.
76
77 Older XLSM files sometimes contain invalid sheet elements.
78 Warn user when these are removed.
79 """
80
81 for sheet in self.sheets:
82 if not sheet.id:
83 msg = f"File contains an invalid specification for {0}. This will be removed".format(sheet.name)
84 warn(msg)
85 continue
86 yield sheet, self.rels[sheet.id]
87
88
89 def assign_names(self):
90 """
91 Bind defined names and other definitions to worksheets or the workbook
92 """
93
94 for idx, names in self.defined_names.by_sheet().items():
95 if idx == "global":
96 self.wb.defined_names = names
97 continue
98
99 try:
100 sheet = self.wb._sheets[idx]
101 except IndexError:
102 warn(f"Defined names for sheet index {idx} cannot be located")
103 continue
104
105 for name, defn in names.items():
106 reserved = defn.is_reserved
107 if reserved is None:
108 sheet.defined_names[name] = defn
109
110 elif reserved == "Print_Titles":
111 titles = PrintTitles.from_string(defn.value)
112 sheet._print_rows = titles.rows
113 sheet._print_cols = titles.cols
114 elif reserved == "Print_Area":
115 try:
116 sheet._print_area = PrintArea.from_string(defn.value)
117 except TypeError:
118 warn(f"Print area cannot be set to Defined name: {defn.value}.")
119 continue
120
121 @property
122 def pivot_caches(self):
123 """
124 Get PivotCache objects
125 """
126 d = {}
127 for c in self.caches:
128 cache = get_rel(self.archive, self.rels, id=c.id, cls=CacheDefinition)
129 if cache.deps:
130 records = get_rel(self.archive, cache.deps, cache.id, RecordList)
131 cache.records = records
132 d[c.cacheId] = cache
133 return d