1# Copyright (c) 2010-2024 openpyxl
2
3
4"""Read an xlsx file into Python"""
5
6# Python stdlib imports
7from zipfile import ZipFile, ZIP_DEFLATED
8from io import BytesIO
9import os.path
10import warnings
11
12from openpyxl.pivot.table import TableDefinition
13
14# Allow blanket setting of KEEP_VBA for testing
15try:
16 from ..tests import KEEP_VBA
17except ImportError:
18 KEEP_VBA = False
19
20# package imports
21from openpyxl.utils.exceptions import InvalidFileException
22from openpyxl.xml.constants import (
23 ARC_CORE,
24 ARC_CUSTOM,
25 ARC_CONTENT_TYPES,
26 ARC_WORKBOOK,
27 ARC_THEME,
28 COMMENTS_NS,
29 SHARED_STRINGS,
30 XLTM,
31 XLTX,
32 XLSM,
33 XLSX,
34)
35from openpyxl.cell import MergedCell
36from openpyxl.comments.comment_sheet import CommentSheet
37
38from .strings import read_string_table, read_rich_text
39from .workbook import WorkbookParser
40from openpyxl.styles.stylesheet import apply_stylesheet
41
42from openpyxl.packaging.core import DocumentProperties
43from openpyxl.packaging.custom import CustomPropertyList
44from openpyxl.packaging.manifest import Manifest, Override
45
46from openpyxl.packaging.relationship import (
47 RelationshipList,
48 get_dependents,
49 get_rels_path,
50)
51
52from openpyxl.worksheet._read_only import ReadOnlyWorksheet
53from openpyxl.worksheet._reader import WorksheetReader
54from openpyxl.chartsheet import Chartsheet
55from openpyxl.worksheet.table import Table
56from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
57
58from openpyxl.xml.functions import fromstring
59
60from .drawings import find_images
61
62
63SUPPORTED_FORMATS = ('.xlsx', '.xlsm', '.xltx', '.xltm')
64
65
66def _validate_archive(filename):
67 """
68 Does a first check whether filename is a string or a file-like
69 object. If it is a string representing a filename, a check is done
70 for supported formats by checking the given file-extension. If the
71 file-extension is not in SUPPORTED_FORMATS an InvalidFileException
72 will raised. Otherwise the filename (resp. file-like object) will
73 forwarded to zipfile.ZipFile returning a ZipFile-Instance.
74 """
75 is_file_like = hasattr(filename, 'read')
76 if not is_file_like:
77 file_format = os.path.splitext(filename)[-1].lower()
78 if file_format not in SUPPORTED_FORMATS:
79 if file_format == '.xls':
80 msg = ('openpyxl does not support the old .xls file format, '
81 'please use xlrd to read this file, or convert it to '
82 'the more recent .xlsx file format.')
83 elif file_format == '.xlsb':
84 msg = ('openpyxl does not support binary format .xlsb, '
85 'please convert this file to .xlsx format if you want '
86 'to open it with openpyxl')
87 else:
88 msg = ('openpyxl does not support %s file format, '
89 'please check you can open '
90 'it with Excel first. '
91 'Supported formats are: %s') % (file_format,
92 ','.join(SUPPORTED_FORMATS))
93 raise InvalidFileException(msg)
94
95 archive = ZipFile(filename, 'r')
96 return archive
97
98
99def _find_workbook_part(package):
100 workbook_types = [XLTM, XLTX, XLSM, XLSX]
101 for ct in workbook_types:
102 part = package.find(ct)
103 if part:
104 return part
105
106 # some applications reassign the default for application/xml
107 defaults = {p.ContentType for p in package.Default}
108 workbook_type = defaults & set(workbook_types)
109 if workbook_type:
110 return Override("/" + ARC_WORKBOOK, workbook_type.pop())
111
112 raise IOError("File contains no valid workbook part")
113
114
115class ExcelReader:
116
117 """
118 Read an Excel package and dispatch the contents to the relevant modules
119 """
120
121 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,
122 data_only=False, keep_links=True, rich_text=False):
123 self.archive = _validate_archive(fn)
124 self.valid_files = self.archive.namelist()
125 self.read_only = read_only
126 self.keep_vba = keep_vba
127 self.data_only = data_only
128 self.keep_links = keep_links
129 self.rich_text = rich_text
130 self.shared_strings = []
131
132
133 def read_manifest(self):
134 src = self.archive.read(ARC_CONTENT_TYPES)
135 root = fromstring(src)
136 self.package = Manifest.from_tree(root)
137
138
139 def read_strings(self):
140 ct = self.package.find(SHARED_STRINGS)
141 reader = read_string_table
142 if self.rich_text:
143 reader = read_rich_text
144 if ct is not None:
145 strings_path = ct.PartName[1:]
146 with self.archive.open(strings_path,) as src:
147 self.shared_strings = reader(src)
148
149
150 def read_workbook(self):
151 wb_part = _find_workbook_part(self.package)
152 self.parser = WorkbookParser(self.archive, wb_part.PartName[1:], keep_links=self.keep_links)
153 self.parser.parse()
154 wb = self.parser.wb
155 wb._sheets = []
156 wb._data_only = self.data_only
157 wb._read_only = self.read_only
158 wb.template = wb_part.ContentType in (XLTX, XLTM)
159
160 # If are going to preserve the vba then attach a copy of the archive to the
161 # workbook so that is available for the save.
162 if self.keep_vba:
163 wb.vba_archive = ZipFile(BytesIO(), 'a', ZIP_DEFLATED)
164 for name in self.valid_files:
165 wb.vba_archive.writestr(name, self.archive.read(name))
166
167 if self.read_only:
168 wb._archive = self.archive
169
170 self.wb = wb
171
172
173 def read_properties(self):
174 if ARC_CORE in self.valid_files:
175 src = fromstring(self.archive.read(ARC_CORE))
176 self.wb.properties = DocumentProperties.from_tree(src)
177
178
179 def read_custom(self):
180 if ARC_CUSTOM in self.valid_files:
181 src = fromstring(self.archive.read(ARC_CUSTOM))
182 self.wb.custom_doc_props = CustomPropertyList.from_tree(src)
183
184
185 def read_theme(self):
186 if ARC_THEME in self.valid_files:
187 self.wb.loaded_theme = self.archive.read(ARC_THEME)
188
189
190 def read_chartsheet(self, sheet, rel):
191 sheet_path = rel.target
192 rels_path = get_rels_path(sheet_path)
193 rels = []
194 if rels_path in self.valid_files:
195 rels = get_dependents(self.archive, rels_path)
196
197 with self.archive.open(sheet_path, "r") as src:
198 xml = src.read()
199 node = fromstring(xml)
200 cs = Chartsheet.from_tree(node)
201 cs._parent = self.wb
202 cs.title = sheet.name
203 self.wb._add_sheet(cs)
204
205 drawings = rels.find(SpreadsheetDrawing._rel_type)
206 for rel in drawings:
207 charts, images = find_images(self.archive, rel.target)
208 for c in charts:
209 cs.add_chart(c)
210
211
212 def read_worksheets(self):
213 comment_warning = """Cell '{0}':{1} is part of a merged range but has a comment which will be removed because merged cells cannot contain any data."""
214 for sheet, rel in self.parser.find_sheets():
215 if rel.target not in self.valid_files:
216 continue
217
218 if "chartsheet" in rel.Type:
219 self.read_chartsheet(sheet, rel)
220 continue
221
222 rels_path = get_rels_path(rel.target)
223 rels = RelationshipList()
224 if rels_path in self.valid_files:
225 rels = get_dependents(self.archive, rels_path)
226
227 if self.read_only:
228 ws = ReadOnlyWorksheet(self.wb, sheet.name, rel.target, self.shared_strings)
229 ws.sheet_state = sheet.state
230 self.wb._sheets.append(ws)
231 continue
232 else:
233 fh = self.archive.open(rel.target)
234 ws = self.wb.create_sheet(sheet.name)
235 ws._rels = rels
236 ws_parser = WorksheetReader(ws, fh, self.shared_strings, self.data_only, self.rich_text)
237 ws_parser.bind_all()
238 fh.close()
239
240 # assign any comments to cells
241 for r in rels.find(COMMENTS_NS):
242 src = self.archive.read(r.target)
243 comment_sheet = CommentSheet.from_tree(fromstring(src))
244 for ref, comment in comment_sheet.comments:
245 try:
246 ws[ref].comment = comment
247 except AttributeError:
248 c = ws[ref]
249 if isinstance(c, MergedCell):
250 warnings.warn(comment_warning.format(ws.title, c.coordinate))
251 continue
252
253 # preserve link to VML file if VBA
254 if self.wb.vba_archive and ws.legacy_drawing:
255 ws.legacy_drawing = rels.get(ws.legacy_drawing).target
256 else:
257 ws.legacy_drawing = None
258
259 for t in ws_parser.tables:
260 src = self.archive.read(t)
261 xml = fromstring(src)
262 table = Table.from_tree(xml)
263 ws.add_table(table)
264
265 drawings = rels.find(SpreadsheetDrawing._rel_type)
266 for rel in drawings:
267 charts, images = find_images(self.archive, rel.target)
268 for c in charts:
269 ws.add_chart(c, c.anchor)
270 for im in images:
271 ws.add_image(im, im.anchor)
272
273 pivot_rel = rels.find(TableDefinition.rel_type)
274 pivot_caches = self.parser.pivot_caches
275 for r in pivot_rel:
276 pivot_path = r.Target
277 src = self.archive.read(pivot_path)
278 tree = fromstring(src)
279 pivot = TableDefinition.from_tree(tree)
280 pivot.cache = pivot_caches[pivot.cacheId]
281 ws.add_pivot(pivot)
282
283 ws.sheet_state = sheet.state
284
285
286 def read(self):
287 action = "read manifest"
288 try:
289 self.read_manifest()
290 action = "read strings"
291 self.read_strings()
292 action = "read workbook"
293 self.read_workbook()
294 action = "read properties"
295 self.read_properties()
296 action = "read custom properties"
297 self.read_custom()
298 action = "read theme"
299 self.read_theme()
300 action = "read stylesheet"
301 apply_stylesheet(self.archive, self.wb)
302 action = "read worksheets"
303 self.read_worksheets()
304 action = "assign names"
305 self.parser.assign_names()
306 if not self.read_only:
307 self.archive.close()
308 except ValueError as e:
309 raise ValueError(
310 f"Unable to read workbook: could not {action} from {self.archive.filename}.\n"
311 "This is most probably because the workbook source files contain some invalid XML.\n"
312 "Please see the exception for more details."
313 ) from e
314
315
316def load_workbook(filename, read_only=False, keep_vba=KEEP_VBA,
317 data_only=False, keep_links=True, rich_text=False):
318 """Open the given filename and return the workbook
319
320 :param filename: the path to open or a file-like object
321 :type filename: string or a file-like object open in binary mode c.f., :class:`zipfile.ZipFile`
322
323 :param read_only: optimised for reading, content cannot be edited
324 :type read_only: bool
325
326 :param keep_vba: preserve vba content (this does NOT mean you can use it)
327 :type keep_vba: bool
328
329 :param data_only: controls whether cells with formulae have either the formula (default) or the value stored the last time Excel read the sheet
330 :type data_only: bool
331
332 :param keep_links: whether links to external workbooks should be preserved. The default is True
333 :type keep_links: bool
334
335 :param rich_text: if set to True openpyxl will preserve any rich text formatting in cells. The default is False
336 :type rich_text: bool
337
338 :rtype: :class:`openpyxl.workbook.Workbook`
339
340 .. note::
341
342 When using lazy load, all worksheets will be :class:`openpyxl.worksheet.iter_worksheet.IterableWorksheet`
343 and the returned workbook will be read-only.
344
345 """
346 reader = ExcelReader(filename, read_only, keep_vba,
347 data_only, keep_links, rich_text)
348 reader.read()
349 return reader.wb