Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/openpyxl/reader/excel.py: 21%
206 statements
« prev ^ index » next coverage.py v7.3.3, created at 2023-12-20 06:34 +0000
« prev ^ index » next coverage.py v7.3.3, created at 2023-12-20 06:34 +0000
1# Copyright (c) 2010-2023 openpyxl
4"""Read an xlsx file into Python"""
6# Python stdlib imports
7from zipfile import ZipFile, ZIP_DEFLATED
8from io import BytesIO
9import os.path
10import warnings
12from openpyxl.pivot.table import TableDefinition
14# Allow blanket setting of KEEP_VBA for testing
15try:
16 from ..tests import KEEP_VBA
17except ImportError:
18 KEEP_VBA = False
20# package imports
21from openpyxl.utils.exceptions import InvalidFileException
22from openpyxl.xml.constants import (
23 ARC_CORE,
24 ARC_CUSTOM,
25 ARC_CONTENT_TYPES,
26 ARC_WORKBOOK,
27 ARC_THEME,
28 COMMENTS_NS,
29 SHARED_STRINGS,
30 XLTM,
31 XLTX,
32 XLSM,
33 XLSX,
34)
35from openpyxl.cell import MergedCell
36from openpyxl.comments.comment_sheet import CommentSheet
38from .strings import read_string_table, read_rich_text
39from .workbook import WorkbookParser
40from openpyxl.styles.stylesheet import apply_stylesheet
42from openpyxl.packaging.core import DocumentProperties
43from openpyxl.packaging.custom import CustomPropertyList
44from openpyxl.packaging.manifest import Manifest, Override
46from openpyxl.packaging.relationship import (
47 RelationshipList,
48 get_dependents,
49 get_rels_path,
50)
52from openpyxl.worksheet._read_only import ReadOnlyWorksheet
53from openpyxl.worksheet._reader import WorksheetReader
54from openpyxl.chartsheet import Chartsheet
55from openpyxl.worksheet.table import Table
56from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
58from openpyxl.xml.functions import fromstring
60from .drawings import find_images
63SUPPORTED_FORMATS = ('.xlsx', '.xlsm', '.xltx', '.xltm')
66def _validate_archive(filename):
67 """
68 Does a first check whether filename is a string or a file-like
69 object. If it is a string representing a filename, a check is done
70 for supported formats by checking the given file-extension. If the
71 file-extension is not in SUPPORTED_FORMATS an InvalidFileException
72 will raised. Otherwise the filename (resp. file-like object) will
73 forwarded to zipfile.ZipFile returning a ZipFile-Instance.
74 """
75 is_file_like = hasattr(filename, 'read')
76 if not is_file_like:
77 file_format = os.path.splitext(filename)[-1].lower()
78 if file_format not in SUPPORTED_FORMATS:
79 if file_format == '.xls':
80 msg = ('openpyxl does not support the old .xls file format, '
81 'please use xlrd to read this file, or convert it to '
82 'the more recent .xlsx file format.')
83 elif file_format == '.xlsb':
84 msg = ('openpyxl does not support binary format .xlsb, '
85 'please convert this file to .xlsx format if you want '
86 'to open it with openpyxl')
87 else:
88 msg = ('openpyxl does not support %s file format, '
89 'please check you can open '
90 'it with Excel first. '
91 'Supported formats are: %s') % (file_format,
92 ','.join(SUPPORTED_FORMATS))
93 raise InvalidFileException(msg)
95 archive = ZipFile(filename, 'r')
96 return archive
99def _find_workbook_part(package):
100 workbook_types = [XLTM, XLTX, XLSM, XLSX]
101 for ct in workbook_types:
102 part = package.find(ct)
103 if part:
104 return part
106 # some applications reassign the default for application/xml
107 defaults = {p.ContentType for p in package.Default}
108 workbook_type = defaults & set(workbook_types)
109 if workbook_type:
110 return Override("/" + ARC_WORKBOOK, workbook_type.pop())
112 raise IOError("File contains no valid workbook part")
115class ExcelReader:
117 """
118 Read an Excel package and dispatch the contents to the relevant modules
119 """
121 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,
122 data_only=False, keep_links=True, rich_text=False):
123 self.archive = _validate_archive(fn)
124 self.valid_files = self.archive.namelist()
125 self.read_only = read_only
126 self.keep_vba = keep_vba
127 self.data_only = data_only
128 self.keep_links = keep_links
129 self.rich_text = rich_text
130 self.shared_strings = []
133 def read_manifest(self):
134 src = self.archive.read(ARC_CONTENT_TYPES)
135 root = fromstring(src)
136 self.package = Manifest.from_tree(root)
139 def read_strings(self):
140 ct = self.package.find(SHARED_STRINGS)
141 reader = read_string_table
142 if self.rich_text:
143 reader = read_rich_text
144 if ct is not None:
145 strings_path = ct.PartName[1:]
146 with self.archive.open(strings_path,) as src:
147 self.shared_strings = reader(src)
150 def read_workbook(self):
151 wb_part = _find_workbook_part(self.package)
152 self.parser = WorkbookParser(self.archive, wb_part.PartName[1:], keep_links=self.keep_links)
153 self.parser.parse()
154 wb = self.parser.wb
155 wb._sheets = []
156 wb._data_only = self.data_only
157 wb._read_only = self.read_only
158 wb.template = wb_part.ContentType in (XLTX, XLTM)
160 # If are going to preserve the vba then attach a copy of the archive to the
161 # workbook so that is available for the save.
162 if self.keep_vba:
163 wb.vba_archive = ZipFile(BytesIO(), 'a', ZIP_DEFLATED)
164 for name in self.valid_files:
165 wb.vba_archive.writestr(name, self.archive.read(name))
167 if self.read_only:
168 wb._archive = self.archive
170 self.wb = wb
173 def read_properties(self):
174 if ARC_CORE in self.valid_files:
175 src = fromstring(self.archive.read(ARC_CORE))
176 self.wb.properties = DocumentProperties.from_tree(src)
179 def read_custom(self):
180 if ARC_CUSTOM in self.valid_files:
181 src = fromstring(self.archive.read(ARC_CUSTOM))
182 self.wb.custom_doc_props = CustomPropertyList.from_tree(src)
185 def read_theme(self):
186 if ARC_THEME in self.valid_files:
187 self.wb.loaded_theme = self.archive.read(ARC_THEME)
190 def read_chartsheet(self, sheet, rel):
191 sheet_path = rel.target
192 rels_path = get_rels_path(sheet_path)
193 rels = []
194 if rels_path in self.valid_files:
195 rels = get_dependents(self.archive, rels_path)
197 with self.archive.open(sheet_path, "r") as src:
198 xml = src.read()
199 node = fromstring(xml)
200 cs = Chartsheet.from_tree(node)
201 cs._parent = self.wb
202 cs.title = sheet.name
203 self.wb._add_sheet(cs)
205 drawings = rels.find(SpreadsheetDrawing._rel_type)
206 for rel in drawings:
207 charts, images = find_images(self.archive, rel.target)
208 for c in charts:
209 cs.add_chart(c)
212 def read_worksheets(self):
213 comment_warning = """Cell '{0}':{1} is part of a merged range but has a comment which will be removed because merged cells cannot contain any data."""
214 for sheet, rel in self.parser.find_sheets():
215 if rel.target not in self.valid_files:
216 continue
218 if "chartsheet" in rel.Type:
219 self.read_chartsheet(sheet, rel)
220 continue
222 rels_path = get_rels_path(rel.target)
223 rels = RelationshipList()
224 if rels_path in self.valid_files:
225 rels = get_dependents(self.archive, rels_path)
227 if self.read_only:
228 ws = ReadOnlyWorksheet(self.wb, sheet.name, rel.target, self.shared_strings)
229 ws.sheet_state = sheet.state
230 self.wb._sheets.append(ws)
231 continue
232 else:
233 fh = self.archive.open(rel.target)
234 ws = self.wb.create_sheet(sheet.name)
235 ws._rels = rels
236 ws_parser = WorksheetReader(ws, fh, self.shared_strings, self.data_only, self.rich_text)
237 ws_parser.bind_all()
239 # assign any comments to cells
240 for r in rels.find(COMMENTS_NS):
241 src = self.archive.read(r.target)
242 comment_sheet = CommentSheet.from_tree(fromstring(src))
243 for ref, comment in comment_sheet.comments:
244 try:
245 ws[ref].comment = comment
246 except AttributeError:
247 c = ws[ref]
248 if isinstance(c, MergedCell):
249 warnings.warn(comment_warning.format(ws.title, c.coordinate))
250 continue
252 # preserve link to VML file if VBA
253 if self.wb.vba_archive and ws.legacy_drawing:
254 ws.legacy_drawing = rels[ws.legacy_drawing].target
255 else:
256 ws.legacy_drawing = None
258 for t in ws_parser.tables:
259 src = self.archive.read(t)
260 xml = fromstring(src)
261 table = Table.from_tree(xml)
262 ws.add_table(table)
264 drawings = rels.find(SpreadsheetDrawing._rel_type)
265 for rel in drawings:
266 charts, images = find_images(self.archive, rel.target)
267 for c in charts:
268 ws.add_chart(c, c.anchor)
269 for im in images:
270 ws.add_image(im, im.anchor)
272 pivot_rel = rels.find(TableDefinition.rel_type)
273 for r in pivot_rel:
274 pivot_path = r.Target
275 src = self.archive.read(pivot_path)
276 tree = fromstring(src)
277 pivot = TableDefinition.from_tree(tree)
278 pivot.cache = self.parser.pivot_caches[pivot.cacheId]
279 ws.add_pivot(pivot)
281 ws.sheet_state = sheet.state
284 def read(self):
285 action = "read manifest"
286 try:
287 self.read_manifest()
288 action = "read strings"
289 self.read_strings()
290 action = "read workbook"
291 self.read_workbook()
292 action = "read properties"
293 self.read_properties()
294 action = "read custom properties"
295 self.read_custom()
296 action = "read theme"
297 self.read_theme()
298 action = "read stylesheet"
299 apply_stylesheet(self.archive, self.wb)
300 action = "read worksheets"
301 self.read_worksheets()
302 action = "assign names"
303 self.parser.assign_names()
304 if not self.read_only:
305 self.archive.close()
306 except ValueError as e:
307 raise ValueError(
308 f"Unable to read workbook: could not {action} from {self.archive.filename}.\n"
309 "This is most probably because the workbook source files contain some invalid XML.\n"
310 "Please see the exception for more details."
311 ) from e
314def load_workbook(filename, read_only=False, keep_vba=KEEP_VBA,
315 data_only=False, keep_links=True, rich_text=False):
316 """Open the given filename and return the workbook
318 :param filename: the path to open or a file-like object
319 :type filename: string or a file-like object open in binary mode c.f., :class:`zipfile.ZipFile`
321 :param read_only: optimised for reading, content cannot be edited
322 :type read_only: bool
324 :param keep_vba: preserve vba content (this does NOT mean you can use it)
325 :type keep_vba: bool
327 :param data_only: controls whether cells with formulae have either the formula (default) or the value stored the last time Excel read the sheet
328 :type data_only: bool
330 :param keep_links: whether links to external workbooks should be preserved. The default is True
331 :type keep_links: bool
333 :param rich_text: if set to True openpyxl will preserve any rich text formatting in cells. The default is False
334 :type rich_text: bool
336 :rtype: :class:`openpyxl.workbook.Workbook`
338 .. note::
340 When using lazy load, all worksheets will be :class:`openpyxl.worksheet.iter_worksheet.IterableWorksheet`
341 and the returned workbook will be read-only.
343 """
344 reader = ExcelReader(filename, read_only, keep_vba,
345 data_only, keep_links, rich_text)
346 reader.read()
347 return reader.wb