Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/openpyxl/reader/excel.py: 21%

4"""Read an xlsx file into Python"""

6# Python stdlib imports

7from zipfile import ZipFile, ZIP_DEFLATED

8from io import BytesIO

9import os.path

10import warnings

12from openpyxl.pivot.table import TableDefinition

14# Allow blanket setting of KEEP_VBA for testing

15try:

16 from ..tests import KEEP_VBA

17except ImportError:

18 KEEP_VBA = False

20# package imports

21from openpyxl.utils.exceptions import InvalidFileException

22from openpyxl.xml.constants import (

23 ARC_CORE,

24 ARC_CUSTOM,

25 ARC_CONTENT_TYPES,

26 ARC_WORKBOOK,

27 ARC_THEME,

28 COMMENTS_NS,

29 SHARED_STRINGS,

30 XLTM,

31 XLTX,

32 XLSM,

33 XLSX,

34)

35from openpyxl.cell import MergedCell

36from openpyxl.comments.comment_sheet import CommentSheet

38from .strings import read_string_table, read_rich_text

39from .workbook import WorkbookParser

40from openpyxl.styles.stylesheet import apply_stylesheet

42from openpyxl.packaging.core import DocumentProperties

43from openpyxl.packaging.custom import CustomPropertyList

44from openpyxl.packaging.manifest import Manifest, Override

46from openpyxl.packaging.relationship import (

47 RelationshipList,

48 get_dependents,

49 get_rels_path,

50)

52from openpyxl.worksheet._read_only import ReadOnlyWorksheet

53from openpyxl.worksheet._reader import WorksheetReader

54from openpyxl.chartsheet import Chartsheet

55from openpyxl.worksheet.table import Table

56from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing

58from openpyxl.xml.functions import fromstring

60from .drawings import find_images

63SUPPORTED_FORMATS = ('.xlsx', '.xlsm', '.xltx', '.xltm')

66def _validate_archive(filename):

67 """

68 Does a first check whether filename is a string or a file-like

69 object. If it is a string representing a filename, a check is done

70 for supported formats by checking the given file-extension. If the

71 file-extension is not in SUPPORTED_FORMATS an InvalidFileException

72 will raised. Otherwise the filename (resp. file-like object) will

73 forwarded to zipfile.ZipFile returning a ZipFile-Instance.

74 """

75 is_file_like = hasattr(filename, 'read')

76 if not is_file_like:

77 file_format = os.path.splitext(filename)[-1].lower()

78 if file_format not in SUPPORTED_FORMATS:

79 if file_format == '.xls':

80 msg = ('openpyxl does not support the old .xls file format, '

81 'please use xlrd to read this file, or convert it to '

82 'the more recent .xlsx file format.')

83 elif file_format == '.xlsb':

84 msg = ('openpyxl does not support binary format .xlsb, '

85 'please convert this file to .xlsx format if you want '

86 'to open it with openpyxl')

87 else:

88 msg = ('openpyxl does not support %s file format, '

89 'please check you can open '

90 'it with Excel first. '

91 'Supported formats are: %s') % (file_format,

92 ','.join(SUPPORTED_FORMATS))

93 raise InvalidFileException(msg)

95 archive = ZipFile(filename, 'r')

96 return archive

99def _find_workbook_part(package):

100 workbook_types = [XLTM, XLTX, XLSM, XLSX]

101 for ct in workbook_types:

102 part = package.find(ct)

103 if part:

104 return part

105

106 # some applications reassign the default for application/xml

107 defaults = {p.ContentType for p in package.Default}

108 workbook_type = defaults & set(workbook_types)

109 if workbook_type:

110 return Override("/" + ARC_WORKBOOK, workbook_type.pop())

111

112 raise IOError("File contains no valid workbook part")

113

114

115class ExcelReader:

116

117 """

118 Read an Excel package and dispatch the contents to the relevant modules

119 """

120

121 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,

122 data_only=False, keep_links=True, rich_text=False):

123 self.archive = _validate_archive(fn)

124 self.valid_files = self.archive.namelist()

125 self.read_only = read_only

126 self.keep_vba = keep_vba

127 self.data_only = data_only

128 self.keep_links = keep_links

129 self.rich_text = rich_text

130 self.shared_strings = []

131

132

133 def read_manifest(self):

134 src = self.archive.read(ARC_CONTENT_TYPES)

135 root = fromstring(src)

136 self.package = Manifest.from_tree(root)

137

138

139 def read_strings(self):

140 ct = self.package.find(SHARED_STRINGS)

141 reader = read_string_table

142 if self.rich_text:

143 reader = read_rich_text

144 if ct is not None:

145 strings_path = ct.PartName[1:]

146 with self.archive.open(strings_path,) as src:

147 self.shared_strings = reader(src)

148

149

150 def read_workbook(self):

151 wb_part = _find_workbook_part(self.package)

152 self.parser = WorkbookParser(self.archive, wb_part.PartName[1:], keep_links=self.keep_links)

153 self.parser.parse()

154 wb = self.parser.wb

155 wb._sheets = []

156 wb._data_only = self.data_only

157 wb._read_only = self.read_only

158 wb.template = wb_part.ContentType in (XLTX, XLTM)

159

160 # If are going to preserve the vba then attach a copy of the archive to the

161 # workbook so that is available for the save.

162 if self.keep_vba:

163 wb.vba_archive = ZipFile(BytesIO(), 'a', ZIP_DEFLATED)

164 for name in self.valid_files:

165 wb.vba_archive.writestr(name, self.archive.read(name))

166

167 if self.read_only:

168 wb._archive = self.archive

169

170 self.wb = wb

171

172

173 def read_properties(self):

174 if ARC_CORE in self.valid_files:

175 src = fromstring(self.archive.read(ARC_CORE))

176 self.wb.properties = DocumentProperties.from_tree(src)

177

178

179 def read_custom(self):

180 if ARC_CUSTOM in self.valid_files:

181 src = fromstring(self.archive.read(ARC_CUSTOM))

182 self.wb.custom_doc_props = CustomPropertyList.from_tree(src)

183

184

185 def read_theme(self):

186 if ARC_THEME in self.valid_files:

187 self.wb.loaded_theme = self.archive.read(ARC_THEME)

188

189

190 def read_chartsheet(self, sheet, rel):

191 sheet_path = rel.target

192 rels_path = get_rels_path(sheet_path)

193 rels = []

194 if rels_path in self.valid_files:

195 rels = get_dependents(self.archive, rels_path)

196

197 with self.archive.open(sheet_path, "r") as src:

198 xml = src.read()

199 node = fromstring(xml)

200 cs = Chartsheet.from_tree(node)

201 cs._parent = self.wb

202 cs.title = sheet.name

203 self.wb._add_sheet(cs)

204

205 drawings = rels.find(SpreadsheetDrawing._rel_type)

206 for rel in drawings:

207 charts, images = find_images(self.archive, rel.target)

208 for c in charts:

209 cs.add_chart(c)

210

211

212 def read_worksheets(self):

213 comment_warning = """Cell '{0}':{1} is part of a merged range but has a comment which will be removed because merged cells cannot contain any data."""

214 for sheet, rel in self.parser.find_sheets():

215 if rel.target not in self.valid_files:

216 continue

217

218 if "chartsheet" in rel.Type:

219 self.read_chartsheet(sheet, rel)

220 continue

221

222 rels_path = get_rels_path(rel.target)

223 rels = RelationshipList()

224 if rels_path in self.valid_files:

225 rels = get_dependents(self.archive, rels_path)

226

227 if self.read_only:

228 ws = ReadOnlyWorksheet(self.wb, sheet.name, rel.target, self.shared_strings)

229 ws.sheet_state = sheet.state

230 self.wb._sheets.append(ws)

231 continue

232 else:

233 fh = self.archive.open(rel.target)

234 ws = self.wb.create_sheet(sheet.name)

235 ws._rels = rels

236 ws_parser = WorksheetReader(ws, fh, self.shared_strings, self.data_only, self.rich_text)

237 ws_parser.bind_all()

238 fh.close()

239

240 # assign any comments to cells

241 for r in rels.find(COMMENTS_NS):

242 src = self.archive.read(r.target)

243 comment_sheet = CommentSheet.from_tree(fromstring(src))

244 for ref, comment in comment_sheet.comments:

245 try:

246 ws[ref].comment = comment

247 except AttributeError:

248 c = ws[ref]

249 if isinstance(c, MergedCell):

250 warnings.warn(comment_warning.format(ws.title, c.coordinate))

251 continue

252

253 # preserve link to VML file if VBA

254 if self.wb.vba_archive and ws.legacy_drawing:

255 ws.legacy_drawing = rels.get(ws.legacy_drawing).target

256 else:

257 ws.legacy_drawing = None

258

259 for t in ws_parser.tables:

260 src = self.archive.read(t)

261 xml = fromstring(src)

262 table = Table.from_tree(xml)

263 ws.add_table(table)

264

265 drawings = rels.find(SpreadsheetDrawing._rel_type)

266 for rel in drawings:

267 charts, images = find_images(self.archive, rel.target)

268 for c in charts:

269 ws.add_chart(c, c.anchor)

270 for im in images:

271 ws.add_image(im, im.anchor)

272

273 pivot_rel = rels.find(TableDefinition.rel_type)

274 pivot_caches = self.parser.pivot_caches

275 for r in pivot_rel:

276 pivot_path = r.Target

277 src = self.archive.read(pivot_path)

278 tree = fromstring(src)

279 pivot = TableDefinition.from_tree(tree)

280 pivot.cache = pivot_caches[pivot.cacheId]

281 ws.add_pivot(pivot)

282

283 ws.sheet_state = sheet.state

284

285

286 def read(self):

287 action = "read manifest"

288 try:

289 self.read_manifest()

290 action = "read strings"

291 self.read_strings()

292 action = "read workbook"

293 self.read_workbook()

294 action = "read properties"

295 self.read_properties()

296 action = "read custom properties"

297 self.read_custom()

298 action = "read theme"

299 self.read_theme()

300 action = "read stylesheet"

301 apply_stylesheet(self.archive, self.wb)

302 action = "read worksheets"

303 self.read_worksheets()

304 action = "assign names"

305 self.parser.assign_names()

306 if not self.read_only:

307 self.archive.close()

308 except ValueError as e:

309 raise ValueError(

310 f"Unable to read workbook: could not {action} from {self.archive.filename}.\n"

311 "This is most probably because the workbook source files contain some invalid XML.\n"

312 "Please see the exception for more details."

313 ) from e

314

315

316def load_workbook(filename, read_only=False, keep_vba=KEEP_VBA,

317 data_only=False, keep_links=True, rich_text=False):

318 """Open the given filename and return the workbook

319

320 :param filename: the path to open or a file-like object

321 :type filename: string or a file-like object open in binary mode c.f., :class:`zipfile.ZipFile`

322

323 :param read_only: optimised for reading, content cannot be edited

324 :type read_only: bool

325

326 :param keep_vba: preserve vba content (this does NOT mean you can use it)

327 :type keep_vba: bool

328

329 :param data_only: controls whether cells with formulae have either the formula (default) or the value stored the last time Excel read the sheet

330 :type data_only: bool

331

332 :param keep_links: whether links to external workbooks should be preserved. The default is True

333 :type keep_links: bool

334

335 :param rich_text: if set to True openpyxl will preserve any rich text formatting in cells. The default is False

336 :type rich_text: bool

337

338 :rtype: :class:`openpyxl.workbook.Workbook`

339

340 .. note::

341

342 When using lazy load, all worksheets will be :class:`openpyxl.worksheet.iter_worksheet.IterableWorksheet`

343 and the returned workbook will be read-only.

344

345 """

346 reader = ExcelReader(filename, read_only, keep_vba,

347 data_only, keep_links, rich_text)

348 reader.read()

349 return reader.wb