Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/olefile/olefile.py: 42%

1186 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:37 +0000

1""" 

2olefile (formerly OleFileIO_PL) 

3 

4Module to read/write Microsoft OLE2 files (also called Structured Storage or 

5Microsoft Compound Document File Format), such as Microsoft Office 97-2003 

6documents, Image Composer and FlashPix files, Outlook messages, ... 

7This version is compatible with Python 2.7 and 3.5+ 

8 

9Project website: https://www.decalage.info/olefile 

10 

11olefile is copyright (c) 2005-2020 Philippe Lagadec 

12(https://www.decalage.info) 

13 

14olefile is based on the OleFileIO module from the PIL library v1.1.7 

15See: http://www.pythonware.com/products/pil/index.htm 

16and http://svn.effbot.org/public/tags/pil-1.1.7/PIL/OleFileIO.py 

17 

18The Python Imaging Library (PIL) is 

19Copyright (c) 1997-2009 by Secret Labs AB 

20Copyright (c) 1995-2009 by Fredrik Lundh 

21 

22See source code and LICENSE.txt for information on usage and redistribution. 

23""" 

24 

25# Since olefile v0.47, only Python 2.7 and 3.5+ are supported 

26# This import enables print() as a function rather than a keyword 

27# (main requirement to be compatible with Python 3.x) 

28# The comment on the line below should be printed on Python 2.5 or older: 

29from __future__ import print_function # This version of olefile requires Python 2.7 or 3.5+. 

30 

31 

32#--- LICENSE ------------------------------------------------------------------ 

33 

34# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2020 Philippe Lagadec 

35# (https://www.decalage.info) 

36# 

37# All rights reserved. 

38# 

39# Redistribution and use in source and binary forms, with or without modification, 

40# are permitted provided that the following conditions are met: 

41# 

42# * Redistributions of source code must retain the above copyright notice, this 

43# list of conditions and the following disclaimer. 

44# * Redistributions in binary form must reproduce the above copyright notice, 

45# this list of conditions and the following disclaimer in the documentation 

46# and/or other materials provided with the distribution. 

47# 

48# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 

49# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 

50# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 

51# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 

52# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 

53# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 

54# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 

55# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 

56# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 

57# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

58 

59# ---------- 

60# PIL License: 

61# 

62# olefile is based on source code from the OleFileIO module of the Python 

63# Imaging Library (PIL) published by Fredrik Lundh under the following license: 

64 

65# The Python Imaging Library (PIL) is 

66# Copyright (c) 1997-2009 by Secret Labs AB 

67# Copyright (c) 1995-2009 by Fredrik Lundh 

68# 

69# By obtaining, using, and/or copying this software and/or its associated 

70# documentation, you agree that you have read, understood, and will comply with 

71# the following terms and conditions: 

72# 

73# Permission to use, copy, modify, and distribute this software and its 

74# associated documentation for any purpose and without fee is hereby granted, 

75# provided that the above copyright notice appears in all copies, and that both 

76# that copyright notice and this permission notice appear in supporting 

77# documentation, and that the name of Secret Labs AB or the author(s) not be used 

78# in advertising or publicity pertaining to distribution of the software 

79# without specific, written prior permission. 

80# 

81# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS 

82# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 

83# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, 

84# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 

85# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 

86# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 

87# PERFORMANCE OF THIS SOFTWARE. 

88 

# Module metadata: release date, version string (PEP 396 style) and author,
# used by packaging tools and by applications checking the olefile version:
__date__ = "2020-10-07"
__version__ = '0.47.dev4'
__author__ = "Philippe Lagadec"

# Public API of the module: names exported by "from olefile import *"
__all__ = ['isOleFile', 'OleFileIO', 'OleMetadata', 'enable_logging',
           'MAGIC', 'STGTY_EMPTY',
           'STGTY_STREAM', 'STGTY_STORAGE', 'STGTY_ROOT', 'STGTY_PROPERTY',
           'STGTY_LOCKBYTES', 'MINIMAL_OLEFILE_SIZE',
           'DEFECT_UNSURE', 'DEFECT_POTENTIAL', 'DEFECT_INCORRECT',
           'DEFECT_FATAL', 'DEFAULT_PATH_ENCODING',
           'MAXREGSECT', 'DIFSECT', 'FATSECT', 'ENDOFCHAIN', 'FREESECT',
           'MAXREGSID', 'NOSTREAM', 'UNKNOWN_SIZE', 'WORD_CLSID',
           'OleFileIONotClosed'
]

103 

104import io 

105import sys 

106import struct, array, os.path, datetime, logging, warnings, traceback 

107 

#=== COMPATIBILITY WORKAROUNDS ================================================

# Python 3.x dropped the 'long' type: alias it to int there.
# (on Python 2, str is bytes, so this test detects Python 3)
if str is not bytes:
    long = int

# Use the memory-efficient range iterator on both Python 2 and 3:
# Python 2 calls it xrange, Python 3 renamed it to range.
try:
    iterrange = xrange
except Exception:
    iterrange = range

# [PL] workaround to fix an issue with array item size on 64 bits systems:
# pick an array typecode whose itemsize is 4 bytes (unsigned 32-bit),
# since the size of 'L'/'I' varies across platforms and implementations.
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
elif array.array('i').itemsize == 4:
    # On 64 bit Jython, signed integers ('i') are the only way to store our
    # 32 bit values in an array in a *somewhat* reasonable way, because Java
    # has no unsigned types and Jython's "array" (based on "jarray") doesn't
    # either. Values read from such arrays must be binary-anded with
    # 0xFFFFFFFF so the same unsigned comparisons work on all platforms;
    # the corresponding lines are flagged with a 'JYTHON-WORKAROUND' tag.
    UINT32 = 'i'
else:
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')


# [PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
# basestring disappeared in Python 3: fall back to str there.
# TODO: remove the use of basestring, as it was removed in Python 3
try:
    basestring
except NameError:
    basestring = str

# Default encoding used to decode stream/storage path names:
if sys.version_info[0] < 3:
    # On Python 2.x, the default encoding for path names is UTF-8:
    DEFAULT_PATH_ENCODING = 'utf-8'
else:
    # On Python 3.x, path names are kept as unicode strings (no decoding):
    DEFAULT_PATH_ENCODING = None

160 

161 

162# === LOGGING ================================================================= 

163 

def get_logger(name, level=logging.CRITICAL+1):
    """
    Create a suitable logger object for this module.

    The goal is not to change settings of the root logger, to avoid getting
    other modules' logs on the screen.
    If a logger already exists with the same name, it is reused instead of
    being re-created (otherwise it would accumulate duplicate handlers and
    messages would be doubled).
    The level is set to CRITICAL+1 by default, to avoid any logging.
    """
    # check first whether a logger with that name is already registered:
    already_registered = name in logging.Logger.manager.loggerDict
    logger = logging.getLogger(name)
    if not already_registered:
        # brand new logger: only attach a NullHandler, it is up to the
        # application to configure its own logging:
        logger.addHandler(logging.NullHandler())
    # in both cases, make sure the requested level is applied:
    logger.setLevel(level)
    return logger

189 

190 

# a global logger object used for debugging:
log = get_logger('olefile')


def enable_logging():
    """
    Enable logging for this module (disabled by default).

    This sets the module-specific logger level to NOTSET, which means the
    main application controls the actual logging level.
    """
    log.setLevel(logging.NOTSET)

202 

203 

#=== CONSTANTS ===============================================================

#: magic bytes that should be at the beginning of every OLE file:
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'

# [PL]: added constants for Sector IDs (from AAF specifications)
MAXREGSECT = 0xFFFFFFFA  #: (-6) maximum SECT
DIFSECT = 0xFFFFFFFC     #: (-4) denotes a DIFAT sector in a FAT
FATSECT = 0xFFFFFFFD     #: (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE  #: (-2) end of a virtual stream chain
FREESECT = 0xFFFFFFFF    #: (-1) unallocated sector

# [PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID = 0xFFFFFFFA  #: (-6) maximum directory entry ID
NOSTREAM = 0xFFFFFFFF   #: (-1) unallocated directory entry

# [PL] object types in storage (from AAF specifications)
STGTY_EMPTY = 0      #: empty directory entry
STGTY_STORAGE = 1    #: element is a storage object
STGTY_STREAM = 2     #: element is a stream object
STGTY_LOCKBYTES = 3  #: element is an ILockBytes object
STGTY_PROPERTY = 4   #: element is an IPropertyStorage object
STGTY_ROOT = 5       #: element is a root storage

# Unknown size for a stream (used by OleStream):
UNKNOWN_SIZE = 0x7FFFFFFF

# --------------------------------------------------------------------
# property types (VT_* codes used in property set streams)

VT_EMPTY = 0
VT_NULL = 1
VT_I2 = 2
VT_I4 = 3
VT_R4 = 4
VT_R8 = 5
VT_CY = 6
VT_DATE = 7
VT_BSTR = 8
VT_DISPATCH = 9
VT_ERROR = 10
VT_BOOL = 11
VT_VARIANT = 12
VT_UNKNOWN = 13
VT_DECIMAL = 14
VT_I1 = 16
VT_UI1 = 17
VT_UI2 = 18
VT_UI4 = 19
VT_I8 = 20
VT_UI8 = 21
VT_INT = 22
VT_UINT = 23
VT_VOID = 24
VT_HRESULT = 25
VT_PTR = 26
VT_SAFEARRAY = 27
VT_CARRAY = 28
VT_USERDEFINED = 29
VT_LPSTR = 30
VT_LPWSTR = 31
VT_FILETIME = 64
VT_BLOB = 65
VT_STREAM = 66
VT_STORAGE = 67
VT_STREAMED_OBJECT = 68
VT_STORED_OBJECT = 69
VT_BLOB_OBJECT = 70
VT_CF = 71
VT_CLSID = 72
VT_VECTOR = 0x1000

# map property id to name (for debugging purposes)
VT = {}
for keyword, var in list(vars().items()):
    if keyword.startswith("VT_"):
        VT[var] = keyword

# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
# TODO: check Excel, PPT, ...

# [PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE = 10     # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20  # a potential defect
DEFECT_INCORRECT = 30  # an error according to specifications, but parsing
                       # can go on
DEFECT_FATAL = 40      # an error which cannot be ignored, parsing is
                       # impossible

# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFileIO.open)
MINIMAL_OLEFILE_SIZE = 1536

269 

270#=== FUNCTIONS =============================================================== 

271 

def isOleFile (filename):
    """
    Test if a file is an OLE container (according to the magic bytes in its header).

    .. note::
        This function only checks the first 8 bytes of the file, not the
        rest of the OLE structure.

    .. versionadded:: 0.16

    :param filename: filename, contents or file-like object of the OLE file (string-like or file-like object)

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read and seek methods),
          it is parsed as-is.

    :type filename: bytes or str or unicode or file
    :returns: True if OLE, False otherwise.
    :rtype: bool
    """
    magic_len = len(MAGIC)
    if hasattr(filename, 'read'):
        # file-like object: read the header directly, then seek back to the
        # start of the file just in case the caller reads it afterwards:
        header = filename.read(magic_len)
        filename.seek(0)
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # bytes string long enough to hold a whole OLE file: parse in memory
        header = filename[:magic_len]
    else:
        # anything else is treated as the path of a file on disk:
        with open(filename, 'rb') as fp:
            header = fp.read(magic_len)
    return header == MAGIC

312 

313 

# i8: convert one element of a bytes string to an integer byte value.
if bytes is str:
    # version for Python 2.x: indexing a byte string yields a 1-char
    # string, so ord() is needed:
    def i8(c):
        return ord(c)
else:
    # version for Python 3.x: indexing bytes already yields an int;
    # accept either an int as-is, or take the first byte of a bytes-like:
    def i8(c):
        if c.__class__ is int:
            return c
        return c[0]

322 

323 

def i16(c, o = 0):
    """
    Convert 2 bytes (16 bits) at offset *o* in string *c* to an unsigned
    integer, little-endian.

    :param c: string containing bytes to convert
    :param o: offset of bytes to convert in string
    """
    word = c[o:o+2]
    return struct.unpack("<H", word)[0]

332 

333 

def i32(c, o = 0):
    """
    Convert 4 bytes (32 bits) at offset *o* in string *c* to an unsigned
    integer, little-endian.

    :param c: string containing bytes to convert
    :param o: offset of bytes to convert in string
    """
    dword = c[o:o+4]
    return struct.unpack("<I", dword)[0]

342 

343 

def _clsid(clsid):
    """
    Convert a CLSID to a human-readable GUID string.

    :param clsid: bytes string of length 16.
    :returns: formatted GUID string, or "" if clsid is only null bytes.
    """
    assert len(clsid) == 16
    # if clsid is only made of null bytes, return an empty string:
    # (PL: why not simply return the string with zeroes?)
    if not clsid.strip(b"\0"):
        return ""
    fmt = "%08X-%04X-%04X-%02X%02X-" + "%02X" * 6
    fields = (i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + tuple(map(i8, clsid[8:16]))
    return fmt % fields

358 

359 

360 

def filetime2datetime(filetime):
    """
    Convert a FILETIME value (64-bit int counting 100 ns intervals since
    1601-01-01) to a Python datetime.datetime object.
    """
    # TODO: manage exception when microseconds is too large
    # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
    epoch = datetime.datetime(1601, 1, 1, 0, 0, 0)
    # FILETIME counts 100 ns units, so divide by 10 to get microseconds:
    return epoch + datetime.timedelta(microseconds=filetime // 10)

370 

371 

372 

373#=== CLASSES ================================================================== 

374 

class OleFileError(IOError):
    """
    Base class for all errors raised by this module.
    """
    pass

380 

class NotOleFileError(OleFileError):
    """
    Error raised when the file being opened is not a valid OLE file.
    """
    pass

386 

class OleMetadata:
    """
    Class to parse and store metadata from standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd942545.aspx
    - https://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - https://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/articles/summary-information-stream/
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    New in version 0.25
    """

    # attribute names for SummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
                       'template', 'last_saved_by', 'revision_number', 'total_edit_time',
                       'last_printed', 'create_time', 'last_saved_time', 'num_pages',
                       'num_words', 'num_chars', 'thumbnail', 'creating_application',
                       'security']

    # attribute names for DocumentSummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
                      'slides', 'notes', 'hidden_slides', 'mm_clips',
                      'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
                      'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
                      'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
                      'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Constructor for OleMetadata.
        All attributes are set to None by default.
        """
        # reset every known property attribute in one pass, instead of one
        # assignment per attribute:
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)

    def parse_properties(self, ole_file):
        """
        Parse standard properties of an OLE file, from the streams
        ``\\x05SummaryInformation`` and ``\\x05DocumentSummaryInformation``,
        if present.
        Properties are converted to strings, integers or python datetime objects.
        If a property is not present, its value is set to None.

        :param ole_file: OleFileIO object from which to parse properties
        """
        # first set all attributes to None:
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)
        if ole_file.exists("\x05SummaryInformation"):
            # get properties from the stream:
            # (converting timestamps to python datetime, except total_edit_time,
            # which is property #10)
            props = ole_file.getproperties("\x05SummaryInformation",
                                           convert_time=True, no_conversion=[10])
            # store them into this object's attributes
            # (ids for standard properties start at 0x01):
            for index, attrib in enumerate(self.SUMMARY_ATTRIBS):
                setattr(self, attrib, props.get(index + 1, None))
        if ole_file.exists("\x05DocumentSummaryInformation"):
            # get properties from the stream:
            props = ole_file.getproperties("\x05DocumentSummaryInformation",
                                           convert_time=True)
            # store them into this object's attributes:
            for index, attrib in enumerate(self.DOCSUM_ATTRIBS):
                setattr(self, attrib, props.get(index + 1, None))

    def dump(self):
        """
        Print all metadata attributes, for debugging purposes.
        """
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            print('- {}: {}'.format(prop, repr(getattr(self, prop))))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            print('- {}: {}'.format(prop, repr(getattr(self, prop))))

540 

class OleFileIONotClosed(RuntimeWarning):
    """
    Warning type used when an OleFileIO is destructed while it still has an
    open file handle.
    """
    def __init__(self, stack_of_open=None):
        """
        :param stack_of_open: optional traceback.StackSummary-like object
            recording where open() was called, shown in the message.
        """
        super(OleFileIONotClosed, self).__init__()
        self.stack_of_open = stack_of_open

    def __str__(self):
        msg = 'Deleting OleFileIO instance with open file handle. ' \
              'You should ensure that OleFileIO is never deleted ' \
              'without calling close() first. Consider using '\
              '"with OleFileIO(...) as ole: ...".'
        if not self.stack_of_open:
            return msg
        # append the recorded stack trace of the open() call:
        return ''.join([msg, '\n', 'Stacktrace of open() call:\n'] +
                       self.stack_of_open.format())

559 

560 

561# --- OleStream --------------------------------------------------------------- 

562 

class OleStream(io.BytesIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the BytesIO class).
    To open a stream, use the openstream method in the OleFileIO class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:

        - size: actual size of data stream, after it was opened.
    """
    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize, olefileio):
        """
        Constructor for OleStream class.

        Reads the whole stream into memory by following its sector chain in
        the given FAT, then initializes the underlying BytesIO with the data.
        Malformed chains are reported through olefileio._raise_defect(), which
        either raises or lets parsing continue with truncated data.

        :param fp: file object, the OLE container or the MiniFAT stream
        :param sect: sector index of first sector in the stream
        :param size: total size of the stream
        :param offset: offset in bytes for the first FAT or MiniFAT sector
        :param sectorsize: size of one sector
        :param fat: array/list of sector indexes (FAT or MiniFAT)
        :param filesize: size of OLE file (for debugging)
        :param olefileio: OleFileIO object containing this stream
        :returns: a BytesIO instance containing the OLE stream
        """
        log.debug('OleStream.__init__:')
        log.debug('  sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        self.ole = olefileio
        # this check is necessary, otherwise when attempting to open a stream
        # from a closed OleFileIO, a stream of size zero is returned without
        # raising an exception. (see issue #81)
        if self.ole.fp.closed:
            raise OSError('Attempting to open a stream from a closed OLE File')
        # [PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size == UNKNOWN_SIZE:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            log.debug('  stream with UNKNOWN SIZE')
        # expected sector count = size rounded up to a whole sector:
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            self.ole._raise_defect(DEFECT_INCORRECT, 'malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            log.debug('size == 0 and sect != ENDOFCHAIN:')
            self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE sector index for empty stream')
        # [PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks: a FAT loop can never make us read more
        # than nb_sectors sectors.
        for i in range(nb_sectors):
            log.debug('Reading stream sector[%d] = %Xh' % (i, sect))
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    log.debug('Reached ENDOFCHAIN sector for stream with unknown size')
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    log.debug('sect=ENDOFCHAIN before expected size')
                    self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE stream')
            # sector index should be within FAT:
            # (note: ENDOFCHAIN also fails this test if the defect above was
            # ignored, so the loop still stops below)
            if sect<0 or sect>=len(fat):
                log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
##                tmp_data = b"".join(data)
##                f = open('test_debug.bin', 'wb')
##                f.write(tmp_data)
##                f.close()
##                log.debug('data read so far: %d bytes' % len(tmp_data))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
            # TODO: merge this code with OleFileIO.getsect() ?
            # TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            except Exception:
                log.debug('sect=%d, seek=%d, filesize=%d' %
                          (sect, offset+sectorsize*sect, filesize))
                self.ole._raise_defect(DEFECT_INCORRECT, 'OLE sector index out of range')
                # stop reading here if the exception is ignored:
                break
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                          (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            # (the & 0xFFFFFFFF mask makes Jython's signed array values
            # comparable as unsigned, see UINT32 workaround above)
            try:
                sect = fat[sect] & 0xFFFFFFFF  # JYTHON-WORKAROUND
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
        # [PL] Last sector should be a "end of chain" marker:
        # if sect != ENDOFCHAIN:
        #     raise IOError('incorrect last sector index in OLE stream')
        data = b"".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            log.debug('Read data of length %d, truncated to stream size %d' % (len(data), size))
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            log.debug('Read data of length %d, the stream size was unknown' % len(data))
            self.size = len(data)
        else:
            # read data is less than expected:
            log.debug('Read data of length %d, less than expected stream size %d' % (len(data), size))
            # TODO: provide details in exception message
            self.size = len(data)
            self.ole._raise_defect(DEFECT_INCORRECT, 'OLE stream size is less than declared')
        # when all data is read in memory, BytesIO constructor is called
        io.BytesIO.__init__(self, data)
        # Then the OleStream object can be used as a read-only file object.

709 

710 

711# --- OleDirectoryEntry ------------------------------------------------------- 

712 

class OleDirectoryEntry:
    """
    OLE2 Directory Entry pointing to a stream or a storage.

    Each entry is a node of the red-black tree stored in the OLE directory
    stream ([MS-CFB] 2.6). Entries are parsed from fixed 128-byte records.
    """
    # struct to parse directory entries:
    # <: little-endian byte order, standard sizes
    # (note: this should guarantee that Q returns a 64 bits int)
    # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
    # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
    # B: uint8, dir entry type (between 0 and 5)
    # B: uint8, color: 0=black, 1=red
    # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of child root node if it is a storage, else NOSTREAM
    # 16s: CLSID, unique identifier (only used if it is a storage)
    # I: uint32, user flags
    # Q (was 8s): uint64, creation timestamp or zero
    # Q (was 8s): uint64, modification timestamp or zero
    # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
    #     of stream containing ministreams if root entry, 0 otherwise
    # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
    # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
    STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII'
    # size of a directory entry: 128 bytes
    DIRENTRY_SIZE = 128
    # sanity check: the struct format above must describe exactly 128 bytes
    assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE

    def __init__(self, entry, sid, ole_file):
        """
        Constructor for an OleDirectoryEntry object.
        Parses a 128-bytes entry from the OLE Directory stream.

        :param bytes entry: bytes string (must be 128 bytes long)
        :param int sid: index of this directory entry in the OLE file directory
        :param OleFileIO ole_file: OleFileIO object containing this directory entry

        May raise an exception through ole_file._raise_defect if the entry is
        malformed, depending on the defect level chosen for the OleFileIO.
        """
        self.sid = sid
        # ref to ole_file is stored for future use
        self.olefile = ole_file
        # kids is a list of children entries, if this entry is a storage:
        # (list of OleDirectoryEntry objects)
        self.kids = []
        # kids_dict is a dictionary of children entries, indexed by their
        # name in lowercase: used to quickly find an entry, and to detect
        # duplicates
        self.kids_dict = {}
        # flag used to detect if the entry is referenced more than once in
        # directory:
        self.used = False
        # decode DirEntry
        (
            self.name_raw,    # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
            self.namelength,  # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
            self.entry_type,
            self.color,
            self.sid_left,
            self.sid_right,
            self.sid_child,
            clsid,
            self.dwUserFlags,
            self.createTime,
            self.modifyTime,
            self.isectStart,
            self.sizeLow,
            self.sizeHigh
        ) = struct.unpack(OleDirectoryEntry.STRUCT_DIRENTRY, entry)
        if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
            ole_file._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
        # only first directory entry can (and should) be root:
        if self.entry_type == STGTY_ROOT and sid != 0:
            ole_file._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
        if sid == 0 and self.entry_type != STGTY_ROOT:
            ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
        # log.debug(struct.unpack(fmt_entry, entry[:len_entry]))
        # name should be at most 31 unicode characters + null character,
        # so 64 bytes in total (31*2 + 2):
        if self.namelength > 64:
            ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes')
            # if exception not raised, namelength is set to the maximum value:
            self.namelength = 64
        # only characters without ending null char are kept:
        self.name_utf16 = self.name_raw[:(self.namelength-2)]
        # TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
        # TODO: check if the name does not contain forbidden characters:
        # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
        # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
        self.name = ole_file._decode_utf16_str(self.name_utf16)

        log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
        log.debug(' - type: %d' % self.entry_type)
        log.debug(' - sect: %Xh' % self.isectStart)
        log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
                  self.sid_right, self.sid_child))

        # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
        # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
        # or some other value so it cannot be raised as a defect in general:
        if ole_file.sectorsize == 512:
            if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF:
                log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                          (ole_file.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh))
                ole_file._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
            self.size = self.sizeLow
        else:
            # 4K sectors: combine low and high 32 bits into a 64-bit size
            self.size = self.sizeLow + (long(self.sizeHigh)<<32)
        log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh))

        self.clsid = _clsid(clsid)
        # a storage should have a null size, BUT some implementations such as
        # Word 8 for Mac seem to allow non-null values => Potential defect:
        if self.entry_type == STGTY_STORAGE and self.size != 0:
            ole_file._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
        # check if stream is not already referenced elsewhere:
        self.is_minifat = False
        if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
            if self.size < ole_file.minisectorcutoff \
            and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
                # ministream object
                self.is_minifat = True
            else:
                self.is_minifat = False
            ole_file._check_duplicate_stream(self.isectStart, self.is_minifat)
        # sect_chain is computed lazily by build_sect_chain:
        self.sect_chain = None

    def build_sect_chain(self, ole_file):
        """
        Build the sector chain for a stream (from the FAT or the MiniFAT)

        :param OleFileIO ole_file: OleFileIO object containing this directory entry
        :return: nothing
        """
        # TODO: seems to be used only from _write_mini_stream, is it useful?
        # TODO: use self.olefile instead of ole_file
        # already built, or nothing to build for non-stream/empty entries:
        if self.sect_chain:
            return
        if self.entry_type not in (STGTY_ROOT, STGTY_STREAM) or self.size == 0:
            return

        self.sect_chain = list()

        # make sure the MiniFAT is loaded before following a MiniFAT chain:
        if self.is_minifat and not ole_file.minifat:
            ole_file.loadminifat()

        # follow the chain of sector indexes until the terminator:
        next_sect = self.isectStart
        while next_sect != ENDOFCHAIN:
            self.sect_chain.append(next_sect)
            if self.is_minifat:
                next_sect = ole_file.minifat[next_sect]
            else:
                next_sect = ole_file.fat[next_sect]

    def build_storage_tree(self):
        """
        Read and build the red-black tree attached to this OleDirectoryEntry
        object, if it is a storage.
        Note that this method builds a tree of all subentries, so it should
        only be called for the root object once.
        """
        log.debug('build_storage_tree: SID=%d - %s - sid_child=%d'
                  % (self.sid, repr(self.name), self.sid_child))
        if self.sid_child != NOSTREAM:
            # if child SID is not NOSTREAM, then this entry is a storage.
            # Let's walk through the tree of children to fill the kids list:
            self.append_kids(self.sid_child)

            # Note from OpenOffice documentation: the safest way is to
            # recreate the tree because some implementations may store broken
            # red-black trees...

            # in the OLE file, entries are sorted on (length, name).
            # for convenience, we sort them on name instead:
            # (see rich comparison methods in this class)
            self.kids.sort()

    def append_kids(self, child_sid):
        """
        Walk through red-black tree of children of this directory entry to add
        all of them to the kids list. (recursive method)

        :param child_sid: index of child directory entry to use, or None when called
            first time for the root. (only used during recursion)
        """
        log.debug('append_kids: child_sid=%d' % child_sid)
        # [PL] this method was added to use simple recursion instead of a complex
        # algorithm.
        # if this is not a storage or a leaf of the tree, nothing to do:
        if child_sid == NOSTREAM:
            return
        # check if child SID is in the proper range:
        if child_sid<0 or child_sid>=len(self.olefile.direntries):
            self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE DirEntry index out of range')
        else:
            # get child direntry:
            child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
            log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
                      % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
            # Check if kid was not already referenced in a storage:
            if child.used:
                self.olefile._raise_defect(DEFECT_INCORRECT,
                                           'OLE Entry referenced more than once')
                return
            child.used = True
            # the directory entries are organized as a red-black tree.
            # (cf. Wikipedia for details)
            # First walk through left side of the tree:
            self.append_kids(child.sid_left)
            # Check if its name is not already used (case-insensitive):
            name_lower = child.name.lower()
            if name_lower in self.kids_dict:
                self.olefile._raise_defect(DEFECT_INCORRECT,
                                           "Duplicate filename in OLE storage")
            # Then the child_sid OleDirectoryEntry object is appended to the
            # kids list and dictionary:
            self.kids.append(child)
            self.kids_dict[name_lower] = child
            # Finally walk through right side of the tree:
            self.append_kids(child.sid_right)
            # Afterwards build kid's own tree if it's also a storage:
            child.build_storage_tree()

    def __eq__(self, other):
        "Compare entries by name"
        return self.name == other.name

    def __lt__(self, other):
        "Compare entries by name"
        return self.name < other.name

    def __ne__(self, other):
        # needed for Python 2 compatibility; Python 3 derives it from __eq__
        return not self.__eq__(other)

    def __le__(self, other):
        return self.__eq__(other) or self.__lt__(other)

    # Reflected __lt__() and __le__() will be used for __gt__() and __ge__()

    # TODO: replace by the same function as MS implementation ?
    # (order by name length first, then case-insensitive order)

    def dump(self, tab = 0):
        "Dump this entry, and all its subentries (for debug purposes only)"
        TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
                 "(property)", "(root)"]
        try:
            type_name = TYPES[self.entry_type]
        except IndexError:
            type_name = '(UNKNOWN)'
        print(" "*tab + repr(self.name), type_name, end=' ')
        if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
            print(self.size, "bytes", end=' ')
        print()
        if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
            print(" "*tab + "{%s}" % self.clsid)

        for kid in self.kids:
            kid.dump(tab + 2)

    def getmtime(self):
        """
        Return modification time of a directory entry.

        :returns: None if modification time is null, a python datetime object
            otherwise (UTC timezone)

        new in version 0.26
        """
        if self.modifyTime == 0:
            return None
        return filetime2datetime(self.modifyTime)


    def getctime(self):
        """
        Return creation time of a directory entry.

        :returns: None if creation time is null, a python datetime object
            otherwise (UTC timezone)

        new in version 0.26
        """
        if self.createTime == 0:
            return None
        return filetime2datetime(self.createTime)

996 

997 

998#--- OleFileIO ---------------------------------------------------------------- 

999 

1000class OleFileIO: 

1001 """ 

1002 OLE container object 

1003 

1004 This class encapsulates the interface to an OLE 2 structured 

1005 storage file. Use the listdir and openstream methods to 

1006 access the contents of this file. 

1007 

1008 Object names are given as a list of strings, one for each subentry 

1009 level. The root entry should be omitted. For example, the following 

1010 code extracts all image streams from a Microsoft Image Composer file:: 

1011 

1012 with OleFileIO("fan.mic") as ole: 

1013 

1014 for entry in ole.listdir(): 

1015 if entry[1:2] == "Image": 

1016 fin = ole.openstream(entry) 

1017 fout = open(entry[0:1], "wb") 

1018 while True: 

1019 s = fin.read(8192) 

1020 if not s: 

1021 break 

1022 fout.write(s) 

1023 

1024 You can use the viewer application provided with the Python Imaging 

1025 Library to view the resulting files (which happens to be standard 

1026 TIFF files). 

1027 """ 

1028 

    def __init__(self, filename=None, raise_defects=DEFECT_FATAL,
                 write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING):
        """
        Constructor for the OleFileIO class.

        :param filename: file to open.

            - if filename is a string smaller than 1536 bytes, it is the path
              of the file to open. (bytes or unicode string)
            - if filename is a string longer than 1535 bytes, it is parsed
              as the content of an OLE file in memory. (bytes type only)
            - if filename is a file-like object (with read, seek and tell methods),
              it is parsed as-is. The caller is responsible for closing it when done.

        :param raise_defects: minimal level for defects to be raised as exceptions.
            (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
            security-oriented application, see source code for details)

        :param write_mode: bool, if True the file is opened in read/write mode instead
            of read-only by default.

        :param debug: bool, set debug mode (deprecated, not used anymore)

        :param path_encoding: None or str, name of the codec to use for path
            names (streams and storages), or None for Unicode.
            Unicode by default on Python 3+, UTF-8 on Python 2.x.
            (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
        """
        # minimal level for defects to be raised as exceptions:
        self._raise_defects_level = raise_defects
        #: list of defects/issues not raised as exceptions:
        #: tuples of (exception type, message)
        self.parsing_issues = []
        self.write_mode = write_mode
        self.path_encoding = path_encoding
        # initialize all attributes to default values, so they exist even if
        # open() fails or is never called:
        self._filesize = None
        self.ministream = None
        self._used_streams_fat = []
        self._used_streams_minifat = []
        self.byte_order = None
        self.directory_fp = None
        self.direntries = None
        self.dll_version = None
        self.fat = None
        self.first_difat_sector = None
        self.first_dir_sector = None
        self.first_mini_fat_sector = None
        self.fp = None
        self.header_clsid = None
        self.header_signature = None
        self.metadata = None
        self.mini_sector_shift = None
        self.mini_sector_size = None
        self.mini_stream_cutoff_size = None
        self.minifat = None
        self.minifatsect = None
        # TODO: duplicates?
        self.minisectorcutoff = None
        self.minisectorsize = None
        self.ministream = None
        self.minor_version = None
        self.nb_sect = None
        self.num_difat_sectors = None
        self.num_dir_sectors = None
        self.num_fat_sectors = None
        self.num_mini_fat_sectors = None
        self.reserved1 = None
        self.reserved2 = None
        self.root = None
        self.sector_shift = None
        self.sector_size = None
        self.transaction_signature_number = None
        # track whether we opened the file object ourselves (then we must
        # close it), and where it was opened from (for the warning message):
        self._we_opened_fp = False
        self._open_stack = None
        if filename:
            # try opening, ensure fp is closed if that fails
            try:
                self.open(filename, write_mode=write_mode)
            except Exception:
                # caller has no chance of calling close() now
                self._close(warn=False)
                raise

1112 

1113 def __del__(self): 

1114 """Destructor, ensures all file handles are closed that we opened.""" 

1115 self._close(warn=True) 

1116 # super(OleFileIO, self).__del__() # there's no super-class destructor 

1117 

1118 

1119 def __enter__(self): 

1120 return self 

1121 

1122 

1123 def __exit__(self, *args): 

1124 self._close(warn=False) 

1125 

1126 

1127 def _raise_defect(self, defect_level, message, exception_type=OleFileError): 

1128 """ 

1129 This method should be called for any defect found during file parsing. 

1130 It may raise an OleFileError exception according to the minimal level chosen 

1131 for the OleFileIO object. 

1132 

1133 :param defect_level: defect level, possible values are: 

1134 

1135 - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect 

1136 - DEFECT_POTENTIAL : a potential defect 

1137 - DEFECT_INCORRECT : an error according to specifications, but parsing can go on 

1138 - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible 

1139 

1140 :param message: string describing the defect, used with raised exception. 

1141 :param exception_type: exception class to be raised, OleFileError by default 

1142 """ 

1143 # added by [PL] 

1144 if defect_level >= self._raise_defects_level: 

1145 log.error(message) 

1146 raise exception_type(message) 

1147 else: 

1148 # just record the issue, no exception raised: 

1149 self.parsing_issues.append((exception_type, message)) 

1150 log.warning(message) 

1151 

1152 

1153 def _decode_utf16_str(self, utf16_str, errors='replace'): 

1154 """ 

1155 Decode a string encoded in UTF-16 LE format, as found in the OLE 

1156 directory or in property streams. Return a string encoded 

1157 according to the path_encoding specified for the OleFileIO object. 

1158 

1159 :param bytes utf16_str: bytes string encoded in UTF-16 LE format 

1160 :param str errors: str, see python documentation for str.decode() 

1161 :return: str, encoded according to path_encoding 

1162 :rtype: str 

1163 """ 

1164 unicode_str = utf16_str.decode('UTF-16LE', errors) 

1165 if self.path_encoding: 

1166 # an encoding has been specified for path names: 

1167 return unicode_str.encode(self.path_encoding, errors) 

1168 else: 

1169 # path_encoding=None, return the Unicode string as-is: 

1170 return unicode_str 

1171 

1172 

    def open(self, filename, write_mode=False):
        """
        Open an OLE2 file in read-only or read/write mode.
        Read and parse the header, FAT and directory.

        :param filename: string-like or file-like object, OLE file to parse

            - if filename is a string smaller than 1536 bytes, it is the path
              of the file to open. (bytes or unicode string)
            - if filename is a string longer than 1535 bytes, it is parsed
              as the content of an OLE file in memory. (bytes type only)
            - if filename is a file-like object (with read, seek and tell methods),
              it is parsed as-is. The caller is responsible for closing it when done

        :param write_mode: bool, if True the file is opened in read/write mode instead
            of read-only by default. (ignored if filename is not a path)

        Raises NotOleFileError if the file is not an OLE2 container, and may
        raise other exceptions through _raise_defect depending on the defect
        level chosen for this OleFileIO.
        """
        self.write_mode = write_mode
        # [PL] check if filename is a string-like or file-like object:
        # (it is better to check for a read() method)
        if hasattr(filename, 'read'):
            # TODO: also check seek and tell methods?
            # file-like object: use it directly
            self.fp = filename
        elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
            # filename is a bytes string containing the OLE file to be parsed:
            # convert it to BytesIO
            self.fp = io.BytesIO(filename)
        else:
            # string-like object: filename of file on disk
            if self.write_mode:
                # open file in mode 'read with update, binary'
                # According to https://docs.python.org/library/functions.html#open
                # 'w' would truncate the file, 'a' may only append on some Unixes
                mode = 'r+b'
            else:
                # read-only mode by default
                mode = 'rb'
            self.fp = open(filename, mode)
            self._we_opened_fp = True
            self._open_stack = traceback.extract_stack()  # remember for warning
        # obtain the filesize by using seek and tell, which should work on most
        # file-like objects:
        # TODO: do it above, using getsize with filename when possible?
        # TODO: fix code to fail with clear exception when filesize cannot be obtained
        filesize = 0
        self.fp.seek(0, os.SEEK_END)
        try:
            filesize = self.fp.tell()
        finally:
            self.fp.seek(0)
        self._filesize = filesize
        log.debug('File size: %d bytes (%Xh)' % (self._filesize, self._filesize))

        # lists of streams in FAT and MiniFAT, to detect duplicate references
        # (list of indexes of first sectors of each stream)
        self._used_streams_fat = []
        self._used_streams_minifat = []

        header = self.fp.read(512)

        if len(header) != 512 or header[:8] != MAGIC:
            log.debug('Magic = {!r} instead of {!r}'.format(header[:8], MAGIC))
            self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file", NotOleFileError)

        # [PL] header structure according to AAF specifications:
        ##Header
        ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
        ##BYTE _abSig[8];            // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
        ##                           // 0x1a, 0xe1} for current version
        ##CLSID _clsid;              // [08H,16] reserved must be zero (WriteClassStg/
        ##                           // GetClassFile uses root directory class id)
        ##USHORT _uMinorVersion;     // [18H,02] minor version of the format: 33 is
        ##                           // written by reference implementation
        ##USHORT _uDllVersion;       // [1AH,02] major version of the dll/format: 3 for
        ##                           // 512-byte sectors, 4 for 4 KB sectors
        ##USHORT _uByteOrder;        // [1CH,02] 0xFFFE: indicates Intel byte-ordering
        ##USHORT _uSectorShift;      // [1EH,02] size of sectors in power-of-two;
        ##                           // typically 9 indicating 512-byte sectors
        ##USHORT _uMiniSectorShift;  // [20H,02] size of mini-sectors in power-of-two;
        ##                           // typically 6 indicating 64-byte mini-sectors
        ##USHORT _usReserved;        // [22H,02] reserved, must be zero
        ##ULONG _ulReserved1;        // [24H,04] reserved, must be zero
        ##FSINDEX _csectDir;         // [28H,04] must be zero for 512-byte sectors,
        ##                           // number of SECTs in directory chain for 4 KB
        ##                           // sectors
        ##FSINDEX _csectFat;         // [2CH,04] number of SECTs in the FAT chain
        ##SECT _sectDirStart;        // [30H,04] first SECT in the directory chain
        ##DFSIGNATURE _signature;    // [34H,04] signature used for transactions; must
        ##                           // be zero. The reference implementation
        ##                           // does not support transactions
        ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
        ##                           // typically 4096 bytes
        ##SECT _sectMiniFatStart;    // [3CH,04] first SECT in the MiniFAT chain
        ##FSINDEX _csectMiniFat;     // [40H,04] number of SECTs in the MiniFAT chain
        ##SECT _sectDifStart;        // [44H,04] first SECT in the DIFAT chain
        ##FSINDEX _csectDif;         // [48H,04] number of SECTs in the DIFAT chain
        ##SECT _sectFat[109];        // [4CH,436] the SECTs of first 109 FAT sectors
        ##};

        # [PL] header decoding:
        # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
        fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
        header_size = struct.calcsize(fmt_header)
        log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
        header1 = header[:header_size]
        (
            self.header_signature,
            self.header_clsid,
            self.minor_version,
            self.dll_version,
            self.byte_order,
            self.sector_shift,
            self.mini_sector_shift,
            self.reserved1,
            self.reserved2,
            self.num_dir_sectors,
            self.num_fat_sectors,
            self.first_dir_sector,
            self.transaction_signature_number,
            self.mini_stream_cutoff_size,
            self.first_mini_fat_sector,
            self.num_mini_fat_sectors,
            self.first_difat_sector,
            self.num_difat_sectors
        ) = struct.unpack(fmt_header, header1)
        log.debug( struct.unpack(fmt_header, header1))

        if self.header_signature != MAGIC:
            # OLE signature should always be present
            self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
        if self.header_clsid != bytearray(16):
            # according to AAF specs, CLSID should always be zero
            self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
        log.debug( "Minor Version = %d" % self.minor_version )
        # TODO: according to MS-CFB, minor version should be 0x003E
        log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version )
        if self.dll_version not in [3, 4]:
            # version 3: usual format, 512 bytes per sector
            # version 4: large format, 4K per sector
            self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
        log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order )
        if self.byte_order != 0xFFFE:
            # For now only common little-endian documents are handled correctly
            self._raise_defect(DEFECT_INCORRECT, "incorrect ByteOrder in OLE header")
            # TODO: add big-endian support for documents created on Mac ?
            # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE.
        self.sector_size = 2**self.sector_shift
        log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size )
        if self.sector_size not in [512, 4096]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header")
        if (self.dll_version==3 and self.sector_size!=512) \
        or (self.dll_version==4 and self.sector_size!=4096):
            self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header")
        self.mini_sector_size = 2**self.mini_sector_shift
        log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size )
        if self.mini_sector_size not in [64]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header")
        if self.reserved1 != 0 or self.reserved2 != 0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
        log.debug( "Number of Directory sectors = %d" % self.num_dir_sectors )
        # Number of directory sectors (only allowed if DllVersion != 3)
        if self.sector_size==512 and self.num_dir_sectors!=0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header")
        log.debug( "Number of FAT sectors = %d" % self.num_fat_sectors )
        # num_fat_sectors = number of FAT sectors in the file
        log.debug( "First Directory sector = %Xh" % self.first_dir_sector )
        # first_dir_sector = 1st sector containing the directory
        log.debug( "Transaction Signature Number = %d" % self.transaction_signature_number )
        # Signature should be zero, BUT some implementations do not follow this
        # rule => only a potential defect:
        # (according to MS-CFB, may be != 0 for applications supporting file
        # transactions)
        if self.transaction_signature_number != 0:
            self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)")
        log.debug( "Mini Stream cutoff size = %Xh (expected: 1000h)" % self.mini_stream_cutoff_size )
        # MS-CFB: This integer field MUST be set to 0x00001000. This field
        # specifies the maximum size of a user-defined data stream allocated
        # from the mini FAT and mini stream, and that cutoff is 4096 bytes.
        # Any user-defined data stream larger than or equal to this cutoff size
        # must be allocated as normal sectors from the FAT.
        if self.mini_stream_cutoff_size != 0x1000:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header")
            # if no exception is raised, the cutoff size is fixed to 0x1000
            log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' %
                        self.mini_stream_cutoff_size)
            self.mini_stream_cutoff_size = 0x1000
        # TODO: check if these values are OK
        log.debug( "First MiniFAT sector   = %Xh" % self.first_mini_fat_sector )
        log.debug( "Number of MiniFAT sectors = %d" % self.num_mini_fat_sectors )
        log.debug( "First DIFAT sector     = %Xh" % self.first_difat_sector )
        log.debug( "Number of DIFAT sectors   = %d" % self.num_difat_sectors )

        # calculate the number of sectors in the file
        # (-1 because header doesn't count)
        self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1
        log.debug( "Maximum number of sectors in the file: %d (%Xh)" % (self.nb_sect, self.nb_sect))
        # TODO: change this test, because an OLE file MAY contain other data
        # after the last sector.

        # file clsid
        self.header_clsid = _clsid(header[8:24])

        # TODO: remove redundant attributes, and fix the code which uses them?
        self.sectorsize = self.sector_size          #1 << i16(header, 30)
        self.minisectorsize = self.mini_sector_size #1 << i16(header, 32)
        self.minisectorcutoff = self.mini_stream_cutoff_size # i32(header, 56)

        # check known streams for duplicate references (these are always in FAT,
        # never in MiniFAT):
        self._check_duplicate_stream(self.first_dir_sector)
        # check MiniFAT only if it is not empty:
        if self.num_mini_fat_sectors:
            self._check_duplicate_stream(self.first_mini_fat_sector)
        # check DIFAT only if it is not empty:
        if self.num_difat_sectors:
            self._check_duplicate_stream(self.first_difat_sector)

        # Load file allocation tables
        self.loadfat(header)
        # Load directory. This sets both the direntries list (ordered by sid)
        # and the root (ordered by hierarchy) members.
        self.loaddirectory(self.first_dir_sector)
        self.minifatsect = self.first_mini_fat_sector

1397 

1398 def close(self): 

1399 """ 

1400 close the OLE file, release the file object if we created it ourselves. 

1401 

1402 Leaves the file handle open if it was provided by the caller. 

1403 """ 

1404 self._close(warn=False) 

1405 

1406 def _close(self, warn=False): 

1407 """Implementation of close() with internal arg `warn`.""" 

1408 if self._we_opened_fp: 

1409 if warn: 

1410 warnings.warn(OleFileIONotClosed(self._open_stack)) 

1411 self.fp.close() 

1412 self._we_opened_fp = False 

1413 

1414 def _check_duplicate_stream(self, first_sect, minifat=False): 

1415 """ 

1416 Checks if a stream has not been already referenced elsewhere. 

1417 This method should only be called once for each known stream, and only 

1418 if stream size is not null. 

1419 

1420 :param first_sect: int, index of first sector of the stream in FAT 

1421 :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT 

1422 """ 

1423 if minifat: 

1424 log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect) 

1425 used_streams = self._used_streams_minifat 

1426 else: 

1427 log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect) 

1428 # some values can be safely ignored (not a real stream): 

1429 if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): 

1430 return 

1431 used_streams = self._used_streams_fat 

1432 # TODO: would it be more efficient using a dict or hash values, instead 

1433 # of a list of long ? 

1434 if first_sect in used_streams: 

1435 self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') 

1436 else: 

1437 used_streams.append(first_sect) 

1438 

1439 def dumpfat(self, fat, firstindex=0): 

1440 """ 

1441 Display a part of FAT in human-readable form for debugging purposes 

1442 """ 

1443 # dictionary to convert special FAT values in human-readable strings 

1444 VPL = 8 # values per line (8+1 * 8+1 = 81) 

1445 fatnames = { 

1446 FREESECT: "..free..", 

1447 ENDOFCHAIN: "[ END. ]", 

1448 FATSECT: "FATSECT ", 

1449 DIFSECT: "DIFSECT " 

1450 } 

1451 nbsect = len(fat) 

1452 nlines = (nbsect+VPL-1)//VPL 

1453 print("index", end=" ") 

1454 for i in range(VPL): 

1455 print("%8X" % i, end=" ") 

1456 print() 

1457 for l in range(nlines): 

1458 index = l*VPL 

1459 print("%6X:" % (firstindex+index), end=" ") 

1460 for i in range(index, index+VPL): 

1461 if i>=nbsect: 

1462 break 

1463 sect = fat[i] 

1464 aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND 

1465 if aux in fatnames: 

1466 name = fatnames[aux] 

1467 else: 

1468 if sect == i+1: 

1469 name = " --->" 

1470 else: 

1471 name = "%8X" % sect 

1472 print(name, end=" ") 

1473 print() 

1474 

1475 def dumpsect(self, sector, firstindex=0): 

1476 """ 

1477 Display a sector in a human-readable form, for debugging purposes 

1478 """ 

1479 VPL=8 # number of values per line (8+1 * 8+1 = 81) 

1480 tab = array.array(UINT32, sector) 

1481 if sys.byteorder == 'big': 

1482 tab.byteswap() 

1483 nbsect = len(tab) 

1484 nlines = (nbsect+VPL-1)//VPL 

1485 print("index", end=" ") 

1486 for i in range(VPL): 

1487 print("%8X" % i, end=" ") 

1488 print() 

1489 for l in range(nlines): 

1490 index = l*VPL 

1491 print("%6X:" % (firstindex+index), end=" ") 

1492 for i in range(index, index+VPL): 

1493 if i>=nbsect: 

1494 break 

1495 sect = tab[i] 

1496 name = "%8X" % sect 

1497 print(name, end=" ") 

1498 print() 

1499 

1500 def sect2array(self, sect): 

1501 """ 

1502 convert a sector to an array of 32 bits unsigned integers, 

1503 swapping bytes on big endian CPUs such as PowerPC (old Macs) 

1504 """ 

1505 # TODO: make this a static function 

1506 a = array.array(UINT32, sect) 

1507 # if CPU is big endian, swap bytes: 

1508 if sys.byteorder == 'big': 

1509 a.byteswap() 

1510 return a 

1511 

    def loadfat_sect(self, sect):
        """
        Adds the indexes of the given sector to the FAT

        :param sect: string containing the first FAT sector, or array of long integers
        :returns: index of last FAT sector read, or None if the sector list
            was empty (side effect: extends self.fat in place)
        """
        # a FAT sector is an array of ulong integers.
        if isinstance(sect, array.array):
            # if sect is already an array it is directly used
            fat1 = sect
        else:
            # if it's a raw sector, it is parsed in an array
            fat1 = self.sect2array(sect)
            # Display the sector contents only if the logging level is debug:
            if log.isEnabledFor(logging.DEBUG):
                self.dumpsect(sect)
        # The FAT is a sector chain starting at the first index of itself.
        # initialize isect, just in case the loop body never runs:
        isect = None
        for isect in fat1:
            isect = isect & 0xFFFFFFFF  # JYTHON-WORKAROUND
            log.debug("isect = %X" % isect)
            if isect == ENDOFCHAIN or isect == FREESECT:
                # the end of the sector chain has been reached
                log.debug("found end of sector chain")
                break
            # read the FAT sector
            s = self.getsect(isect)
            # parse it as an array of 32 bits integers, and add it to the
            # global FAT array
            nextfat = self.sect2array(s)
            self.fat = self.fat + nextfat
        return isect
    def loadfat(self, header):
        """
        Load the FAT table.

        :param header: bytes, first sector of the file containing the OLE
            header (at least 512 bytes)
        """
        # The 1st sector of the file contains sector numbers for the first 109
        # FAT sectors, right after the header which is 76 bytes long.
        # (always 109, whatever the sector size: 512 bytes = 76+4*109)
        # Additional sectors are described by DIF blocks

        log.debug('Loading the FAT table, starting with the 1st sector after the header')
        sect = header[76:512]
        log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )
        # fat = []
        # FAT is an array of 32 bits unsigned ints, it's more effective
        # to use an array than a list in Python.
        # It's initialized as empty first:
        self.fat = array.array(UINT32)
        # loadfat_sect extends self.fat in place with the first 109 FAT sectors:
        self.loadfat_sect(sect)
        # self.dumpfat(self.fat)
        # for i in range(0, len(sect), 4):
        #     ix = i32(sect, i)
        #     # [PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
        #     if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
        #         break
        #     s = self.getsect(ix)
        #     # fat = fat + [i32(s, i) for i in range(0, len(s), 4)]
        #     fat = fat + array.array(UINT32, s)
        if self.num_difat_sectors != 0:
            log.debug('DIFAT is used, because file size > 6.8MB.')
            # [PL] There's a DIFAT because file is larger than 6.8MB
            # some checks just in case:
            if self.num_fat_sectors <= 109:
                # there must be at least 109 blocks in header and the rest in
                # DIFAT, so number of sectors must be >109.
                self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
            if self.first_difat_sector >= self.nb_sect:
                # initial DIFAT block index must be valid
                self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
            log.debug( "DIFAT analysis..." )
            # We compute the necessary number of DIFAT sectors :
            # Number of pointers per DIFAT sector = (sectorsize/4)-1
            # (-1 because the last pointer is the next DIFAT sector number)
            nb_difat_sectors = (self.sectorsize//4)-1
            # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
            nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors
            log.debug( "nb_difat = %d" % nb_difat )
            if self.num_difat_sectors != nb_difat:
                raise IOError('incorrect DIFAT')
            # walk the linked list of DIFAT sectors:
            isect_difat = self.first_difat_sector
            for i in iterrange(nb_difat):
                log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
                # TODO: check if corresponding FAT SID = DIFSECT
                sector_difat = self.getsect(isect_difat)
                difat = self.sect2array(sector_difat)
                # Display the sector contents only if the logging level is debug:
                if log.isEnabledFor(logging.DEBUG):
                    self.dumpsect(sector_difat)
                self.loadfat_sect(difat[:nb_difat_sectors])
                # last DIFAT pointer is next DIFAT sector:
                isect_difat = difat[nb_difat_sectors]
                log.debug( "next DIFAT sector: %X" % isect_difat )
            # checks:
            if isect_difat not in [ENDOFCHAIN, FREESECT]:
                # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
                raise IOError('incorrect end of DIFAT')
            # if len(self.fat) != self.num_fat_sectors:
            #     # FAT should contain num_fat_sectors blocks
            #     print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors))
            #     raise IOError('incorrect DIFAT')
        else:
            log.debug('No DIFAT, because file size < 6.8MB.')
        # since FAT is read from fixed-size sectors, it may contain more values
        # than the actual number of sectors in the file.
        # Keep only the relevant sector indexes:
        if len(self.fat) > self.nb_sect:
            log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
            self.fat = self.fat[:self.nb_sect]
        log.debug('FAT references %d sectors / Maximum %d sectors in file' % (len(self.fat), self.nb_sect))
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nFAT:')
            self.dumpfat(self.fat)
    def loadminifat(self):
        """
        Load the MiniFAT table into self.minifat (array of 32-bit ints).
        """
        # MiniFAT is stored in a standard sub-stream, pointed to by a header
        # field.
        # NOTE: there are two sizes to take into account for this stream:
        # 1) Stream size is calculated according to the number of sectors
        #    declared in the OLE header. This allocated stream may be more than
        #    needed to store the actual sector indexes.
        # (self.num_mini_fat_sectors is the number of sectors of size self.sector_size)
        stream_size = self.num_mini_fat_sectors * self.sector_size
        # 2) Actually used size is calculated by dividing the MiniStream size
        #    (given by root entry size) by the size of mini sectors, *4 for
        #    32 bits indexes:
        nb_minisectors = (self.root.size + self.mini_sector_size-1) // self.mini_sector_size
        used_size = nb_minisectors * 4
        log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
            (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors))
        if used_size > stream_size:
            # This is not really a problem, but may indicate a wrong implementation:
            self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
        # In any case, first read stream_size:
        s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
        # [PL] Old code replaced by an array:
        # self.minifat = [i32(s, i) for i in range(0, len(s), 4)]
        self.minifat = self.sect2array(s)
        # Then shrink the array to used size, to avoid indexes out of MiniStream:
        log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
        self.minifat = self.minifat[:nb_minisectors]
        log.debug('loadminifat(): len=%d' % len(self.minifat))
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nMiniFAT:')
            self.dumpfat(self.minifat)
1666 def getsect(self, sect): 

1667 """ 

1668 Read given sector from file on disk. 

1669 

1670 :param sect: int, sector index 

1671 :returns: a string containing the sector data. 

1672 """ 

1673 # From [MS-CFB]: A sector number can be converted into a byte offset 

1674 # into the file by using the following formula: 

1675 # (sector number + 1) x Sector Size. 

1676 # This implies that sector #0 of the file begins at byte offset Sector 

1677 # Size, not at 0. 

1678 

1679 # [PL] the original code in PIL was wrong when sectors are 4KB instead of 

1680 # 512 bytes: 

1681 #self.fp.seek(512 + self.sectorsize * sect) 

1682 # [PL]: added safety checks: 

1683 #print("getsect(%X)" % sect) 

1684 try: 

1685 self.fp.seek(self.sectorsize * (sect+1)) 

1686 except Exception: 

1687 log.debug('getsect(): sect=%X, seek=%d, filesize=%d' % 

1688 (sect, self.sectorsize*(sect+1), self._filesize)) 

1689 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') 

1690 sector = self.fp.read(self.sectorsize) 

1691 if len(sector) != self.sectorsize: 

1692 log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' % 

1693 (sect, len(sector), self.sectorsize)) 

1694 self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') 

1695 return sector 

1696 

1697 def write_sect(self, sect, data, padding=b'\x00'): 

1698 """ 

1699 Write given sector to file on disk. 

1700 

1701 :param sect: int, sector index 

1702 :param data: bytes, sector data 

1703 :param padding: single byte, padding character if data < sector size 

1704 """ 

1705 if not isinstance(data, bytes): 

1706 raise TypeError("write_sect: data must be a bytes string") 

1707 if not isinstance(padding, bytes) or len(padding)!=1: 

1708 raise TypeError("write_sect: padding must be a bytes string of 1 char") 

1709 # TODO: we could allow padding=None for no padding at all 

1710 try: 

1711 self.fp.seek(self.sectorsize * (sect+1)) 

1712 except Exception: 

1713 log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' % 

1714 (sect, self.sectorsize*(sect+1), self._filesize)) 

1715 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') 

1716 if len(data) < self.sectorsize: 

1717 # add padding 

1718 data += padding * (self.sectorsize - len(data)) 

1719 elif len(data) < self.sectorsize: 

1720 raise ValueError("Data is larger than sector size") 

1721 self.fp.write(data) 

1722 

1723 def _write_mini_sect(self, fp_pos, data, padding = b'\x00'): 

1724 """ 

1725 Write given sector to file on disk. 

1726 

1727 :param fp_pos: int, file position 

1728 :param data: bytes, sector data 

1729 :param padding: single byte, padding character if data < sector size 

1730 """ 

1731 if not isinstance(data, bytes): 

1732 raise TypeError("write_mini_sect: data must be a bytes string") 

1733 if not isinstance(padding, bytes) or len(padding) != 1: 

1734 raise TypeError("write_mini_sect: padding must be a bytes string of 1 char") 

1735 

1736 try: 

1737 self.fp.seek(fp_pos) 

1738 except Exception: 

1739 log.debug('write_mini_sect(): fp_pos=%d, filesize=%d' % 

1740 (fp_pos, self._filesize)) 

1741 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') 

1742 len_data = len(data) 

1743 if len_data < self.mini_sector_size: 

1744 data += padding * (self.mini_sector_size - len_data) 

1745 if self.mini_sector_size < len_data: 

1746 raise ValueError("Data is larger than sector size") 

1747 self.fp.write(data) 

1748 

    def loaddirectory(self, sect):
        """
        Load the directory.

        Opens the directory stream, pre-allocates self.direntries, loads
        the root entry and builds the whole storage tree from it.

        :param sect: sector index of directory stream.
        """
        log.debug('Loading the Directory:')
        # The directory is stored in a standard
        # substream, independent of its size.

        # open directory stream as a read-only file:
        # (stream size is not known in advance)
        self.directory_fp = self._open(sect, force_FAT=True)

        # [PL] to detect malformed documents and avoid DoS attacks, the maximum
        # number of directory entries can be calculated
        # (each directory entry is exactly 128 bytes):
        max_entries = self.directory_fp.size // 128
        log.debug('loaddirectory: size=%d, max_entries=%d' %
            (self.directory_fp.size, max_entries))

        # Create list of directory entries
        # self.direntries = []
        # We start with a list of "None" object
        self.direntries = [None] * max_entries
        # for sid in iterrange(max_entries):
        #     entry = fp.read(128)
        #     if not entry:
        #         break
        #     self.direntries.append(OleDirectoryEntry(entry, sid, self))
        # load root entry:
        root_entry = self._load_direntry(0)
        # Root entry is the first entry:
        self.root = self.direntries[0]
        # TODO: read ALL directory entries (ignore bad entries?)
        # TODO: adapt build_storage_tree to avoid duplicate reads
        # for i in range(1, max_entries):
        #     self._load_direntry(i)
        # read and build all storage trees, starting from the root:
        self.root.build_storage_tree()
    def _load_direntry (self, sid):
        """
        Load a directory entry from the directory.
        This method should only be called once for each storage/stream when
        loading the directory.

        :param sid: index of storage/stream in the directory.
        :returns: a OleDirectoryEntry object

        :exception OleFileError: if the entry has already been referenced.
        """
        # check if SID is OK:
        if sid<0 or sid>=len(self.direntries):
            self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
        # check if entry was already referenced:
        if self.direntries[sid] is not None:
            self._raise_defect(DEFECT_INCORRECT,
                "double reference for OLE stream/storage")
            # if exception not raised (tolerant parsing mode), return the
            # previously loaded object instead of parsing it again:
            return self.direntries[sid]
        # each directory entry is exactly 128 bytes:
        self.directory_fp.seek(sid * 128)
        entry = self.directory_fp.read(128)
        self.direntries[sid] = OleDirectoryEntry(entry, sid, self)
        return self.direntries[sid]
1814 def dumpdirectory(self): 

1815 """ 

1816 Dump directory (for debugging only) 

1817 """ 

1818 self.root.dump() 

1819 

    def _open(self, start, size = UNKNOWN_SIZE, force_FAT=False):
        """
        Open a stream, either in FAT or MiniFAT according to its size.
        (openstream helper)

        :param start: index of first sector
        :param size: size of stream (or nothing if size is unknown)
        :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT
            according to size. If True, it will always be opened in FAT.
        :returns: an OleStream file-like object
        """
        log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' %
            (start, size, str(force_FAT)))
        # stream size is compared to the mini_stream_cutoff_size threshold:
        if size < self.minisectorcutoff and not force_FAT:
            # ministream object
            if not self.ministream:
                # load MiniFAT if it wasn't already done:
                self.loadminifat()
                # The first sector index of the miniFAT stream is stored in the
                # root directory entry:
                size_ministream = self.root.size
                log.debug('Opening MiniStream: sect=%Xh, size=%d' %
                    (self.root.isectStart, size_ministream))
                # the MiniStream itself lives in the FAT, so force_FAT=True;
                # it is cached on self.ministream for subsequent opens:
                self.ministream = self._open(self.root.isectStart,
                    size_ministream, force_FAT=True)
            return OleStream(fp=self.ministream, sect=start, size=size,
                             offset=0, sectorsize=self.minisectorsize,
                             fat=self.minifat, filesize=self.ministream.size,
                             olefileio=self)
        else:
            # standard stream
            return OleStream(fp=self.fp, sect=start, size=size,
                             offset=self.sectorsize,
                             sectorsize=self.sectorsize, fat=self.fat,
                             filesize=self._filesize,
                             olefileio=self)
1857 def _list(self, files, prefix, node, streams=True, storages=False): 

1858 """ 

1859 listdir helper 

1860 

1861 :param files: list of files to fill in 

1862 :param prefix: current location in storage tree (list of names) 

1863 :param node: current node (OleDirectoryEntry object) 

1864 :param streams: bool, include streams if True (True by default) - new in v0.26 

1865 :param storages: bool, include storages if True (False by default) - new in v0.26 

1866 (note: the root storage is never included) 

1867 """ 

1868 prefix = prefix + [node.name] 

1869 for entry in node.kids: 

1870 if entry.entry_type == STGTY_STORAGE: 

1871 # this is a storage 

1872 if storages: 

1873 # add it to the list 

1874 files.append(prefix[1:] + [entry.name]) 

1875 # check its kids 

1876 self._list(files, prefix, entry, streams, storages) 

1877 elif entry.entry_type == STGTY_STREAM: 

1878 # this is a stream 

1879 if streams: 

1880 # add it to the list 

1881 files.append(prefix[1:] + [entry.name]) 

1882 else: 

1883 self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an entry which is not a stream nor a storage.') 

1884 

1885 def listdir(self, streams=True, storages=False): 

1886 """ 

1887 Return a list of streams and/or storages stored in this file 

1888 

1889 :param streams: bool, include streams if True (True by default) - new in v0.26 

1890 :param storages: bool, include storages if True (False by default) - new in v0.26 

1891 (note: the root storage is never included) 

1892 :returns: list of stream and/or storage paths 

1893 """ 

1894 files = [] 

1895 self._list(files, [], self.root, streams, storages) 

1896 return files 

1897 

1898 def _find(self, filename): 

1899 """ 

1900 Returns directory entry of given filename. (openstream helper) 

1901 Note: this method is case-insensitive. 

1902 

1903 :param filename: path of stream in storage tree (except root entry), either: 

1904 

1905 - a string using Unix path syntax, for example: 

1906 'storage_1/storage_1.2/stream' 

1907 - or a list of storage filenames, path to the desired stream/storage. 

1908 Example: ['storage_1', 'storage_1.2', 'stream'] 

1909 

1910 :returns: sid of requested filename 

1911 :exception IOError: if file not found 

1912 """ 

1913 

1914 # if filename is a string instead of a list, split it on slashes to 

1915 # convert to a list: 

1916 if isinstance(filename, basestring): 

1917 filename = filename.split('/') 

1918 # walk across storage tree, following given path: 

1919 node = self.root 

1920 for name in filename: 

1921 for kid in node.kids: 

1922 if kid.name.lower() == name.lower(): 

1923 break 

1924 else: 

1925 raise IOError("file not found") 

1926 node = kid 

1927 return node.sid 

1928 

1929 def openstream(self, filename): 

1930 """ 

1931 Open a stream as a read-only file object (BytesIO). 

1932 Note: filename is case-insensitive. 

1933 

1934 :param filename: path of stream in storage tree (except root entry), either: 

1935 

1936 - a string using Unix path syntax, for example: 

1937 'storage_1/storage_1.2/stream' 

1938 - or a list of storage filenames, path to the desired stream/storage. 

1939 Example: ['storage_1', 'storage_1.2', 'stream'] 

1940 

1941 :returns: file object (read-only) 

1942 :exception IOError: if filename not found, or if this is not a stream. 

1943 """ 

1944 sid = self._find(filename) 

1945 entry = self.direntries[sid] 

1946 if entry.entry_type != STGTY_STREAM: 

1947 raise IOError("this file is not a stream") 

1948 return self._open(entry.isectStart, entry.size) 

1949 

    def _write_mini_stream(self, entry, data_to_write):
        """
        Overwrite the content of a stream stored in the MiniStream
        (write_stream helper).

        :param entry: directory entry of the stream to overwrite
        :param data_to_write: bytes, new content (same size as the stream)
        """
        if not entry.sect_chain:
            entry.build_sect_chain(self)
        nb_sectors = len(entry.sect_chain)

        if not self.root.sect_chain:
            self.root.build_sect_chain(self)
        # number of mini sectors that fit in one regular sector:
        block_size = self.sector_size // self.mini_sector_size
        for idx, sect in enumerate(entry.sect_chain):
            # locate the regular sector containing this mini sector:
            sect_base = sect // block_size
            sect_offset = sect % block_size
            # absolute file position (+1 sector to skip the OLE header):
            fp_pos = (self.root.sect_chain[sect_base] + 1)*self.sector_size + sect_offset*self.mini_sector_size
            if idx < (nb_sectors - 1):
                data_per_sector = data_to_write[idx * self.mini_sector_size: (idx + 1) * self.mini_sector_size]
            else:
                # last mini sector may be only partially filled:
                data_per_sector = data_to_write[idx * self.mini_sector_size:]
            self._write_mini_sect(fp_pos, data_per_sector)
    def write_stream(self, stream_name, data):
        """
        Write a stream to disk. For now, it is only possible to replace an
        existing stream by data of the same size.

        :param stream_name: path of stream in storage tree (except root entry), either:

            - a string using Unix path syntax, for example:
              'storage_1/storage_1.2/stream'
            - or a list of storage filenames, path to the desired stream/storage.
              Example: ['storage_1', 'storage_1.2', 'stream']

        :param data: bytes, data to be written, must be the same size as the original
            stream.
        :raises TypeError: if data is not bytes
        :raises ValueError: if data size differs from the existing stream
        :raises IOError: if the path is not a stream, or the FAT chain is corrupt
        """
        if not isinstance(data, bytes):
            raise TypeError("write_stream: data must be a bytes string")
        sid = self._find(stream_name)
        entry = self.direntries[sid]
        if entry.entry_type != STGTY_STREAM:
            raise IOError("this is not a stream")
        size = entry.size
        if size != len(data):
            raise ValueError("write_stream: data must be the same size as the existing stream")
        # small streams are stored in the MiniStream, handled separately:
        if size < self.minisectorcutoff and entry.entry_type != STGTY_ROOT:
            return self._write_mini_stream(entry = entry, data_to_write = data)

        sect = entry.isectStart
        # number of sectors to write
        nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # follow the FAT sector chain, overwriting each sector in turn:
        for i in range(nb_sectors):
            # try:
            #     self.fp.seek(offset + self.sectorsize * sect)
            # except Exception:
            #     log.debug('sect=%d, seek=%d' %
            #         (sect, offset+self.sectorsize*sect))
            #     raise IOError('OLE sector index out of range')
            # extract one sector from data, the last one being smaller:
            if i<(nb_sectors-1):
                data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize]
                # TODO: comment this if it works
                assert(len(data_sector)==self.sectorsize)
            else:
                data_sector = data [i*self.sectorsize:]
                # TODO: comment this if it works
                log.debug('write_stream: size=%d sectorsize=%d data_sector=%Xh size%%sectorsize=%d'
                    % (size, self.sectorsize, len(data_sector), size % self.sectorsize))
                assert(len(data_sector) % self.sectorsize==size % self.sectorsize)
            self.write_sect(sect, data_sector)
            # self.fp.write(data_sector)
            # jump to next sector in the FAT:
            try:
                sect = self.fat[sect]
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError('incorrect OLE FAT, sector index out of range')
        # [PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError('incorrect last sector index in OLE stream')
2029 def get_type(self, filename): 

2030 """ 

2031 Test if given filename exists as a stream or a storage in the OLE 

2032 container, and return its type. 

2033 

2034 :param filename: path of stream in storage tree. (see openstream for syntax) 

2035 :returns: False if object does not exist, its entry type (>0) otherwise: 

2036 

2037 - STGTY_STREAM: a stream 

2038 - STGTY_STORAGE: a storage 

2039 - STGTY_ROOT: the root entry 

2040 """ 

2041 try: 

2042 sid = self._find(filename) 

2043 entry = self.direntries[sid] 

2044 return entry.entry_type 

2045 except Exception: 

2046 return False 

2047 

2048 def getclsid(self, filename): 

2049 """ 

2050 Return clsid of a stream/storage. 

2051 

2052 :param filename: path of stream/storage in storage tree. (see openstream for 

2053 syntax) 

2054 :returns: Empty string if clsid is null, a printable representation of the clsid otherwise 

2055 

2056 new in version 0.44 

2057 """ 

2058 sid = self._find(filename) 

2059 entry = self.direntries[sid] 

2060 return entry.clsid 

2061 

2062 def getmtime(self, filename): 

2063 """ 

2064 Return modification time of a stream/storage. 

2065 

2066 :param filename: path of stream/storage in storage tree. (see openstream for 

2067 syntax) 

2068 :returns: None if modification time is null, a python datetime object 

2069 otherwise (UTC timezone) 

2070 

2071 new in version 0.26 

2072 """ 

2073 sid = self._find(filename) 

2074 entry = self.direntries[sid] 

2075 return entry.getmtime() 

2076 

2077 def getctime(self, filename): 

2078 """ 

2079 Return creation time of a stream/storage. 

2080 

2081 :param filename: path of stream/storage in storage tree. (see openstream for 

2082 syntax) 

2083 :returns: None if creation time is null, a python datetime object 

2084 otherwise (UTC timezone) 

2085 

2086 new in version 0.26 

2087 """ 

2088 sid = self._find(filename) 

2089 entry = self.direntries[sid] 

2090 return entry.getctime() 

2091 

2092 def exists(self, filename): 

2093 """ 

2094 Test if given filename exists as a stream or a storage in the OLE 

2095 container. 

2096 Note: filename is case-insensitive. 

2097 

2098 :param filename: path of stream in storage tree. (see openstream for syntax) 

2099 :returns: True if object exist, else False. 

2100 """ 

2101 try: 

2102 sid = self._find(filename) 

2103 return True 

2104 except Exception: 

2105 return False 

2106 

2107 def get_size(self, filename): 

2108 """ 

2109 Return size of a stream in the OLE container, in bytes. 

2110 

2111 :param filename: path of stream in storage tree (see openstream for syntax) 

2112 :returns: size in bytes (long integer) 

2113 :exception IOError: if file not found 

2114 :exception TypeError: if this is not a stream. 

2115 """ 

2116 sid = self._find(filename) 

2117 entry = self.direntries[sid] 

2118 if entry.entry_type != STGTY_STREAM: 

2119 # TODO: Should it return zero instead of raising an exception ? 

2120 raise TypeError('object is not an OLE stream') 

2121 return entry.size 

2122 

2123 def get_rootentry_name(self): 

2124 """ 

2125 Return root entry name. Should usually be 'Root Entry' or 'R' in most 

2126 implementations. 

2127 """ 

2128 return self.root.name 

2129 

    def getproperties(self, filename, convert_time=False, no_conversion=None):
        """
        Return properties described in substream.

        Parsing errors are reported as DEFECT_INCORRECT (non-fatal), so a
        partial or empty dict may be returned for a malformed property set.

        :param filename: path of stream in storage tree (see openstream for syntax)
        :param convert_time: bool, if True timestamps will be converted to Python datetime
        :param no_conversion: None or list of int, timestamps not to be converted
            (for example total editing time is not a real timestamp)

        :returns: a dictionary of values indexed by id (integer)
        """
        # REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
        # make sure no_conversion is a list, just to simplify code below:
        if no_conversion == None:
            no_conversion = []
        # stream path as a string to report exceptions:
        streampath = filename
        if not isinstance(streampath, str):
            streampath = '/'.join(streampath)
        fp = self.openstream(filename)
        data = {}
        try:
            # header
            s = fp.read(28)
            clsid = _clsid(s[8:24])
            # format id
            s = fp.read(20)
            fmtid = _clsid(s[:16])
            # jump to the section offset given in the format id header:
            fp.seek(i32(s, 16))
            # get section: first 4 bytes (section size) are replaced by a
            # placeholder so offsets within the section stay valid:
            s = b"****" + fp.read(i32(fp.read(4))-4)
            # number of properties:
            num_props = i32(s, 4)
        except BaseException as exc:
            # catch exception while parsing property header, and only raise
            # a DEFECT_INCORRECT then return an empty dict, because this is not
            # a fatal error when parsing the whole file
            msg = 'Error while parsing properties header in stream {}: {}'.format(
                repr(streampath), exc)
            self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
            return data
        # clamp num_props based on the data length
        # (each id/offset pair takes 8 bytes):
        num_props = min(num_props, int(len(s) / 8))
        for i in iterrange(num_props):
            property_id = 0  # just in case of an exception
            try:
                property_id = i32(s, 8+i*8)
                offset = i32(s, 12+i*8)
                property_type = i32(s, offset)

                vt_name = VT.get(property_type, 'UNKNOWN')
                log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))

                # the actual value starts 4 bytes after the type field:
                value = self._parse_property(s, offset+4, property_id, property_type, convert_time, no_conversion)
                data[property_id] = value
            except BaseException as exc:
                # catch exception while parsing each property, and only raise
                # a DEFECT_INCORRECT, because parsing can go on
                msg = 'Error while parsing property id %d in stream %s: %s' % (
                    property_id, repr(streampath), exc)
                self._raise_defect(DEFECT_INCORRECT, msg, type(exc))

        return data
    def _parse_property(self, s, offset, property_id, property_type, convert_time, no_conversion):
        """
        Parse a single property value, dispatching between scalar and
        vector types (getproperties helper).

        :param s: bytes, the whole property-set section buffer
        :param offset: int, offset of the value payload within s
        :param property_id: int, property identifier (for logging/errors)
        :param property_type: int, VT_* type code of the property
        :param convert_time: bool, convert FILETIME values to datetime
        :param no_conversion: list of int, property ids not to convert
        :returns: parsed value (a list for vector types), or None if the
            type is not implemented
        """
        v = None
        if property_type <= VT_BLOB or property_type in (VT_CLSID, VT_CF):
            # simple scalar type:
            v, _ = self._parse_property_basic(s, offset, property_id, property_type, convert_time, no_conversion)
        elif property_type == VT_VECTOR | VT_VARIANT:
            # vector of variants: each element carries its own type code
            log.debug('property_type == VT_VECTOR | VT_VARIANT')
            off = 4
            count = i32(s, offset)
            values = []
            for _ in range(count):
                property_type = i32(s, offset + off)
                v, sz = self._parse_property_basic(s, offset + off + 4, property_id, property_type, convert_time, no_conversion)
                values.append(v)
                # advance past the value plus its 4-byte type field:
                off += sz + 4
            v = values

        elif property_type & VT_VECTOR:
            # homogeneous vector: one type code for all elements
            property_type_base = property_type & ~VT_VECTOR
            log.debug('property_type == VT_VECTOR | %s' % VT.get(property_type_base, 'UNKNOWN'))
            off = 4
            count = i32(s, offset)
            values = []
            for _ in range(count):
                v, sz = self._parse_property_basic(s, offset + off, property_id, property_type & ~VT_VECTOR, convert_time, no_conversion)
                values.append(v)
                off += sz
            v = values
        else:
            log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
        return v
2225 def _parse_property_basic(self, s, offset, property_id, property_type, convert_time, no_conversion): 

2226 value = None 

2227 size = 0 

2228 # test for common types first (should perhaps use 

2229 # a dictionary instead?) 

2230 

2231 if property_type == VT_I2: # 16-bit signed integer 

2232 value = i16(s, offset) 

2233 if value >= 32768: 

2234 value = value - 65536 

2235 size = 2 

2236 elif property_type == VT_UI2: # 2-byte unsigned integer 

2237 value = i16(s, offset) 

2238 size = 2 

2239 elif property_type in (VT_I4, VT_INT, VT_ERROR): 

2240 # VT_I4: 32-bit signed integer 

2241 # VT_ERROR: HRESULT, similar to 32-bit signed integer, 

2242 # see https://msdn.microsoft.com/en-us/library/cc230330.aspx 

2243 value = i32(s, offset) 

2244 size = 4 

2245 elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer 

2246 value = i32(s, offset) # FIXME 

2247 size = 4 

2248 elif property_type in (VT_BSTR, VT_LPSTR): 

2249 # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx 

2250 # size is a 32 bits integer, including the null terminator, and 

2251 # possibly trailing or embedded null chars 

2252 #TODO: if codepage is unicode, the string should be converted as such 

2253 count = i32(s, offset) 

2254 value = s[offset+4:offset+4+count-1] 

2255 # remove all null chars: 

2256 value = value.replace(b'\x00', b'') 

2257 size = 4 + count 

2258 elif property_type == VT_BLOB: 

2259 # binary large object (BLOB) 

2260 # see https://msdn.microsoft.com/en-us/library/dd942282.aspx 

2261 count = i32(s, offset) 

2262 value = s[offset+4:offset+4+count] 

2263 size = 4 + count 

2264 elif property_type == VT_LPWSTR: 

2265 # UnicodeString 

2266 # see https://msdn.microsoft.com/en-us/library/dd942313.aspx 

2267 # "the string should NOT contain embedded or additional trailing 

2268 # null characters." 

2269 count = i32(s, offset+4) 

2270 value = self._decode_utf16_str(s[offset+4:offset+4+count*2]) 

2271 size = 4 + count * 2 

2272 elif property_type == VT_FILETIME: 

2273 value = long(i32(s, offset)) + (long(i32(s, offset+4))<<32) 

2274 # FILETIME is a 64-bit int: "number of 100ns periods 

2275 # since Jan 1,1601". 

2276 if convert_time and property_id not in no_conversion: 

2277 log.debug('Converting property #%d to python datetime, value=%d=%fs' 

2278 %(property_id, value, float(value)/10000000)) 

2279 # convert FILETIME to Python datetime.datetime 

2280 # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/ 

2281 _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) 

2282 log.debug('timedelta days=%d' % (value//(10*1000000*3600*24))) 

2283 value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10) 

2284 else: 

2285 # legacy code kept for backward compatibility: returns a 

2286 # number of seconds since Jan 1,1601 

2287 value = value // 10000000 # seconds 

2288 size = 8 

2289 elif property_type == VT_UI1: # 1-byte unsigned integer 

2290 value = i8(s[offset]) 

2291 size = 1 

2292 elif property_type == VT_CLSID: 

2293 value = _clsid(s[offset:offset+16]) 

2294 size = 16 

2295 elif property_type == VT_CF: 

2296 # PropertyIdentifier or ClipboardData?? 

2297 # see https://msdn.microsoft.com/en-us/library/dd941945.aspx 

2298 count = i32(s, offset) 

2299 value = s[offset+4:offset+4+count] 

2300 size = 4 + count 

2301 elif property_type == VT_BOOL: 

2302 # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True 

2303 # see https://msdn.microsoft.com/en-us/library/cc237864.aspx 

2304 value = bool(i16(s, offset)) 

2305 size = 2 

2306 else: 

2307 value = None # everything else yields "None" 

2308 log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type)) 

2309 

2310 # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE, 

2311 # VT_DECIMAL, VT_I1, VT_I8, VT_UI8, 

2312 # see https://msdn.microsoft.com/en-us/library/dd942033.aspx 

2313 

2314 #print("%08x" % property_id, repr(value), end=" ") 

2315 #print("(%s)" % VT[i32(s, offset) & 0xFFF]) 

2316 return value, size 

2317 

2318 

2319 def get_metadata(self): 

2320 """ 

2321 Parse standard properties streams, return an OleMetadata object 

2322 containing all the available metadata. 

2323 (also stored in the metadata attribute of the OleFileIO object) 

2324 

2325 new in version 0.25 

2326 """ 

2327 self.metadata = OleMetadata() 

2328 self.metadata.parse_properties(self) 

2329 return self.metadata 

2330 

    def get_userdefined_properties(self, filename, convert_time=False, no_conversion=None):
        """
        Return user-defined properties described in a property set substream.

        :param filename: path of stream in storage tree (see openstream for syntax)
        :param convert_time: bool, if True timestamps will be converted to Python datetime
        :param no_conversion: None or list of int, timestamps not to be converted
            (for example total editing time is not a real timestamp)

        :returns: list of dicts, each with a 'property_name' key (from the
            section's dictionary) and a 'value' key (parsed property value,
            or None when no value could be parsed)
        """
        # REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
        # REFERENCE: https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-oshared/2ea8be67-a4a0-4e2e-b42f-49a182645562
        #'D5CDD502-2E9C-101B-9397-08002B2CF9AE'
        # TODO: testing the code more rigorously
        # TODO: adding exception handeling
        # FMTID of the user-defined properties section, stored little-endian:
        FMTID_USERDEFINED_PROPERTIES = _clsid(b'\x05\xD5\xCD\xD5\x9C\x2E\x1B\x10\x93\x97\x08\x00\x2B\x2C\xF9\xAE')

        # make sure no_conversion is a list, just to simplify code below:
        if no_conversion == None:
            no_conversion = []
        # stream path as a string to report exceptions:
        streampath = filename
        if not isinstance(streampath, str):
            streampath = '/'.join(streampath)

        fp = self.openstream(filename)

        data = []

        # header (PropertySetStream: ByteOrder, Version, SystemIdentifier, CLSID, NumPropertySets)
        s = fp.read(28)
        clsid = _clsid(s[8:24])

        # PropertySetStream.cSections (4 bytes starts at 1c): number of property sets in this stream
        sections_count = i32(s, 24)

        section_file_pointers = []

        try:
            for i in range(sections_count):
                # FMTID/Offset pair for this property set (16-byte format id
                # + 4-byte offset of the section within the stream):
                s = fp.read(20)
                fmtid = _clsid(s[:16])

                if fmtid == FMTID_USERDEFINED_PROPERTIES:
                    file_pointer = i32(s, 16)
                    fp.seek(file_pointer)
                    # read saved sections
                    # first 4 bytes are the section size; the b"****" padding
                    # keeps the offsets read below relative to the section start:
                    s = b"****" + fp.read(i32(fp.read(4)) - 4)
                    # number of properties:
                    num_props = i32(s, 4)

                    # array of PropertyIdentifierAndOffset pairs (8 bytes each):
                    PropertyIdentifierAndOffset = s[8: 8+8*num_props]

                    # property names (dictionary)
                    # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/99127b7f-c440-4697-91a4-c853086d6b33
                    # NOTE(review): this assumes the Dictionary property
                    # immediately follows the identifier/offset table — TODO
                    # confirm; per MS-OLEPS its location is given by its own
                    # offset entry.
                    index = 8+8*num_props
                    entry_count = i32(s[index: index+4])
                    index += 4
                    # NOTE: this loop reuses `i`, shadowing the outer section
                    # loop variable (harmless here, but worth knowing).
                    for i in range(entry_count):
                        identifier = s[index: index +4]
                        str_size = i32(s[index+4: index + 8])
                        string = s[index+8: index+8+str_size].decode('utf_8').strip('\0')
                        data.append({'property_name':string, 'value':None})
                        index = index+8+str_size
                    # clamp num_props based on the data length
                    num_props = min(num_props, int(len(s) / 8))

                    # property values
                    # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/f122b9d7-e5cf-4484-8466-83f6fd94b3cc
                    # start at 2: the first two identifier/offset pairs are
                    # skipped (presumably the Dictionary and CodePage
                    # properties — TODO confirm), hence data[i-2] below.
                    for i in iterrange(2, num_props):
                        property_id = 0 # just in case of an exception
                        try:
                            property_id = i32(s, 8 + i * 8)
                            offset = i32(s, 12 + i * 8)
                            property_type = i32(s, offset)

                            vt_name = VT.get(property_type, 'UNKNOWN')
                            log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))

                            # test for common types first (should perhaps use
                            # a dictionary instead?)

                            if property_type == VT_I2: # 16-bit signed integer
                                value = i16(s, offset + 4)
                                # i16 is unsigned; map to signed range:
                                if value >= 32768:
                                    value = value - 65536
                            elif property_type == 1:
                                # supposed to be VT_NULL but seems it is not NULL
                                str_size = i32(s, offset + 8)
                                value = s[offset + 12:offset + 12 + str_size - 1]

                            elif property_type == VT_UI2: # 2-byte unsigned integer
                                value = i16(s, offset + 4)
                            elif property_type in (VT_I4, VT_INT, VT_ERROR):
                                # VT_I4: 32-bit signed integer
                                # VT_ERROR: HRESULT, similar to 32-bit signed integer,
                                # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
                                value = i32(s, offset + 4)
                            elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
                                value = i32(s, offset + 4) # FIXME
                            elif property_type in (VT_BSTR, VT_LPSTR):
                                # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
                                # size is a 32 bits integer, including the null terminator, and
                                # possibly trailing or embedded null chars
                                # TODO: if codepage is unicode, the string should be converted as such
                                count = i32(s, offset + 4)
                                value = s[offset + 8:offset + 8 + count - 1]
                                # remove all null chars:
                                value = value.replace(b'\x00', b'')
                            elif property_type == VT_BLOB:
                                # binary large object (BLOB)
                                # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
                                count = i32(s, offset + 4)
                                value = s[offset + 8:offset + 8 + count]
                            elif property_type == VT_LPWSTR:
                                # UnicodeString
                                # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
                                # "the string should NOT contain embedded or additional trailing
                                # null characters."
                                count = i32(s, offset + 4)
                                value = self._decode_utf16_str(s[offset + 8:offset + 8 + count * 2])
                            elif property_type == VT_FILETIME:
                                # FILETIME is a 64-bit int: "number of 100ns periods
                                # since Jan 1,1601".
                                value = long(i32(s, offset + 4)) + (long(i32(s, offset + 8)) << 32)
                                if convert_time and property_id not in no_conversion:
                                    log.debug('Converting property #%d to python datetime, value=%d=%fs'
                                              % (property_id, value, float(value) / 10000000))
                                    # convert FILETIME to Python datetime.datetime
                                    # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
                                    _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
                                    log.debug('timedelta days=%d' % (value // (10 * 1000000 * 3600 * 24)))
                                    value = _FILETIME_null_date + datetime.timedelta(microseconds=value // 10)
                                else:
                                    # legacy code kept for backward compatibility: returns a
                                    # number of seconds since Jan 1,1601
                                    value = value // 10000000 # seconds
                            elif property_type == VT_UI1: # 1-byte unsigned integer
                                value = i8(s[offset + 4])
                            elif property_type == VT_CLSID:
                                value = _clsid(s[offset + 4:offset + 20])
                            elif property_type == VT_CF:
                                # PropertyIdentifier or ClipboardData??
                                # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
                                count = i32(s, offset + 4)
                                value = s[offset + 8:offset + 8 + count]
                            elif property_type == VT_BOOL:
                                # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True
                                # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
                                value = bool(i16(s, offset + 4))
                            else:
                                value = None # everything else yields "None"
                                log.debug(
                                    'property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))

                            # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
                            # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
                            # see https://msdn.microsoft.com/en-us/library/dd942033.aspx

                            # FIXME: add support for VT_VECTOR
                            # VT_VECTOR is a 32 uint giving the number of items, followed by
                            # the items in sequence. The VT_VECTOR value is combined with the
                            # type of items, e.g. VT_VECTOR|VT_BSTR
                            # see https://msdn.microsoft.com/en-us/library/dd942011.aspx

                            # print("%08x" % property_id, repr(value), end=" ")
                            # print("(%s)" % VT[i32(s, offset) & 0xFFF])

                            # i starts at 2 (see loop above), so i-2 matches
                            # the order names were appended to data:
                            data[i-2]['value']=value
                        except BaseException as exc:
                            # catch exception while parsing each property, and only raise
                            # a DEFECT_INCORRECT, because parsing can go on
                            msg = 'Error while parsing property id %d in stream %s: %s' % (
                                property_id, repr(streampath), exc)
                            self._raise_defect(DEFECT_INCORRECT, msg, type(exc))

        except BaseException as exc:
            # catch exception while parsing property header, and only raise
            # a DEFECT_INCORRECT then return an empty dict, because this is not
            # a fatal error when parsing the whole file
            msg = 'Error while parsing properties header in stream %s: %s' % (
                repr(streampath), exc)
            self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
            # return whatever was parsed so far (if _raise_defect did not raise):
            return data

        return data

2519 

2520 

    def get_document_variables(self):
        """
        Extract the document variables from Microsft Word docs

        Reads the FIB from the 'WordDocument' stream, follows fcStwUser into
        the '1Table' stream, and parses the StwUser structure (variable names
        in SttbNames, values in rgxchNames).

        :return: it returns a list of dictionaries, each of them contains var_name and value keys
        """
        # TODO: testing the code more rigorously
        # TODO: adding exception handeling
        data = []
        word_fp = self.openstream(['WordDocument'])

        # Read fcStwUser from the WordDocument stream
        # fcStwUser (4 bytes): An unsigned integer which is an offset in 1Table Stream that StwUser locates.
        # fcStwUser is the 121th field in fibRgFcLcb97 (index 120)
        fib_base = word_fp.read(32)
        nfib = i16(fib_base[2:4])
        # Only the Word 97 FIB layout is handled; other versions return []:
        if nfib == 0x00C1:  # fibRgFcLcb97
            # variable-length FIB parts: each is a 16-bit count followed by
            # that many 16-bit (fibRgW) or 32-bit (fibRgLw, fibRgFcLcb) fields
            csw = i16(word_fp.read(2))
            fibRgW = word_fp.read(csw * 2)
            cslw = i16(word_fp.read(2))
            fibRgLw = word_fp.read(cslw * 4)
            cbRgFcLcb = i16(word_fp.read(2))
            fibRgFcLcbBlob = word_fp.read(cbRgFcLcb * 4)
            # offset and byte count of StwUser in the 1Table stream:
            fcStwUser = i32(fibRgFcLcbBlob[120*4:121*4])
            lcbStwUser = i32(fibRgFcLcbBlob[121 * 4:122 * 4])

            if lcbStwUser > 0:
                # Read StwUser from 1Table stream (WordDocument.fcStwUser points to this structure)
                # this structure contains variable names and assigned values
                # NOTE(review): assumes the table stream is named '1Table';
                # per [MS-DOC] it may be '0Table' depending on fWhichTblStm —
                # TODO confirm.
                table_fp = self.openstream(['1Table'])
                table_fp.seek(fcStwUser)

                # SttbNames header: 2-byte fExtend marker, cData, cbExtra
                ss = table_fp.read(6)

                # 0xFFFF marker means entries are UTF-16 (2 bytes per char):
                char_size = 1
                if ss[:2] == b'\xff\xff':
                    char_size = 2

                # number of name/value entries:
                cdata = i16(ss[2:])

                # extra bytes appended to each SttbNames entry:
                cbExtra = i16(ss[4:])

                # SttbNames (array, contains variable names)
                for i in range(cdata):
                    cchData = i16(table_fp.read(2))
                    data_str = table_fp.read(cchData *char_size )
                    if char_size == 2:
                        data_str = self._decode_utf16_str(data_str)
                    data.append({'var_name':data_str, 'value':''})
                    extra = table_fp.read(cbExtra)

                # rgxchNames (array, contains values corresponding to variable names in SttbNames)
                for i in range(cdata):
                    cchData = i16(table_fp.read(2))
                    data_str = table_fp.read(cchData *char_size)
                    if char_size == 2:
                        data_str = self._decode_utf16_str(data_str)
                    data[i]['value'] = data_str

        return data

2581 

2582# -------------------------------------------------------------------- 

2583# This script can be used to dump the directory of any OLE2 structured 

2584# storage file. 

2585 

def main():
    """
    Main function when olefile is run as a script from the command line.
    This will open an OLE2 file and display its structure and properties
    :return: nothing
    """
    import sys, optparse

    DEFAULT_LOG_LEVEL = "warning" # Default log level
    # map of -l/--loglevel option values to logging constants:
    LOG_LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)

    parser.add_option("-c", action="store_true", dest="check_streams",
                      help='check all streams (for debugging purposes)')
    parser.add_option("-v", action="store_true", dest="extract_customvar",
                      help='extract all document variables')
    parser.add_option("-p", action="store_true", dest="extract_customprop",
                      help='extract all user-defined propertires')
    parser.add_option("-d", action="store_true", dest="debug_mode",
                      help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    print('olefile version {} {} - https://www.decalage.info/en/olefile\n'.format(__version__, __date__))

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # -d is a shortcut for -l debug:
    if options.debug_mode:
        options.loglevel = 'debug'

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')

    # also enable the module's logger:
    enable_logging()

    for filename in args:
        try:
            ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT)
            print("-" * 68)
            print(filename)
            print("-" * 68)
            ole.dumpdirectory()
            for streamname in ole.listdir():
                # property stream names start with \005 by convention:
                if streamname[-1][0] == "\005":
                    print("%r: properties" % streamname)
                    try:
                        props = ole.getproperties(streamname, convert_time=True)
                        props = sorted(props.items())
                        for k, v in props:
                            # [PL]: avoid to display too large or binary values:
                            if isinstance(v, (basestring, bytes)):
                                if len(v) > 50:
                                    v = v[:50]
                            if isinstance(v, bytes):
                                # quick and dirty binary check:
                                for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
                                          21,22,23,24,25,26,27,28,29,30,31):
                                    if c in bytearray(v):
                                        v = '(binary data)'
                                        break
                            print(" ", k, v)
                    except Exception:
                        log.exception('Error while parsing property stream %r' % streamname)

                    try:
                        if options.extract_customprop:
                            variables = ole.get_userdefined_properties(streamname, convert_time=True)
                            if len(variables):
                                print("%r: user-defined properties" % streamname)
                                for index, variable in enumerate(variables):
                                    print('\t{} {}: {}'.format(index, variable['property_name'],variable['value']))

                    except:
                        log.exception('Error while parsing user-defined property stream %r' % streamname)
                elif options.extract_customvar and streamname[-1]=="WordDocument":
                    print("%r: document variables" % streamname)
                    variables = ole.get_document_variables()

                    for index, var in enumerate(variables):
                        print('\t{} {}: {}'.format(index, var['var_name'], var['value'][:50]))
                    print("")


            if options.check_streams:
                # Read all streams to check if there are errors:
                print('\nChecking streams...')
                for streamname in ole.listdir():
                    # print name using repr() to convert binary chars to \xNN:
                    print('-', repr('/'.join(streamname)),'-', end=' ')
                    st_type = ole.get_type(streamname)
                    if st_type == STGTY_STREAM:
                        print('size %d' % ole.get_size(streamname))
                        # just try to read stream in memory:
                        ole.openstream(streamname)
                    else:
                        print('NOT a stream : type=%d' % st_type)
                print()

            # for streamname in ole.listdir():
            #     # print name using repr() to convert binary chars to \xNN:
            #     print('-', repr('/'.join(streamname)),'-', end=' ')
            #     print(ole.getmtime(streamname))
            # print()

            print('Modification/Creation times of all directory entries:')
            for entry in ole.direntries:
                if entry is not None:
                    print('- {}: mtime={} ctime={}'.format(entry.name,
                        entry.getmtime(), entry.getctime()))
            print()

            # parse and display metadata:
            try:
                meta = ole.get_metadata()
                meta.dump()
            except Exception:
                log.exception('Error while parsing metadata')
            print()
            # [PL] Test a few new methods:
            root = ole.get_rootentry_name()
            print('Root entry name: "%s"' % root)
            if ole.exists('worddocument'):
                print("This is a Word document.")
                print("type of stream 'WordDocument':", ole.get_type('worddocument'))
                print("size :", ole.get_size('worddocument'))
                if ole.exists('macros/vba'):
                    print("This document may contain VBA macros.")

            # print parsing issues:
            print('\nNon-fatal issues raised during parsing:')
            if ole.parsing_issues:
                for exctype, msg in ole.parsing_issues:
                    print('- {}: {}'.format(exctype.__name__, msg))
            else:
                print('None')
            ole.close()
        except Exception:
            # per-file catch-all so one broken file does not stop the batch:
            log.exception('Error while parsing file %r' % filename)

2739 

2740 

# Run the command-line interface when executed as a script:
if __name__ == "__main__":
    main()

# this code was developed while listening to The Wedding Present "Sea Monsters"