Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/olefile/olefile.py: 44%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2olefile (formerly OleFileIO_PL)
4Module to read/write Microsoft OLE2 files (also called Structured Storage or
5Microsoft Compound Document File Format), such as Microsoft Office 97-2003
6documents, Image Composer and FlashPix files, Outlook messages, ...
7This version is compatible with Python 2.7 and 3.5+
9Project website: https://www.decalage.info/olefile
11olefile is copyright (c) 2005-2023 Philippe Lagadec
12(https://www.decalage.info)
14olefile is based on the OleFileIO module from the PIL library v1.1.7
15See: http://www.pythonware.com/products/pil/index.htm
16and http://svn.effbot.org/public/tags/pil-1.1.7/PIL/OleFileIO.py
18The Python Imaging Library (PIL) is
19Copyright (c) 1997-2009 by Secret Labs AB
20Copyright (c) 1995-2009 by Fredrik Lundh
22See source code and LICENSE.txt for information on usage and redistribution.
23"""
25# Since olefile v0.47, only Python 2.7 and 3.5+ are supported
26# This import enables print() as a function rather than a keyword
27# (main requirement to be compatible with Python 3.x)
28# The comment on the line below should be printed on Python 2.5 or older:
29from __future__ import print_function # This version of olefile requires Python 2.7 or 3.5+.
32#--- LICENSE ------------------------------------------------------------------
34# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2023 Philippe Lagadec
35# (https://www.decalage.info)
36#
37# All rights reserved.
38#
39# Redistribution and use in source and binary forms, with or without modification,
40# are permitted provided that the following conditions are met:
41#
42# * Redistributions of source code must retain the above copyright notice, this
43# list of conditions and the following disclaimer.
44# * Redistributions in binary form must reproduce the above copyright notice,
45# this list of conditions and the following disclaimer in the documentation
46# and/or other materials provided with the distribution.
47#
48# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
49# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
50# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
51# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
52# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
54# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
55# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
56# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
57# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59# ----------
60# PIL License:
61#
62# olefile is based on source code from the OleFileIO module of the Python
63# Imaging Library (PIL) published by Fredrik Lundh under the following license:
65# The Python Imaging Library (PIL) is
66# Copyright (c) 1997-2009 by Secret Labs AB
67# Copyright (c) 1995-2009 by Fredrik Lundh
68#
69# By obtaining, using, and/or copying this software and/or its associated
70# documentation, you agree that you have read, understood, and will comply with
71# the following terms and conditions:
72#
73# Permission to use, copy, modify, and distribute this software and its
74# associated documentation for any purpose and without fee is hereby granted,
75# provided that the above copyright notice appears in all copies, and that both
76# that copyright notice and this permission notice appear in supporting
77# documentation, and that the name of Secret Labs AB or the author(s) not be used
78# in advertising or publicity pertaining to distribution of the software
79# without specific, written prior permission.
80#
81# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
82# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
83# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL,
84# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
85# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
86# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
87# PERFORMANCE OF THIS SOFTWARE.
# Module metadata: release date, version and author of olefile
__date__ = "2023-12-01"
__version__ = '0.47'
__author__ = "Philippe Lagadec"

# Public API of this module: names exported by "from olefile import *"
__all__ = ['isOleFile', 'OleFileIO', 'OleMetadata', 'enable_logging',
           'MAGIC', 'STGTY_EMPTY',
           'STGTY_STREAM', 'STGTY_STORAGE', 'STGTY_ROOT', 'STGTY_PROPERTY',
           'STGTY_LOCKBYTES', 'MINIMAL_OLEFILE_SIZE',
           'DEFECT_UNSURE', 'DEFECT_POTENTIAL', 'DEFECT_INCORRECT',
           'DEFECT_FATAL', 'DEFAULT_PATH_ENCODING',
           'MAXREGSECT', 'DIFSECT', 'FATSECT', 'ENDOFCHAIN', 'FREESECT',
           'MAXREGSID', 'NOSTREAM', 'UNKNOWN_SIZE', 'WORD_CLSID',
           'OleFileIONotClosed'
]
104import io
105import sys
106import struct, array, os.path, datetime, logging, warnings, traceback
#=== COMPATIBILITY WORKAROUNDS ================================================

# For Python 3.x, need to redefine long as int:
# (on Python 2, str and bytes are the same type, so this condition is only
# true on Python 3, where the long type no longer exists)
if str is not bytes:
    long = int

# Need to make sure we use xrange both on Python 2 and 3.x:
try:
    # on Python 2 we need xrange:
    iterrange = xrange
except Exception:
    # no xrange, for Python 3 it was renamed as range:
    iterrange = range

# [PL] workaround to fix an issue with array item size on 64 bits systems:
# probe the array typecodes to find one that stores exactly 4-byte values,
# so that FAT entries (32-bit sector indexes) are stored correctly:
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
elif array.array('i').itemsize == 4:
    # On 64 bit Jython, signed integers ('i') are the only way to store our 32
    # bit values in an array in a *somewhat* reasonable way, as the otherwise
    # perfectly suited 'H' (unsigned int, 32 bits) results in a completely
    # unusable behaviour. This is most likely caused by the fact that Java
    # doesn't have unsigned values, and thus Jython's "array" implementation,
    # which is based on "jarray", doesn't have them either.
    # NOTE: to trick Jython into converting the values it would normally
    # interpret as "signed" into "unsigned", a binary-and operation with
    # 0xFFFFFFFF can be used. This way it is possible to use the same comparing
    # operations on all platforms / implementations. The corresponding code
    # lines are flagged with a 'JYTHON-WORKAROUND' tag below.
    UINT32 = 'i'
else:
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')


# [PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
# TODO: remove the use of basestring, as it was removed in Python 3
try:
    basestring
except NameError:
    basestring = str

if sys.version_info[0] < 3:
    # On Python 2.x, the default encoding for path names is UTF-8:
    DEFAULT_PATH_ENCODING = 'utf-8'
else:
    # On Python 3.x, the default encoding for path names is Unicode (None):
    DEFAULT_PATH_ENCODING = None
162# === LOGGING =================================================================
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger object suitable for this module.

    The goal is not to change settings of the root logger, to avoid getting
    other modules' logs on the screen. A logger with the given name is
    reused when it already exists (creating it twice would duplicate
    handlers and therefore messages); otherwise a new one is created with a
    single NullHandler, leaving the actual logging configuration up to the
    application. In both cases the logger level is (re)set to the requested
    value, which defaults to CRITICAL+1 so that nothing is logged at all.

    :param name: str, name of the logger (usually the module name)
    :param level: int, logging level (default CRITICAL+1 to disable logging)
    :returns: logging.Logger object
    """
    # Check for a pre-existing logger BEFORE calling getLogger, because
    # getLogger itself registers the name in loggerDict:
    already_registered = name in logging.Logger.manager.loggerDict
    # NOTE: another less intrusive but more "hackish" way to detect reuse
    # would be to call getLogger then test its effective level.
    logger = logging.getLogger(name)
    if not already_registered:
        # fresh logger: only attach a NullHandler, it is up to the
        # application to configure its own logging:
        logger.addHandler(logging.NullHandler())
    # in all cases, make sure the level is the requested one:
    logger.setLevel(level)
    return logger
# a global logger object used for debugging:
# (silent by default, since get_logger sets its level to CRITICAL+1;
# call enable_logging() to let the application control the level)
log = get_logger('olefile')
def enable_logging():
    """
    Enable logging for this module (disabled by default).

    This sets the module-specific logger level to NOTSET, which means
    the main application controls the actual logging level (the level
    is then inherited from the logging configuration of the application).
    """
    log.setLevel(logging.NOTSET)
#=== CONSTANTS ===============================================================

#: magic bytes that should be at the beginning of every OLE file:
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'

# [PL]: added constants for Sector IDs (from AAF specifications)
# (special values stored in the FAT instead of a real sector index)
MAXREGSECT = 0xFFFFFFFA  #: (-6) maximum SECT
DIFSECT    = 0xFFFFFFFC  #: (-4) denotes a DIFAT sector in a FAT
FATSECT    = 0xFFFFFFFD  #: (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE  #: (-2) end of a virtual stream chain
FREESECT   = 0xFFFFFFFF  #: (-1) unallocated sector

# [PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID = 0xFFFFFFFA  #: (-6) maximum directory entry ID
NOSTREAM  = 0xFFFFFFFF  #: (-1) unallocated directory entry

# [PL] object types in storage (from AAF specifications)
STGTY_EMPTY     = 0  #: empty directory entry
STGTY_STORAGE   = 1  #: element is a storage object
STGTY_STREAM    = 2  #: element is a stream object
STGTY_LOCKBYTES = 3  #: element is an ILockBytes object
STGTY_PROPERTY  = 4  #: element is an IPropertyStorage object
STGTY_ROOT      = 5  #: element is a root storage

# Unknown size for a stream (used by OleStream):
UNKNOWN_SIZE = 0x7FFFFFFF

#
# --------------------------------------------------------------------
# property types

VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6;
VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11;
VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17;
VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23;
VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28;
VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64;
VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68;
VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72;
VT_VECTOR=0x1000;

# map property id to name (for debugging purposes)
# (built by scanning the module namespace for the VT_* constants above)
VT = {}
for keyword, var in list(vars().items()):
    if keyword[:3] == "VT_":
        VT[var] = keyword

#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
# TODO: check Excel, PPT, ...

# [PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE    = 10  # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20  # a potential defect
DEFECT_INCORRECT = 30  # an error according to specifications, but parsing
                       # can go on
DEFECT_FATAL     = 40  # an error which cannot be ignored, parsing is
                       # impossible

# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFileIO.open)
MINIMAL_OLEFILE_SIZE = 1536
270#=== FUNCTIONS ===============================================================
def isOleFile (filename=None, data=None):
    """
    Test if a file is an OLE container (according to the magic bytes in its header).

    .. note::
        This function only checks the first 8 bytes of the file, not the
        rest of the OLE structure.
        If data is provided, it also checks if the file size is above
        the minimal size of an OLE file (1536 bytes).
        If filename is provided with the path of the file on disk, the file is
        open only to read the first 8 bytes, then closed.

    .. versionadded:: 0.16

    :param filename: filename, contents or file-like object of the OLE file (string-like or file-like object)

        - if data is provided, filename is ignored.
        - if filename is a unicode string, it is used as path of the file to open on disk.
        - if filename is a bytes string smaller than 1536 bytes, it is used as path
          of the file to open on disk.
        - [deprecated] if filename is a bytes string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
          Note that this use case is deprecated and should be replaced by the new data parameter
        - if filename is a file-like object (with read and seek methods),
          it is parsed as-is.
    :type filename: bytes, str, unicode or file-like object

    :param data: bytes string with the contents of the file to be checked, when the file is in memory
                 (added in olefile 0.47)
    :type data: bytes

    :returns: True if OLE, False otherwise.
    :rtype: bool
    """
    header = None
    # first check if data is provided and large enough
    if data is not None:
        if len(data) >= MINIMAL_OLEFILE_SIZE:
            header = data[:len(MAGIC)]
        else:
            # the file is too small, cannot be OLE
            return False
    # check if filename is a string-like or file-like object:
    elif hasattr(filename, 'read') and hasattr(filename, 'seek'):
        # file-like object: use it directly
        header = filename.read(len(MAGIC))
        # just in case, seek back to start of file:
        filename.seek(0)
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # filename is a bytes string containing the OLE file to be parsed:
        # (deprecated use case, kept for backward compatibility)
        header = filename[:len(MAGIC)]
    else:
        # string-like object: filename of file on disk
        # (the file is opened only to read the 8-byte header, then closed)
        with open(filename, 'rb') as fp:
            header = fp.read(len(MAGIC))
    # the file is OLE if and only if the header matches the magic bytes:
    # (simplified from the original if/else returning True/False)
    return header == MAGIC
if bytes is str:
    # version for Python 2.x: indexing a bytes (=str) object yields a
    # 1-character string, which must be converted with ord():
    def i8(c):
        """Return the byte c (1-char str) as an unsigned 8-bit integer."""
        return ord(c)
else:
    # version for Python 3.x: indexing a bytes object already yields an int,
    # so only a 1-byte bytes string needs conversion:
    def i8(c):
        """Return c as an unsigned 8-bit integer (int passthrough, else c[0])."""
        if c.__class__ is int:
            return c
        return c[0]
def i16(c, o = 0):
    """
    Decode a 16-bit unsigned integer stored as 2 little-endian bytes.

    :param c: bytes string containing the bytes to convert
    :param o: int, offset of the 2 bytes to convert within c (default 0)
    :returns: int, decoded unsigned 16-bit value
    """
    (value,) = struct.unpack("<H", c[o:o+2])
    return value
def i32(c, o = 0):
    """
    Decode a 32-bit unsigned integer stored as 4 little-endian bytes.

    :param c: bytes string containing the bytes to convert
    :param o: int, offset of the 4 bytes to convert within c (default 0)
    :returns: int, decoded unsigned 32-bit value
    """
    (value,) = struct.unpack("<I", c[o:o+4])
    return value
def _clsid(clsid):
    """
    Convert a 16-byte binary CLSID into its human-readable string form,
    e.g. "00020900-0000-0000-C000-000000000046".

    :param clsid: bytes string of length 16 (binary CLSID)
    :returns: str, formatted CLSID, or an empty string when clsid is all zeroes
    """
    assert len(clsid) == 16
    # a CLSID made only of null bytes is displayed as an empty string:
    # (PL: why not simply return the string with zeroes?)
    if not clsid.strip(b"\0"):
        return ""
    # the first three fields are little-endian (uint32, uint16, uint16),
    # the remaining 8 bytes are shown in storage order:
    fields = struct.unpack('<IHH8B', clsid)
    return '%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X' % fields
def filetime2datetime(filetime):
    """
    Convert a FILETIME value (64-bit int, number of 100ns units since
    1601-01-01) to a Python datetime.datetime object (naive, no timezone).

    :param filetime: int, FILETIME timestamp
    :returns: datetime.datetime
    """
    # TODO: manage exception when microseconds is too large
    # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
    # FILETIME counts 100ns units, i.e. 1/10 of a microsecond:
    epoch_1601 = datetime.datetime(1601, 1, 1, 0, 0, 0)
    # log.debug('timedelta days=%d' % (filetime//(10*1000000*3600*24)))
    return epoch_1601 + datetime.timedelta(microseconds=filetime // 10)
392#=== CLASSES ==================================================================
class OleFileError(IOError):
    """
    Generic base error for this module.

    (inherits from IOError so existing callers catching IOError still work)
    """
    pass
class NotOleFileError(OleFileError):
    """
    Error raised when the opened file is not an OLE file.
    """
    pass
class OleMetadata:
    """
    Class to parse and store metadata from standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd942545.aspx
    - https://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - https://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/articles/summary-information-stream/
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    New in version 0.25
    """

    # attribute names for SummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
                       'template', 'last_saved_by', 'revision_number', 'total_edit_time',
                       'last_printed', 'create_time', 'last_saved_time', 'num_pages',
                       'num_words', 'num_chars', 'thumbnail', 'creating_application',
                       'security']

    # attribute names for DocumentSummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
                      'slides', 'notes', 'hidden_slides', 'mm_clips',
                      'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
                      'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
                      'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
                      'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Constructor for OleMetadata.
        All attributes are set to None by default.
        """
        # Set every known property attribute to None, driven by the two
        # attribute lists above. This replaces 47 repeated manual
        # assignments and guarantees __init__ stays consistent with
        # SUMMARY_ATTRIBS/DOCSUM_ATTRIBS and parse_properties.
        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
            setattr(self, attrib, None)

    def parse_properties(self, ole_file):
        """
        Parse standard properties of an OLE file, from the streams
        ``\\x05SummaryInformation`` and ``\\x05DocumentSummaryInformation``,
        if present.
        Properties are converted to strings, integers or python datetime objects.
        If a property is not present, its value is set to None.

        :param ole_file: OleFileIO object from which to parse properties
        """
        # first set all attributes to None:
        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
            setattr(self, attrib, None)
        if ole_file.exists("\x05SummaryInformation"):
            # get properties from the stream:
            # (converting timestamps to python datetime, except total_edit_time,
            # which is property #10)
            props = ole_file.getproperties("\x05SummaryInformation",
                                           convert_time=True, no_conversion=[10])
            # store them into this object's attributes:
            for i in range(len(self.SUMMARY_ATTRIBS)):
                # ids for standard properties start at 0x01:
                # SUMMARY_ATTRIBS[i] corresponds to property id i+1
                value = props.get(i+1, None)
                setattr(self, self.SUMMARY_ATTRIBS[i], value)
        if ole_file.exists("\x05DocumentSummaryInformation"):
            # get properties from the stream:
            props = ole_file.getproperties("\x05DocumentSummaryInformation",
                                           convert_time=True)
            # store them into this object's attributes:
            for i in range(len(self.DOCSUM_ATTRIBS)):
                # DOCSUM_ATTRIBS[i] corresponds to property id i+1
                value = props.get(i+1, None)
                setattr(self, self.DOCSUM_ATTRIBS[i], value)

    def dump(self):
        """
        Dump all metadata, for debugging purposes.
        """
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            value = getattr(self, prop)
            print('- {}: {}'.format(prop, repr(value)))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            value = getattr(self, prop)
            print('- {}: {}'.format(prop, repr(value)))
class OleFileIONotClosed(RuntimeWarning):
    """
    Warning type used when OleFileIO is destructed but has open file handle.
    """
    def __init__(self, stack_of_open=None):
        """
        :param stack_of_open: optional stack summary of the open() call
            (e.g. a traceback.StackSummary), used to locate the leak
        """
        super(OleFileIONotClosed, self).__init__()
        self.stack_of_open = stack_of_open

    def __str__(self):
        # base warning message, identical whether or not a stack is known:
        msg = ('Deleting OleFileIO instance with open file handle. '
               'You should ensure that OleFileIO is never deleted '
               'without calling close() first. Consider using '
               '"with OleFileIO(...) as ole: ...".')
        if not self.stack_of_open:
            return msg
        # when available, append the stacktrace of the open() call:
        parts = [msg, '\n', 'Stacktrace of open() call:\n']
        parts.extend(self.stack_of_open.format())
        return ''.join(parts)
# --- OleStream ---------------------------------------------------------------

class OleStream(io.BytesIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the BytesIO class).
    To open a stream, use the openstream method in the OleFileIO class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:

        - size: actual size of data stream, after it was opened.
    """
    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize, olefileio):
        """
        Constructor for OleStream class.

        Reads the whole stream into memory at once, by following the chain
        of sector indexes in the given FAT starting from sect, then passes
        the collected bytes to the BytesIO constructor.

        :param fp: file object, the OLE container or the MiniFAT stream
        :param sect: sector index of first sector in the stream
        :param size: total size of the stream (may be UNKNOWN_SIZE)
        :param offset: offset in bytes for the first FAT or MiniFAT sector
        :param sectorsize: size of one sector
        :param fat: array/list of sector indexes (FAT or MiniFAT)
        :param filesize: size of OLE file (for debugging)
        :param olefileio: OleFileIO object containing this stream
        :returns: a BytesIO instance containing the OLE stream
        :raises OSError: if the OleFileIO's file object is already closed
        """
        log.debug('OleStream.__init__:')
        log.debug('  sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        self.ole = olefileio
        # this check is necessary, otherwise when attempting to open a stream
        # from a closed OleFileIO, a stream of size zero is returned without
        # raising an exception. (see issue #81)
        if self.ole.fp.closed:
            raise OSError('Attempting to open a stream from a closed OLE File')
        # [PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size == UNKNOWN_SIZE:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            log.debug('  stream with UNKNOWN SIZE')
        # number of sectors = size rounded up to a whole number of sectors:
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            self.ole._raise_defect(DEFECT_INCORRECT, 'malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            log.debug('size == 0 and sect != ENDOFCHAIN:')
            self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE sector index for empty stream')
        # [PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks (a FAT loop would otherwise never end):
        for i in range(nb_sectors):
            log.debug('Reading stream sector[%d] = %Xh' % (i, sect))
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    log.debug('Reached ENDOFCHAIN sector for stream with unknown size')
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    log.debug('sect=ENDOFCHAIN before expected size')
                    self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE stream')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
##                tmp_data = b"".join(data)
##                f = open('test_debug.bin', 'wb')
##                f.write(tmp_data)
##                f.close()
##                log.debug('data read so far: %d bytes' % len(tmp_data))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
            # TODO: merge this code with OleFileIO.getsect() ?
            # TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            except Exception:
                log.debug('sect=%d, seek=%d, filesize=%d' %
                          (sect, offset+sectorsize*sect, filesize))
                self.ole._raise_defect(DEFECT_INCORRECT, 'OLE sector index out of range')
                # stop reading here if the exception is ignored:
                break
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                # mask to 32 bits so signed array values compare correctly:
                sect = fat[sect] & 0xFFFFFFFF  # JYTHON-WORKAROUND
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
        # [PL] Last sector should be a "end of chain" marker:
        # if sect != ENDOFCHAIN:
        #     raise IOError('incorrect last sector index in OLE stream')
        data = b"".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            log.debug('Read data of length %d, truncated to stream size %d' % (len(data), size))
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            log.debug('Read data of length %d, the stream size was unknown' % len(data))
            self.size = len(data)
        else:
            # read data is less than expected:
            log.debug('Read data of length %d, less than expected stream size %d' % (len(data), size))
            # TODO: provide details in exception message
            self.size = len(data)
            self.ole._raise_defect(DEFECT_INCORRECT, 'OLE stream size is less than declared')
        # when all data is read in memory, BytesIO constructor is called
        io.BytesIO.__init__(self, data)
        # Then the OleStream object can be used as a read-only file object.
730# --- OleDirectoryEntry -------------------------------------------------------
732class OleDirectoryEntry:
733 """
734 OLE2 Directory Entry pointing to a stream or a storage
735 """
736 # struct to parse directory entries:
737 # <: little-endian byte order, standard sizes
738 # (note: this should guarantee that Q returns a 64 bits int)
739 # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
740 # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
741 # B: uint8, dir entry type (between 0 and 5)
742 # B: uint8, color: 0=black, 1=red
743 # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
744 # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
745 # I: uint32, index of child root node if it is a storage, else NOSTREAM
746 # 16s: CLSID, unique identifier (only used if it is a storage)
747 # I: uint32, user flags
748 # Q (was 8s): uint64, creation timestamp or zero
749 # Q (was 8s): uint64, modification timestamp or zero
750 # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
751 # of stream containing ministreams if root entry, 0 otherwise
752 # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
753 # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
754 STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII'
755 # size of a directory entry: 128 bytes
756 DIRENTRY_SIZE = 128
757 assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE
    def __init__(self, entry, sid, ole_file):
        """
        Constructor for an OleDirectoryEntry object.
        Parses a 128-bytes entry from the OLE Directory stream.

        Defects found during parsing are reported through
        ole_file._raise_defect, which may or may not raise depending on the
        minimal defect level chosen by the caller.

        :param bytes entry: bytes string (must be 128 bytes long)
        :param int sid: index of this directory entry in the OLE file directory
        :param OleFileIO ole_file: OleFileIO object containing this directory entry
        """
        self.sid = sid
        # ref to ole_file is stored for future use
        self.olefile = ole_file
        # kids is a list of children entries, if this entry is a storage:
        # (list of OleDirectoryEntry objects)
        self.kids = []
        # kids_dict is a dictionary of children entries, indexed by their
        # name in lowercase: used to quickly find an entry, and to detect
        # duplicates
        self.kids_dict = {}
        # flag used to detect if the entry is referenced more than once in
        # directory:
        self.used = False
        # decode DirEntry
        (
            self.name_raw,      # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
            self.namelength,    # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
            self.entry_type,
            self.color,
            self.sid_left,
            self.sid_right,
            self.sid_child,
            clsid,
            self.dwUserFlags,
            self.createTime,
            self.modifyTime,
            self.isectStart,
            self.sizeLow,
            self.sizeHigh
        ) = struct.unpack(OleDirectoryEntry.STRUCT_DIRENTRY, entry)
        if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
            ole_file._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
        # only first directory entry can (and should) be root:
        if self.entry_type == STGTY_ROOT and sid != 0:
            ole_file._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
        if sid == 0 and self.entry_type != STGTY_ROOT:
            ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
        # log.debug(struct.unpack(fmt_entry, entry[:len_entry]))
        # name should be at most 31 unicode characters + null character,
        # so 64 bytes in total (31*2 + 2):
        if self.namelength > 64:
            ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes')
            # if exception not raised, namelength is clamped to the maximum value:
            self.namelength = 64
        # only characters without ending null char are kept:
        self.name_utf16 = self.name_raw[:(self.namelength-2)]
        # TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
        # TODO: check if the name does not contain forbidden characters:
        # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
        # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
        self.name = ole_file._decode_utf16_str(self.name_utf16)

        log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
        log.debug(' - type: %d' % self.entry_type)
        log.debug(' - sect: %Xh' % self.isectStart)
        log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
            self.sid_right, self.sid_child))

        # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
        # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
        # or some other value so it cannot be raised as a defect in general:
        if ole_file.sectorsize == 512:
            if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF:
                log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                    (ole_file.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh))
                ole_file._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
            self.size = self.sizeLow
        else:
            # 4K sectors: combine low and high 32-bit halves into a 64-bit size
            self.size = self.sizeLow + (long(self.sizeHigh)<<32)
        log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh))

        self.clsid = _clsid(clsid)
        # a storage should have a null size, BUT some implementations such as
        # Word 8 for Mac seem to allow non-null values => Potential defect:
        if self.entry_type == STGTY_STORAGE and self.size != 0:
            ole_file._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
        # check if stream is not already referenced elsewhere:
        self.is_minifat = False
        if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
            if self.size < ole_file.minisectorcutoff \
            and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
                # ministream object
                self.is_minifat = True
            else:
                self.is_minifat = False
            ole_file._check_duplicate_stream(self.isectStart, self.is_minifat)
        # sector chain is built lazily by build_sect_chain():
        self.sect_chain = None
856 def build_sect_chain(self, ole_file):
857 """
858 Build the sector chain for a stream (from the FAT or the MiniFAT)
860 :param OleFileIO ole_file: OleFileIO object containing this directory entry
861 :return: nothing
862 """
863 # TODO: seems to be used only from _write_mini_stream, is it useful?
864 # TODO: use self.olefile instead of ole_file
865 if self.sect_chain:
866 return
867 if self.entry_type not in (STGTY_ROOT, STGTY_STREAM) or self.size == 0:
868 return
870 self.sect_chain = list()
872 if self.is_minifat and not ole_file.minifat:
873 ole_file.loadminifat()
875 next_sect = self.isectStart
876 while next_sect != ENDOFCHAIN:
877 self.sect_chain.append(next_sect)
878 if self.is_minifat:
879 next_sect = ole_file.minifat[next_sect]
880 else:
881 next_sect = ole_file.fat[next_sect]
883 def build_storage_tree(self):
884 """
885 Read and build the red-black tree attached to this OleDirectoryEntry
886 object, if it is a storage.
887 Note that this method builds a tree of all subentries, so it should
888 only be called for the root object once.
889 """
890 log.debug('build_storage_tree: SID=%d - %s - sid_child=%d'
891 % (self.sid, repr(self.name), self.sid_child))
892 if self.sid_child != NOSTREAM:
893 # if child SID is not NOSTREAM, then this entry is a storage.
894 # Let's walk through the tree of children to fill the kids list:
895 self.append_kids(self.sid_child)
897 # Note from OpenOffice documentation: the safest way is to
898 # recreate the tree because some implementations may store broken
899 # red-black trees...
901 # in the OLE file, entries are sorted on (length, name).
902 # for convenience, we sort them on name instead:
903 # (see rich comparison methods in this class)
904 self.kids.sort()
    def append_kids(self, child_sid):
        """
        Walk through red-black tree of children of this directory entry to add
        all of them to the kids list. (recursive method)

        Performs an in-order traversal (left subtree, node, right subtree) so
        kids are visited in tree order; duplicate references and duplicate
        names are reported as defects.

        :param child_sid: index of child directory entry to use, or None when called
            first time for the root. (only used during recursion)
        """
        log.debug('append_kids: child_sid=%d' % child_sid)
        # [PL] this method was added to use simple recursion instead of a complex
        # algorithm.
        # if this is not a storage or a leaf of the tree, nothing to do:
        if child_sid == NOSTREAM:
            return
        # check if child SID is in the proper range:
        if child_sid<0 or child_sid>=len(self.olefile.direntries):
            self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE DirEntry index out of range')
        else:
            # get child direntry:
            child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
            log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
                % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
            # Check if kid was not already referenced in a storage:
            if child.used:
                self.olefile._raise_defect(DEFECT_INCORRECT,
                    'OLE Entry referenced more than once')
                # if defect was not raised, stop here to avoid infinite recursion
                return
            child.used = True
            # the directory entries are organized as a red-black tree.
            # (cf. Wikipedia for details)
            # First walk through left side of the tree:
            self.append_kids(child.sid_left)
            # Check if its name is not already used (case-insensitive):
            name_lower = child.name.lower()
            if name_lower in self.kids_dict:
                self.olefile._raise_defect(DEFECT_INCORRECT,
                    "Duplicate filename in OLE storage")
            # Then the child_sid OleDirectoryEntry object is appended to the
            # kids list and dictionary:
            self.kids.append(child)
            self.kids_dict[name_lower] = child
            # Finally walk through right side of the tree:
            self.append_kids(child.sid_right)
            # Afterwards build kid's own tree if it's also a storage:
            child.build_storage_tree()
952 def __eq__(self, other):
953 "Compare entries by name"
954 return self.name == other.name
956 def __lt__(self, other):
957 "Compare entries by name"
958 return self.name < other.name
960 def __ne__(self, other):
961 return not self.__eq__(other)
963 def __le__(self, other):
964 return self.__eq__(other) or self.__lt__(other)
966 # Reflected __lt__() and __le__() will be used for __gt__() and __ge__()
968 # TODO: replace by the same function as MS implementation ?
969 # (order by name length first, then case-insensitive order)
971 def dump(self, tab = 0):
972 "Dump this entry, and all its subentries (for debug purposes only)"
973 TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
974 "(property)", "(root)"]
975 try:
976 type_name = TYPES[self.entry_type]
977 except IndexError:
978 type_name = '(UNKNOWN)'
979 print(" "*tab + repr(self.name), type_name, end=' ')
980 if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
981 print(self.size, "bytes", end=' ')
982 print()
983 if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
984 print(" "*tab + "{%s}" % self.clsid)
986 for kid in self.kids:
987 kid.dump(tab + 2)
989 def getmtime(self):
990 """
991 Return modification time of a directory entry.
993 :returns: None if modification time is null, a python datetime object
994 otherwise (UTC timezone)
996 new in version 0.26
997 """
998 if self.modifyTime == 0:
999 return None
1000 return filetime2datetime(self.modifyTime)
1003 def getctime(self):
1004 """
1005 Return creation time of a directory entry.
1007 :returns: None if modification time is null, a python datetime object
1008 otherwise (UTC timezone)
1010 new in version 0.26
1011 """
1012 if self.createTime == 0:
1013 return None
1014 return filetime2datetime(self.createTime)
1017#--- OleFileIO ----------------------------------------------------------------
class OleFileIO:
    """
    OLE container object

    This class encapsulates the interface to an OLE 2 structured
    storage file. Use the listdir and openstream methods to
    access the contents of this file.

    Object names are given as a list of strings, one for each subentry
    level. The root entry should be omitted. For example, the following
    code extracts all image streams from a Microsoft Image Composer file::

        with OleFileIO("fan.mic") as ole:

            for entry in ole.listdir():
                if entry[1:2] == ["Image"]:
                    fin = ole.openstream(entry)
                    fout = open(entry[0], "wb")
                    while True:
                        s = fin.read(8192)
                        if not s:
                            break
                        fout.write(s)

    You can use the viewer application provided with the Python Imaging
    Library to view the resulting files (which happens to be standard
    TIFF files).
    """
1048 def __init__(self, filename=None, raise_defects=DEFECT_FATAL,
1049 write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING):
1050 """
1051 Constructor for the OleFileIO class.
1053 :param filename: file to open.
1055 - if filename is a string smaller than 1536 bytes, it is the path
1056 of the file to open. (bytes or unicode string)
1057 - if filename is a string longer than 1535 bytes, it is parsed
1058 as the content of an OLE file in memory. (bytes type only)
1059 - if filename is a file-like object (with read, seek and tell methods),
1060 it is parsed as-is. The caller is responsible for closing it when done.
1062 :param raise_defects: minimal level for defects to be raised as exceptions.
1063 (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
1064 security-oriented application, see source code for details)
1066 :param write_mode: bool, if True the file is opened in read/write mode instead
1067 of read-only by default.
1069 :param debug: bool, set debug mode (deprecated, not used anymore)
1071 :param path_encoding: None or str, name of the codec to use for path
1072 names (streams and storages), or None for Unicode.
1073 Unicode by default on Python 3+, UTF-8 on Python 2.x.
1074 (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
1075 """
1076 # minimal level for defects to be raised as exceptions:
1077 self._raise_defects_level = raise_defects
1078 #: list of defects/issues not raised as exceptions:
1079 #: tuples of (exception type, message)
1080 self.parsing_issues = []
1081 self.write_mode = write_mode
1082 self.path_encoding = path_encoding
1083 # initialize all attributes to default values:
1084 self._filesize = None
1085 self.ministream = None
1086 self._used_streams_fat = []
1087 self._used_streams_minifat = []
1088 self.byte_order = None
1089 self.directory_fp = None
1090 self.direntries = None
1091 self.dll_version = None
1092 self.fat = None
1093 self.first_difat_sector = None
1094 self.first_dir_sector = None
1095 self.first_mini_fat_sector = None
1096 self.fp = None
1097 self.header_clsid = None
1098 self.header_signature = None
1099 self.metadata = None
1100 self.mini_sector_shift = None
1101 self.mini_sector_size = None
1102 self.mini_stream_cutoff_size = None
1103 self.minifat = None
1104 self.minifatsect = None
1105 # TODO: duplicates?
1106 self.minisectorcutoff = None
1107 self.minisectorsize = None
1108 self.ministream = None
1109 self.minor_version = None
1110 self.nb_sect = None
1111 self.num_difat_sectors = None
1112 self.num_dir_sectors = None
1113 self.num_fat_sectors = None
1114 self.num_mini_fat_sectors = None
1115 self.reserved1 = None
1116 self.reserved2 = None
1117 self.root = None
1118 self.sector_shift = None
1119 self.sector_size = None
1120 self.transaction_signature_number = None
1121 self.warn_if_not_closed = False
1122 self._we_opened_fp = False
1123 self._open_stack = None
1124 if filename:
1125 # try opening, ensure fp is closed if that fails
1126 try:
1127 self.open(filename, write_mode=write_mode)
1128 except Exception:
1129 # caller has no chance of calling close() now
1130 self._close(warn=False)
1131 raise
    def __del__(self):
        """Destructor, ensures all file handles are closed that we opened (may warn if enabled)."""
        self._close(warn=True)
        # super(OleFileIO, self).__del__()  # there's no super-class destructor
    def __enter__(self):
        """Context manager entry: enables `with OleFileIO(...) as ole:`."""
        return self
    def __exit__(self, *args):
        """Context manager exit: close the file without emitting a warning."""
        self._close(warn=False)
1147 def _raise_defect(self, defect_level, message, exception_type=OleFileError):
1148 """
1149 This method should be called for any defect found during file parsing.
1150 It may raise an OleFileError exception according to the minimal level chosen
1151 for the OleFileIO object.
1153 :param defect_level: defect level, possible values are:
1155 - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect
1156 - DEFECT_POTENTIAL : a potential defect
1157 - DEFECT_INCORRECT : an error according to specifications, but parsing can go on
1158 - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible
1160 :param message: string describing the defect, used with raised exception.
1161 :param exception_type: exception class to be raised, OleFileError by default
1162 """
1163 # added by [PL]
1164 if defect_level >= self._raise_defects_level:
1165 log.error(message)
1166 raise exception_type(message)
1167 else:
1168 # just record the issue, no exception raised:
1169 self.parsing_issues.append((exception_type, message))
1170 log.warning(message)
1173 def _decode_utf16_str(self, utf16_str, errors='replace'):
1174 """
1175 Decode a string encoded in UTF-16 LE format, as found in the OLE
1176 directory or in property streams. Return a string encoded
1177 according to the path_encoding specified for the OleFileIO object.
1179 :param bytes utf16_str: bytes string encoded in UTF-16 LE format
1180 :param str errors: str, see python documentation for str.decode()
1181 :return: str, encoded according to path_encoding
1182 :rtype: str
1183 """
1184 unicode_str = utf16_str.decode('UTF-16LE', errors)
1185 if self.path_encoding:
1186 # an encoding has been specified for path names:
1187 return unicode_str.encode(self.path_encoding, errors)
1188 else:
1189 # path_encoding=None, return the Unicode string as-is:
1190 return unicode_str
    def open(self, filename, write_mode=False):
        """
        Open an OLE2 file in read-only or read/write mode.
        Read and parse the header, FAT and directory.

        :param filename: string-like or file-like object, OLE file to parse

            - if filename is a string smaller than 1536 bytes, it is the path
              of the file to open. (bytes or unicode string)
            - if filename is a string longer than 1535 bytes, it is parsed
              as the content of an OLE file in memory. (bytes type only)
            - if filename is a file-like object (with read, seek and tell methods),
              it is parsed as-is. The caller is responsible for closing it when done

        :param write_mode: bool, if True the file is opened in read/write mode instead
            of read-only by default. (ignored if filename is not a path)
        """
        self.write_mode = write_mode
        # [PL] check if filename is a string-like or file-like object:
        # (it is better to check for a read() method)
        if hasattr(filename, 'read'):
            # TODO: also check seek and tell methods?
            # file-like object: use it directly
            self.fp = filename
        elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
            # filename is a bytes string containing the OLE file to be parsed:
            # convert it to BytesIO
            self.fp = io.BytesIO(filename)
        else:
            # string-like object: filename of file on disk
            if self.write_mode:
                # open file in mode 'read with update, binary'
                # According to https://docs.python.org/library/functions.html#open
                # 'w' would truncate the file, 'a' may only append on some Unixes
                mode = 'r+b'
            else:
                # read-only mode by default
                mode = 'rb'
            self.fp = open(filename, mode)
            self._we_opened_fp = True
            # the stack is saved so the not-closed warning can show where
            # the file was opened:
            self._open_stack = traceback.extract_stack()    # remember for warning
        # obtain the filesize by using seek and tell, which should work on most
        # file-like objects:
        # TODO: do it above, using getsize with filename when possible?
        # TODO: fix code to fail with clear exception when filesize cannot be obtained
        filesize = 0
        self.fp.seek(0, os.SEEK_END)
        try:
            filesize = self.fp.tell()
        finally:
            self.fp.seek(0)
        self._filesize = filesize
        log.debug('File size: %d bytes (%Xh)' % (self._filesize, self._filesize))

        # lists of streams in FAT and MiniFAT, to detect duplicate references
        # (list of indexes of first sectors of each stream)
        self._used_streams_fat = []
        self._used_streams_minifat = []

        header = self.fp.read(512)

        if len(header) != 512 or header[:8] != MAGIC:
            log.debug('Magic = {!r} instead of {!r}'.format(header[:8], MAGIC))
            self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file", NotOleFileError)

        # [PL] header structure according to AAF specifications:
        ##Header
        ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
        ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
        ## // 0x1a, 0xe1} for current version
        ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/
        ## // GetClassFile uses root directory class id)
        ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
        ## // written by reference implementation
        ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for
        ## // 512-byte sectors, 4 for 4 KB sectors
        ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
        ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two;
        ## // typically 9 indicating 512-byte sectors
        ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
        ## // typically 6 indicating 64-byte mini-sectors
        ##USHORT _usReserved; // [22H,02] reserved, must be zero
        ##ULONG _ulReserved1; // [24H,04] reserved, must be zero
        ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
        ## // number of SECTs in directory chain for 4 KB
        ## // sectors
        ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
        ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
        ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
        ## // be zero. The reference implementation
        ## // does not support transactions
        ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
        ## // typically 4096 bytes
        ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
        ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
        ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
        ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
        ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
        ##};

        # [PL] header decoding:
        # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
        fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
        header_size = struct.calcsize(fmt_header)
        log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
        header1 = header[:header_size]
        (
            self.header_signature,
            self.header_clsid,
            self.minor_version,
            self.dll_version,
            self.byte_order,
            self.sector_shift,
            self.mini_sector_shift,
            self.reserved1,
            self.reserved2,
            self.num_dir_sectors,
            self.num_fat_sectors,
            self.first_dir_sector,
            self.transaction_signature_number,
            self.mini_stream_cutoff_size,
            self.first_mini_fat_sector,
            self.num_mini_fat_sectors,
            self.first_difat_sector,
            self.num_difat_sectors
        ) = struct.unpack(fmt_header, header1)
        log.debug( struct.unpack(fmt_header, header1))

        if self.header_signature != MAGIC:
            # OLE signature should always be present
            self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
        if self.header_clsid != bytearray(16):
            # according to AAF specs, CLSID should always be zero
            self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
        log.debug( "Minor Version = %d" % self.minor_version )
        # TODO: according to MS-CFB, minor version should be 0x003E
        log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version )
        if self.dll_version not in [3, 4]:
            # version 3: usual format, 512 bytes per sector
            # version 4: large format, 4K per sector
            self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
        log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order )
        if self.byte_order != 0xFFFE:
            # For now only common little-endian documents are handled correctly
            self._raise_defect(DEFECT_INCORRECT, "incorrect ByteOrder in OLE header")
            # TODO: add big-endian support for documents created on Mac ?
            # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE.
        self.sector_size = 2**self.sector_shift
        log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size )
        if self.sector_size not in [512, 4096]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header")
        if (self.dll_version==3 and self.sector_size!=512) \
        or (self.dll_version==4 and self.sector_size!=4096):
            self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header")
        self.mini_sector_size = 2**self.mini_sector_shift
        log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size )
        if self.mini_sector_size not in [64]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header")
        if self.reserved1 != 0 or self.reserved2 != 0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
        log.debug( "Number of Directory sectors = %d" % self.num_dir_sectors )
        # Number of directory sectors (only allowed if DllVersion != 3)
        if self.sector_size==512 and self.num_dir_sectors!=0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header")
        log.debug( "Number of FAT sectors = %d" % self.num_fat_sectors )
        # num_fat_sectors = number of FAT sectors in the file
        log.debug( "First Directory sector = %Xh" % self.first_dir_sector )
        # first_dir_sector = 1st sector containing the directory
        log.debug( "Transaction Signature Number = %d" % self.transaction_signature_number )
        # Signature should be zero, BUT some implementations do not follow this
        # rule => only a potential defect:
        # (according to MS-CFB, may be != 0 for applications supporting file
        # transactions)
        if self.transaction_signature_number != 0:
            self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)")
        log.debug( "Mini Stream cutoff size = %Xh (expected: 1000h)" % self.mini_stream_cutoff_size )
        # MS-CFB: This integer field MUST be set to 0x00001000. This field
        # specifies the maximum size of a user-defined data stream allocated
        # from the mini FAT and mini stream, and that cutoff is 4096 bytes.
        # Any user-defined data stream larger than or equal to this cutoff size
        # must be allocated as normal sectors from the FAT.
        if self.mini_stream_cutoff_size != 0x1000:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header")
            # if no exception is raised, the cutoff size is fixed to 0x1000
            log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' %
                        self.mini_stream_cutoff_size)
            self.mini_stream_cutoff_size = 0x1000
        # TODO: check if these values are OK
        log.debug( "First MiniFAT sector = %Xh" % self.first_mini_fat_sector )
        log.debug( "Number of MiniFAT sectors = %d" % self.num_mini_fat_sectors )
        log.debug( "First DIFAT sector = %Xh" % self.first_difat_sector )
        log.debug( "Number of DIFAT sectors = %d" % self.num_difat_sectors )

        # calculate the number of sectors in the file
        # (-1 because header doesn't count)
        self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1
        log.debug( "Maximum number of sectors in the file: %d (%Xh)" % (self.nb_sect, self.nb_sect))
        # TODO: change this test, because an OLE file MAY contain other data
        # after the last sector.

        # file clsid
        self.header_clsid = _clsid(header[8:24])

        # TODO: remove redundant attributes, and fix the code which uses them?
        self.sectorsize = self.sector_size      #1 << i16(header, 30)
        self.minisectorsize = self.mini_sector_size     #1 << i16(header, 32)
        self.minisectorcutoff = self.mini_stream_cutoff_size    # i32(header, 56)

        # check known streams for duplicate references (these are always in FAT,
        # never in MiniFAT):
        self._check_duplicate_stream(self.first_dir_sector)
        # check MiniFAT only if it is not empty:
        if self.num_mini_fat_sectors:
            self._check_duplicate_stream(self.first_mini_fat_sector)
        # check DIFAT only if it is not empty:
        if self.num_difat_sectors:
            self._check_duplicate_stream(self.first_difat_sector)

        # Load file allocation tables
        self.loadfat(header)
        # Load directory. This sets both the direntries list (ordered by sid)
        # and the root (ordered by hierarchy) members.
        self.loaddirectory(self.first_dir_sector)
        self.minifatsect = self.first_mini_fat_sector
    def close(self):
        """
        close the OLE file, release the file object if we created it ourselves.

        Leaves the file handle open if it was provided by the caller.
        """
        self._close(warn=False)
1426 def _close(self, warn=False):
1427 """Implementation of close() with internal arg `warn`."""
1428 if self._we_opened_fp:
1429 if warn and self.warn_if_not_closed:
1430 # we only raise a warning if the file was not explicitly closed,
1431 # and if the option warn_if_not_closed is enabled
1432 warnings.warn(OleFileIONotClosed(self._open_stack))
1433 self.fp.close()
1434 self._we_opened_fp = False
1436 def _check_duplicate_stream(self, first_sect, minifat=False):
1437 """
1438 Checks if a stream has not been already referenced elsewhere.
1439 This method should only be called once for each known stream, and only
1440 if stream size is not null.
1442 :param first_sect: int, index of first sector of the stream in FAT
1443 :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT
1444 """
1445 if minifat:
1446 log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect)
1447 used_streams = self._used_streams_minifat
1448 else:
1449 log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect)
1450 # some values can be safely ignored (not a real stream):
1451 if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT):
1452 return
1453 used_streams = self._used_streams_fat
1454 # TODO: would it be more efficient using a dict or hash values, instead
1455 # of a list of long ?
1456 if first_sect in used_streams:
1457 self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice')
1458 else:
1459 used_streams.append(first_sect)
1461 def dumpfat(self, fat, firstindex=0):
1462 """
1463 Display a part of FAT in human-readable form for debugging purposes
1464 """
1465 # dictionary to convert special FAT values in human-readable strings
1466 VPL = 8 # values per line (8+1 * 8+1 = 81)
1467 fatnames = {
1468 FREESECT: "..free..",
1469 ENDOFCHAIN: "[ END. ]",
1470 FATSECT: "FATSECT ",
1471 DIFSECT: "DIFSECT "
1472 }
1473 nbsect = len(fat)
1474 nlines = (nbsect+VPL-1)//VPL
1475 print("index", end=" ")
1476 for i in range(VPL):
1477 print("%8X" % i, end=" ")
1478 print()
1479 for l in range(nlines):
1480 index = l*VPL
1481 print("%6X:" % (firstindex+index), end=" ")
1482 for i in range(index, index+VPL):
1483 if i>=nbsect:
1484 break
1485 sect = fat[i]
1486 aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND
1487 if aux in fatnames:
1488 name = fatnames[aux]
1489 else:
1490 if sect == i+1:
1491 name = " --->"
1492 else:
1493 name = "%8X" % sect
1494 print(name, end=" ")
1495 print()
1497 def dumpsect(self, sector, firstindex=0):
1498 """
1499 Display a sector in a human-readable form, for debugging purposes
1500 """
1501 VPL=8 # number of values per line (8+1 * 8+1 = 81)
1502 tab = array.array(UINT32, sector)
1503 if sys.byteorder == 'big':
1504 tab.byteswap()
1505 nbsect = len(tab)
1506 nlines = (nbsect+VPL-1)//VPL
1507 print("index", end=" ")
1508 for i in range(VPL):
1509 print("%8X" % i, end=" ")
1510 print()
1511 for l in range(nlines):
1512 index = l*VPL
1513 print("%6X:" % (firstindex+index), end=" ")
1514 for i in range(index, index+VPL):
1515 if i>=nbsect:
1516 break
1517 sect = tab[i]
1518 name = "%8X" % sect
1519 print(name, end=" ")
1520 print()
1522 def sect2array(self, sect):
1523 """
1524 convert a sector to an array of 32 bits unsigned integers,
1525 swapping bytes on big endian CPUs such as PowerPC (old Macs)
1526 """
1527 # TODO: make this a static function
1528 a = array.array(UINT32, sect)
1529 # if CPU is big endian, swap bytes:
1530 if sys.byteorder == 'big':
1531 a.byteswap()
1532 return a
    def loadfat_sect(self, sect):
        """
        Adds the indexes of the given sector to the FAT.

        :param sect: string containing the first FAT sector, or array of long integers
        :returns: index of last FAT sector, or None if the input was empty.
        """
        # a FAT sector is an array of ulong integers.
        if isinstance(sect, array.array):
            # if sect is already an array it is directly used
            fat1 = sect
        else:
            # if it's a raw sector, it is parsed in an array
            fat1 = self.sect2array(sect)
            # Display the sector contents only if the logging level is debug:
            if log.isEnabledFor(logging.DEBUG):
                self.dumpsect(sect)
        # The FAT is a sector chain starting at the first index of itself.
        # initialize isect, just in case the loop below runs zero times:
        isect = None
        for isect in fat1:
            isect = isect & 0xFFFFFFFF  # JYTHON-WORKAROUND (mask to unsigned 32 bits)
            log.debug("isect = %X" % isect)
            if isect == ENDOFCHAIN or isect == FREESECT:
                # the end of the sector chain has been reached
                log.debug("found end of sector chain")
                break
            # read the FAT sector
            s = self.getsect(isect)
            # parse it as an array of 32 bits integers, and add it to the
            # global FAT array (concatenation creates a new array each time)
            nextfat = self.sect2array(s)
            self.fat = self.fat + nextfat
        return isect
    def loadfat(self, header):
        """
        Load the FAT table.

        :param header: bytes, the first (header) sector of the file; the FAT
            sector indexes start at offset 76 within it.
        """
        # The 1st sector of the file contains sector numbers for the first 109
        # FAT sectors, right after the header which is 76 bytes long.
        # (always 109, whatever the sector size: 512 bytes = 76+4*109)
        # Additional sectors are described by DIF blocks

        log.debug('Loading the FAT table, starting with the 1st sector after the header')
        sect = header[76:512]
        log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )
        # fat = []
        # FAT is an array of 32 bits unsigned ints, it's more effective
        # to use an array than a list in Python.
        # It's initialized as empty first:
        self.fat = array.array(UINT32)
        self.loadfat_sect(sect)
        # self.dumpfat(self.fat)
        # for i in range(0, len(sect), 4):
        #     ix = i32(sect, i)
        #     # [PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
        #     if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
        #         break
        #     s = self.getsect(ix)
        #     # fat = fat + [i32(s, i) for i in range(0, len(s), 4)]
        #     fat = fat + array.array(UINT32, s)
        if self.num_difat_sectors != 0:
            log.debug('DIFAT is used, because file size > 6.8MB.')
            # [PL] There's a DIFAT because file is larger than 6.8MB
            # some checks just in case:
            if self.num_fat_sectors <= 109:
                # there must be at least 109 blocks in header and the rest in
                # DIFAT, so number of sectors must be >109.
                self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
            if self.first_difat_sector >= self.nb_sect:
                # initial DIFAT block index must be valid
                self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
            log.debug( "DIFAT analysis..." )
            # We compute the necessary number of DIFAT sectors :
            # Number of pointers per DIFAT sector = (sectorsize/4)-1
            # (-1 because the last pointer is the next DIFAT sector number)
            nb_difat_sectors = (self.sectorsize//4)-1
            # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
            nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors
            log.debug( "nb_difat = %d" % nb_difat )
            if self.num_difat_sectors != nb_difat:
                raise IOError('incorrect DIFAT')
            # walk the DIFAT sector chain; each sector lists FAT sector indexes
            # and ends with a pointer to the next DIFAT sector:
            isect_difat = self.first_difat_sector
            for i in iterrange(nb_difat):
                log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
                # TODO: check if corresponding FAT SID = DIFSECT
                sector_difat = self.getsect(isect_difat)
                difat = self.sect2array(sector_difat)
                # Display the sector contents only if the logging level is debug:
                if log.isEnabledFor(logging.DEBUG):
                    self.dumpsect(sector_difat)
                self.loadfat_sect(difat[:nb_difat_sectors])
                # last DIFAT pointer is next DIFAT sector:
                isect_difat = difat[nb_difat_sectors]
                log.debug( "next DIFAT sector: %X" % isect_difat )
            # checks:
            if isect_difat not in [ENDOFCHAIN, FREESECT]:
                # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
                raise IOError('incorrect end of DIFAT')
            # if len(self.fat) != self.num_fat_sectors:
            #     # FAT should contain num_fat_sectors blocks
            #     print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors))
            #     raise IOError('incorrect DIFAT')
        else:
            log.debug('No DIFAT, because file size < 6.8MB.')
        # since FAT is read from fixed-size sectors, it may contain more values
        # than the actual number of sectors in the file.
        # Keep only the relevant sector indexes:
        if len(self.fat) > self.nb_sect:
            log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
            self.fat = self.fat[:self.nb_sect]
        log.debug('FAT references %d sectors / Maximum %d sectors in file' % (len(self.fat), self.nb_sect))
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nFAT:')
            self.dumpfat(self.fat)
    def loadminifat(self):
        """
        Load the MiniFAT table.
        """
        # MiniFAT is stored in a standard sub-stream, pointed to by a header
        # field.
        # NOTE: there are two sizes to take into account for this stream:
        # 1) Stream size is calculated according to the number of sectors
        #    declared in the OLE header. This allocated stream may be more than
        #    needed to store the actual sector indexes.
        # (self.num_mini_fat_sectors is the number of sectors of size self.sector_size)
        stream_size = self.num_mini_fat_sectors * self.sector_size
        # 2) Actually used size is calculated by dividing the MiniStream size
        #    (given by root entry size) by the size of mini sectors, *4 for
        #    32 bits indexes:
        nb_minisectors = (self.root.size + self.mini_sector_size-1) // self.mini_sector_size
        used_size = nb_minisectors * 4
        log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
            (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors))
        if used_size > stream_size:
            # This is not really a problem, but may indicate a wrong implementation:
            self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
        # In any case, first read stream_size:
        s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
        # [PL] Old code replaced by an array:
        #self.minifat = [i32(s, i) for i in range(0, len(s), 4)]
        self.minifat = self.sect2array(s)
        # Then shrink the array to used size, to avoid indexes out of MiniStream:
        log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
        self.minifat = self.minifat[:nb_minisectors]
        log.debug('loadminifat(): len=%d' % len(self.minifat))
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nMiniFAT:')
            self.dumpfat(self.minifat)
1688 def getsect(self, sect):
1689 """
1690 Read given sector from file on disk.
1692 :param sect: int, sector index
1693 :returns: a string containing the sector data.
1694 """
1695 # From [MS-CFB]: A sector number can be converted into a byte offset
1696 # into the file by using the following formula:
1697 # (sector number + 1) x Sector Size.
1698 # This implies that sector #0 of the file begins at byte offset Sector
1699 # Size, not at 0.
1701 # [PL] the original code in PIL was wrong when sectors are 4KB instead of
1702 # 512 bytes:
1703 #self.fp.seek(512 + self.sectorsize * sect)
1704 # [PL]: added safety checks:
1705 #print("getsect(%X)" % sect)
1706 try:
1707 self.fp.seek(self.sectorsize * (sect+1))
1708 except Exception:
1709 log.debug('getsect(): sect=%X, seek=%d, filesize=%d' %
1710 (sect, self.sectorsize*(sect+1), self._filesize))
1711 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1712 sector = self.fp.read(self.sectorsize)
1713 if len(sector) != self.sectorsize:
1714 log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
1715 (sect, len(sector), self.sectorsize))
1716 self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
1717 return sector
1719 def write_sect(self, sect, data, padding=b'\x00'):
1720 """
1721 Write given sector to file on disk.
1723 :param sect: int, sector index
1724 :param data: bytes, sector data
1725 :param padding: single byte, padding character if data < sector size
1726 """
1727 if not isinstance(data, bytes):
1728 raise TypeError("write_sect: data must be a bytes string")
1729 if not isinstance(padding, bytes) or len(padding)!=1:
1730 raise TypeError("write_sect: padding must be a bytes string of 1 char")
1731 # TODO: we could allow padding=None for no padding at all
1732 try:
1733 self.fp.seek(self.sectorsize * (sect+1))
1734 except Exception:
1735 log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' %
1736 (sect, self.sectorsize*(sect+1), self._filesize))
1737 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1738 if len(data) < self.sectorsize:
1739 # add padding
1740 data += padding * (self.sectorsize - len(data))
1741 elif len(data) > self.sectorsize:
1742 raise ValueError("Data is larger than sector size")
1743 self.fp.write(data)
1745 def _write_mini_sect(self, fp_pos, data, padding = b'\x00'):
1746 """
1747 Write given sector to file on disk.
1749 :param fp_pos: int, file position
1750 :param data: bytes, sector data
1751 :param padding: single byte, padding character if data < sector size
1752 """
1753 if not isinstance(data, bytes):
1754 raise TypeError("write_mini_sect: data must be a bytes string")
1755 if not isinstance(padding, bytes) or len(padding) != 1:
1756 raise TypeError("write_mini_sect: padding must be a bytes string of 1 char")
1758 try:
1759 self.fp.seek(fp_pos)
1760 except Exception:
1761 log.debug('write_mini_sect(): fp_pos=%d, filesize=%d' %
1762 (fp_pos, self._filesize))
1763 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1764 len_data = len(data)
1765 if len_data < self.mini_sector_size:
1766 data += padding * (self.mini_sector_size - len_data)
1767 if self.mini_sector_size < len_data:
1768 raise ValueError("Data is larger than sector size")
1769 self.fp.write(data)
1771 def loaddirectory(self, sect):
1772 """
1773 Load the directory.
1775 :param sect: sector index of directory stream.
1776 """
1777 log.debug('Loading the Directory:')
1778 # The directory is stored in a standard
1779 # substream, independent of its size.
1781 # open directory stream as a read-only file:
1782 # (stream size is not known in advance)
1783 self.directory_fp = self._open(sect, force_FAT=True)
1785 # [PL] to detect malformed documents and avoid DoS attacks, the maximum
1786 # number of directory entries can be calculated:
1787 max_entries = self.directory_fp.size // 128
1788 log.debug('loaddirectory: size=%d, max_entries=%d' %
1789 (self.directory_fp.size, max_entries))
1791 # Create list of directory entries
1792 # self.direntries = []
1793 # We start with a list of "None" object
1794 self.direntries = [None] * max_entries
1795 # for sid in iterrange(max_entries):
1796 # entry = fp.read(128)
1797 # if not entry:
1798 # break
1799 # self.direntries.append(OleDirectoryEntry(entry, sid, self))
1800 # load root entry:
1801 root_entry = self._load_direntry(0)
1802 # Root entry is the first entry:
1803 self.root = self.direntries[0]
1804 # TODO: read ALL directory entries (ignore bad entries?)
1805 # TODO: adapt build_storage_tree to avoid duplicate reads
1806 # for i in range(1, max_entries):
1807 # self._load_direntry(i)
1808 # read and build all storage trees, starting from the root:
1809 self.root.build_storage_tree()
    def _load_direntry (self, sid):
        """
        Load a directory entry from the directory.
        This method should only be called once for each storage/stream when
        loading the directory.

        :param sid: index of storage/stream in the directory.
        :returns: a OleDirectoryEntry object

        :exception OleFileError: if the entry has already been referenced.
        """
        # check if SID is OK:
        if sid<0 or sid>=len(self.direntries):
            self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
        # check if entry was already referenced:
        if self.direntries[sid] is not None:
            self._raise_defect(DEFECT_INCORRECT,
                "double reference for OLE stream/storage")
            # if exception not raised (non-strict parsing mode), return the
            # already-loaded object instead of parsing it a second time:
            return self.direntries[sid]
        # each directory entry is exactly 128 bytes:
        self.directory_fp.seek(sid * 128)
        entry = self.directory_fp.read(128)
        self.direntries[sid] = OleDirectoryEntry(entry, sid, self)
        return self.direntries[sid]
1836 def dumpdirectory(self):
1837 """
1838 Dump directory (for debugging only)
1839 """
1840 self.root.dump()
    def _open(self, start, size = UNKNOWN_SIZE, force_FAT=False):
        """
        Open a stream, either in FAT or MiniFAT according to its size.
        (openstream helper)

        :param start: index of first sector
        :param size: size of stream (or nothing if size is unknown)
        :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT
            according to size. If True, it will always be opened in FAT.
        :returns: an OleStream object (read-only file-like)
        """
        log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' %
            (start, size, str(force_FAT)))
        # stream size is compared to the mini_stream_cutoff_size threshold:
        if size < self.minisectorcutoff and not force_FAT:
            # ministream object
            if not self.ministream:
                # load MiniFAT if it wasn't already done:
                self.loadminifat()
                # The first sector index of the miniFAT stream is stored in the
                # root directory entry:
                size_ministream = self.root.size
                log.debug('Opening MiniStream: sect=%Xh, size=%d' %
                    (self.root.isectStart, size_ministream))
                # the MiniStream itself is a regular FAT stream, opened once
                # and cached on self.ministream for subsequent calls:
                self.ministream = self._open(self.root.isectStart,
                    size_ministream, force_FAT=True)
            return OleStream(fp=self.ministream, sect=start, size=size,
                             offset=0, sectorsize=self.minisectorsize,
                             fat=self.minifat, filesize=self.ministream.size,
                             olefileio=self)
        else:
            # standard stream
            return OleStream(fp=self.fp, sect=start, size=size,
                             offset=self.sectorsize,
                             sectorsize=self.sectorsize, fat=self.fat,
                             filesize=self._filesize,
                             olefileio=self)
1879 def _list(self, files, prefix, node, streams=True, storages=False):
1880 """
1881 listdir helper
1883 :param files: list of files to fill in
1884 :param prefix: current location in storage tree (list of names)
1885 :param node: current node (OleDirectoryEntry object)
1886 :param streams: bool, include streams if True (True by default) - new in v0.26
1887 :param storages: bool, include storages if True (False by default) - new in v0.26
1888 (note: the root storage is never included)
1889 """
1890 prefix = prefix + [node.name]
1891 for entry in node.kids:
1892 if entry.entry_type == STGTY_STORAGE:
1893 # this is a storage
1894 if storages:
1895 # add it to the list
1896 files.append(prefix[1:] + [entry.name])
1897 # check its kids
1898 self._list(files, prefix, entry, streams, storages)
1899 elif entry.entry_type == STGTY_STREAM:
1900 # this is a stream
1901 if streams:
1902 # add it to the list
1903 files.append(prefix[1:] + [entry.name])
1904 else:
1905 self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an entry which is not a stream nor a storage.')
1907 def listdir(self, streams=True, storages=False):
1908 """
1909 Return a list of streams and/or storages stored in this file
1911 :param streams: bool, include streams if True (True by default) - new in v0.26
1912 :param storages: bool, include storages if True (False by default) - new in v0.26
1913 (note: the root storage is never included)
1914 :returns: list of stream and/or storage paths
1915 """
1916 files = []
1917 self._list(files, [], self.root, streams, storages)
1918 return files
1920 def _find(self, filename):
1921 """
1922 Returns directory entry of given filename. (openstream helper)
1923 Note: this method is case-insensitive.
1925 :param filename: path of stream in storage tree (except root entry), either:
1927 - a string using Unix path syntax, for example:
1928 'storage_1/storage_1.2/stream'
1929 - or a list of storage filenames, path to the desired stream/storage.
1930 Example: ['storage_1', 'storage_1.2', 'stream']
1932 :returns: sid of requested filename
1933 :exception IOError: if file not found
1934 """
1936 # if filename is a string instead of a list, split it on slashes to
1937 # convert to a list:
1938 if isinstance(filename, basestring):
1939 filename = filename.split('/')
1940 # walk across storage tree, following given path:
1941 node = self.root
1942 for name in filename:
1943 for kid in node.kids:
1944 if kid.name.lower() == name.lower():
1945 break
1946 else:
1947 raise IOError("file not found")
1948 node = kid
1949 return node.sid
1951 def openstream(self, filename):
1952 """
1953 Open a stream as a read-only file object (BytesIO).
1954 Note: filename is case-insensitive.
1956 :param filename: path of stream in storage tree (except root entry), either:
1958 - a string using Unix path syntax, for example:
1959 'storage_1/storage_1.2/stream'
1960 - or a list of storage filenames, path to the desired stream/storage.
1961 Example: ['storage_1', 'storage_1.2', 'stream']
1963 :returns: file object (read-only)
1964 :exception IOError: if filename not found, or if this is not a stream.
1965 """
1966 sid = self._find(filename)
1967 entry = self.direntries[sid]
1968 if entry.entry_type != STGTY_STREAM:
1969 raise IOError("this file is not a stream")
1970 return self._open(entry.isectStart, entry.size)
    def _write_mini_stream(self, entry, data_to_write):
        """
        Overwrite an existing MiniStream stream with new data of the same
        size. (write_stream helper)

        :param entry: directory entry of the stream to overwrite
            (presumably an OleDirectoryEntry; it must expose sect_chain and
            build_sect_chain)
        :param data_to_write: bytes, new stream content
        """
        # make sure the mini sector chains are built, for the stream itself
        # and for the root entry (which holds the MiniStream container):
        if not entry.sect_chain:
            entry.build_sect_chain(self)
        nb_sectors = len(entry.sect_chain)

        if not self.root.sect_chain:
            self.root.build_sect_chain(self)
        # number of mini sectors stored in one regular sector:
        block_size = self.sector_size // self.mini_sector_size
        for idx, sect in enumerate(entry.sect_chain):
            # map the mini sector index to (container sector, offset inside it):
            sect_base = sect // block_size
            sect_offset = sect % block_size
            # +1 because sector #0 starts after the header sector:
            fp_pos = (self.root.sect_chain[sect_base] + 1)*self.sector_size + sect_offset*self.mini_sector_size
            if idx < (nb_sectors - 1):
                data_per_sector = data_to_write[idx * self.mini_sector_size: (idx + 1) * self.mini_sector_size]
            else:
                # last mini sector may be shorter than mini_sector_size:
                data_per_sector = data_to_write[idx * self.mini_sector_size:]
            self._write_mini_sect(fp_pos, data_per_sector)
    def write_stream(self, stream_name, data):
        """
        Write a stream to disk. For now, it is only possible to replace an
        existing stream by data of the same size.

        :param stream_name: path of stream in storage tree (except root entry), either:

            - a string using Unix path syntax, for example:
              'storage_1/storage_1.2/stream'
            - or a list of storage filenames, path to the desired stream/storage.
              Example: ['storage_1', 'storage_1.2', 'stream']

        :param data: bytes, data to be written, must be the same size as the original
            stream.
        """
        if not isinstance(data, bytes):
            raise TypeError("write_stream: data must be a bytes string")
        sid = self._find(stream_name)
        entry = self.direntries[sid]
        if entry.entry_type != STGTY_STREAM:
            raise IOError("this is not a stream")
        size = entry.size
        # only same-size replacement is supported, so the FAT chain is reused:
        if size != len(data):
            raise ValueError("write_stream: data must be the same size as the existing stream")
        # small streams live in the MiniStream and are written separately:
        if size < self.minisectorcutoff and entry.entry_type != STGTY_ROOT:
            return self._write_mini_stream(entry = entry, data_to_write = data)

        sect = entry.isectStart
        # number of sectors to write
        nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        for i in range(nb_sectors):
            # try:
            #     self.fp.seek(offset + self.sectorsize * sect)
            # except Exception:
            #     log.debug('sect=%d, seek=%d' %
            #         (sect, offset+self.sectorsize*sect))
            #     raise IOError('OLE sector index out of range')
            # extract one sector from data, the last one being smaller:
            if i<(nb_sectors-1):
                data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize]
                # TODO: comment this if it works
                assert(len(data_sector)==self.sectorsize)
            else:
                data_sector = data [i*self.sectorsize:]
                # TODO: comment this if it works
                log.debug('write_stream: size=%d sectorsize=%d data_sector=%Xh size%%sectorsize=%d'
                    % (size, self.sectorsize, len(data_sector), size % self.sectorsize))
                assert(len(data_sector) % self.sectorsize==size % self.sectorsize)
            self.write_sect(sect, data_sector)
            # self.fp.write(data_sector)
            # jump to next sector in the FAT:
            try:
                sect = self.fat[sect]
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError('incorrect OLE FAT, sector index out of range')
        # [PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError('incorrect last sector index in OLE stream')
2051 def get_type(self, filename):
2052 """
2053 Test if given filename exists as a stream or a storage in the OLE
2054 container, and return its type.
2056 :param filename: path of stream in storage tree. (see openstream for syntax)
2057 :returns: False if object does not exist, its entry type (>0) otherwise:
2059 - STGTY_STREAM: a stream
2060 - STGTY_STORAGE: a storage
2061 - STGTY_ROOT: the root entry
2062 """
2063 try:
2064 sid = self._find(filename)
2065 entry = self.direntries[sid]
2066 return entry.entry_type
2067 except Exception:
2068 return False
2070 def getclsid(self, filename):
2071 """
2072 Return clsid of a stream/storage.
2074 :param filename: path of stream/storage in storage tree. (see openstream for
2075 syntax)
2076 :returns: Empty string if clsid is null, a printable representation of the clsid otherwise
2078 new in version 0.44
2079 """
2080 sid = self._find(filename)
2081 entry = self.direntries[sid]
2082 return entry.clsid
2084 def getmtime(self, filename):
2085 """
2086 Return modification time of a stream/storage.
2088 :param filename: path of stream/storage in storage tree. (see openstream for
2089 syntax)
2090 :returns: None if modification time is null, a python datetime object
2091 otherwise (UTC timezone)
2093 new in version 0.26
2094 """
2095 sid = self._find(filename)
2096 entry = self.direntries[sid]
2097 return entry.getmtime()
2099 def getctime(self, filename):
2100 """
2101 Return creation time of a stream/storage.
2103 :param filename: path of stream/storage in storage tree. (see openstream for
2104 syntax)
2105 :returns: None if creation time is null, a python datetime object
2106 otherwise (UTC timezone)
2108 new in version 0.26
2109 """
2110 sid = self._find(filename)
2111 entry = self.direntries[sid]
2112 return entry.getctime()
2114 def exists(self, filename):
2115 """
2116 Test if given filename exists as a stream or a storage in the OLE
2117 container.
2118 Note: filename is case-insensitive.
2120 :param filename: path of stream in storage tree. (see openstream for syntax)
2121 :returns: True if object exist, else False.
2122 """
2123 try:
2124 sid = self._find(filename)
2125 return True
2126 except Exception:
2127 return False
2129 def get_size(self, filename):
2130 """
2131 Return size of a stream in the OLE container, in bytes.
2133 :param filename: path of stream in storage tree (see openstream for syntax)
2134 :returns: size in bytes (long integer)
2135 :exception IOError: if file not found
2136 :exception TypeError: if this is not a stream.
2137 """
2138 sid = self._find(filename)
2139 entry = self.direntries[sid]
2140 if entry.entry_type != STGTY_STREAM:
2141 # TODO: Should it return zero instead of raising an exception ?
2142 raise TypeError('object is not an OLE stream')
2143 return entry.size
2145 def get_rootentry_name(self):
2146 """
2147 Return root entry name. Should usually be 'Root Entry' or 'R' in most
2148 implementations.
2149 """
2150 return self.root.name
2152 def getproperties(self, filename, convert_time=False, no_conversion=None):
2153 """
2154 Return properties described in substream.
2156 :param filename: path of stream in storage tree (see openstream for syntax)
2157 :param convert_time: bool, if True timestamps will be converted to Python datetime
2158 :param no_conversion: None or list of int, timestamps not to be converted
2159 (for example total editing time is not a real timestamp)
2161 :returns: a dictionary of values indexed by id (integer)
2162 """
2163 #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
2164 # make sure no_conversion is a list, just to simplify code below:
2165 if no_conversion == None:
2166 no_conversion = []
2167 # stream path as a string to report exceptions:
2168 streampath = filename
2169 if not isinstance(streampath, str):
2170 streampath = '/'.join(streampath)
2171 fp = self.openstream(filename)
2172 data = {}
2173 try:
2174 # header
2175 s = fp.read(28)
2176 clsid = _clsid(s[8:24])
2177 # format id
2178 s = fp.read(20)
2179 fmtid = _clsid(s[:16])
2180 fp.seek(i32(s, 16))
2181 # get section
2182 s = b"****" + fp.read(i32(fp.read(4))-4)
2183 # number of properties:
2184 num_props = i32(s, 4)
2185 except BaseException as exc:
2186 # catch exception while parsing property header, and only raise
2187 # a DEFECT_INCORRECT then return an empty dict, because this is not
2188 # a fatal error when parsing the whole file
2189 msg = 'Error while parsing properties header in stream {}: {}'.format(
2190 repr(streampath), exc)
2191 self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
2192 return data
2193 # clamp num_props based on the data length
2194 num_props = min(num_props, int(len(s) / 8))
2195 for i in iterrange(num_props):
2196 property_id = 0 # just in case of an exception
2197 try:
2198 property_id = i32(s, 8+i*8)
2199 offset = i32(s, 12+i*8)
2200 property_type = i32(s, offset)
2202 vt_name = VT.get(property_type, 'UNKNOWN')
2203 log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))
2205 value = self._parse_property(s, offset+4, property_id, property_type, convert_time, no_conversion)
2206 data[property_id] = value
2207 except BaseException as exc:
2208 # catch exception while parsing each property, and only raise
2209 # a DEFECT_INCORRECT, because parsing can go on
2210 msg = 'Error while parsing property id %d in stream %s: %s' % (
2211 property_id, repr(streampath), exc)
2212 self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
2214 return data
    def _parse_property(self, s, offset, property_id, property_type, convert_time, no_conversion):
        """
        Parse one property value from a property-set section, dispatching on
        its variant type; vector types are parsed element by element.
        (getproperties helper)

        :param s: bytes, the whole section buffer (starting with b"****")
        :param offset: int, offset of the property value within s
        :param property_id: int, property identifier (for logging)
        :param property_type: int, variant type (VT_*) of the property
        :param convert_time: bool, convert FILETIME values to datetime
        :param no_conversion: list of int, property ids not to be converted
        :returns: the parsed value, or None if the type is not implemented
        """
        v = None
        if property_type <= VT_BLOB or property_type in (VT_CLSID, VT_CF):
            # simple scalar type:
            v, _ = self._parse_property_basic(s, offset, property_id, property_type, convert_time, no_conversion)
        elif property_type == VT_VECTOR | VT_VARIANT:
            # vector of variants: each element carries its own type tag
            log.debug('property_type == VT_VECTOR | VT_VARIANT')
            off = 4
            count = i32(s, offset)
            values = []
            for _ in range(count):
                property_type = i32(s, offset + off)
                v, sz = self._parse_property_basic(s, offset + off + 4, property_id, property_type, convert_time, no_conversion)
                values.append(v)
                off += sz + 4
            v = values

        elif property_type & VT_VECTOR:
            # vector of a single scalar type:
            property_type_base = property_type & ~VT_VECTOR
            log.debug('property_type == VT_VECTOR | %s' % VT.get(property_type_base, 'UNKNOWN'))
            off = 4
            count = i32(s, offset)
            values = []
            for _ in range(count):
                v, sz = self._parse_property_basic(s, offset + off, property_id, property_type & ~VT_VECTOR, convert_time, no_conversion)
                values.append(v)
                off += sz
            v = values
        else:
            log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
        return v
    def _parse_property_basic(self, s, offset, property_id, property_type, convert_time, no_conversion):
        """
        Parse one scalar property value. (_parse_property helper)

        :param s: bytes, the whole section buffer
        :param offset: int, offset of the raw value within s
        :param property_id: int, property identifier (for logging/conversion)
        :param property_type: int, scalar variant type (VT_*)
        :param convert_time: bool, convert FILETIME values to datetime
        :param no_conversion: list of int, property ids not to be converted
        :returns: tuple (value, size) where size is the number of bytes
            consumed by the value (0 if the type is not implemented)
        """
        value = None
        size = 0
        # test for common types first (should perhaps use
        # a dictionary instead?)

        if property_type == VT_I2: # 16-bit signed integer
            value = i16(s, offset)
            if value >= 32768:
                # convert unsigned reading to signed 16-bit value:
                value = value - 65536
            size = 2
        elif property_type == VT_UI2: # 2-byte unsigned integer
            value = i16(s, offset)
            size = 2
        elif property_type in (VT_I4, VT_INT, VT_ERROR):
            # VT_I4: 32-bit signed integer
            # VT_ERROR: HRESULT, similar to 32-bit signed integer,
            # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
            value = i32(s, offset)
            size = 4
        elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
            value = i32(s, offset) # FIXME
            size = 4
        elif property_type in (VT_BSTR, VT_LPSTR):
            # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
            # size is a 32 bits integer, including the null terminator, and
            # possibly trailing or embedded null chars
            #TODO: if codepage is unicode, the string should be converted as such
            count = i32(s, offset)
            value = s[offset+4:offset+4+count-1]
            # remove all null chars:
            value = value.replace(b'\x00', b'')
            size = 4 + count
        elif property_type == VT_BLOB:
            # binary large object (BLOB)
            # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
            count = i32(s, offset)
            value = s[offset+4:offset+4+count]
            size = 4 + count
        elif property_type == VT_LPWSTR:
            # UnicodeString
            # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
            # "the string should NOT contain embedded or additional trailing
            # null characters."
            count = i32(s, offset+4)
            value = self._decode_utf16_str(s[offset+4:offset+4+count*2])
            size = 4 + count * 2
        elif property_type == VT_FILETIME:
            value = long(i32(s, offset)) + (long(i32(s, offset+4))<<32)
            # FILETIME is a 64-bit int: "number of 100ns periods
            # since Jan 1,1601".
            if convert_time and property_id not in no_conversion:
                log.debug('Converting property #%d to python datetime, value=%d=%fs'
                        %(property_id, value, float(value)/10000000))
                # convert FILETIME to Python datetime.datetime
                # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
                _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
                log.debug('timedelta days=%d' % (value//(10*1000000*3600*24)))
                value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10)
            else:
                # legacy code kept for backward compatibility: returns a
                # number of seconds since Jan 1,1601
                value = value // 10000000 # seconds
            size = 8
        elif property_type == VT_UI1: # 1-byte unsigned integer
            value = i8(s[offset])
            size = 1
        elif property_type == VT_CLSID:
            value = _clsid(s[offset:offset+16])
            size = 16
        elif property_type == VT_CF:
            # PropertyIdentifier or ClipboardData??
            # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
            count = i32(s, offset)
            value = s[offset+4:offset+4+count]
            size = 4 + count
        elif property_type == VT_BOOL:
            # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
            # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
            value = bool(i16(s, offset))
            size = 2
        else:
            value = None # everything else yields "None"
            log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))

        # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
        # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
        # see https://msdn.microsoft.com/en-us/library/dd942033.aspx

        #print("%08x" % property_id, repr(value), end=" ")
        #print("(%s)" % VT[i32(s, offset) & 0xFFF])
        return value, size
2341 def get_metadata(self):
2342 """
2343 Parse standard properties streams, return an OleMetadata object
2344 containing all the available metadata.
2345 (also stored in the metadata attribute of the OleFileIO object)
2347 new in version 0.25
2348 """
2349 self.metadata = OleMetadata()
2350 self.metadata.parse_properties(self)
2351 return self.metadata
def get_userdefined_properties(self, filename, convert_time=False, no_conversion=None):
    """
    Return user-defined properties described in the given substream.

    :param filename: path of stream in storage tree (see openstream for syntax)
    :param convert_time: bool, if True timestamps will be converted to Python datetime
    :param no_conversion: None or list of int, timestamps not to be converted
        (for example total editing time is not a real timestamp)

    :returns: list of dicts, one per named property, each with a
        'property_name' key (str) and a 'value' key (parsed value, or None
        when the type is not handled). (Fixed doc: this method returns a
        list, not "a dictionary of values indexed by id".)
    """
    # REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
    # REFERENCE: https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-oshared/2ea8be67-a4a0-4e2e-b42f-49a182645562
    # 'D5CDD502-2E9C-101B-9397-08002B2CF9AE'
    # TODO: testing the code more rigorously
    # TODO: adding exception handling
    FMTID_USERDEFINED_PROPERTIES = _clsid(b'\x05\xD5\xCD\xD5\x9C\x2E\x1B\x10\x93\x97\x08\x00\x2B\x2C\xF9\xAE')

    # make sure no_conversion is a list, just to simplify code below:
    # (fix: compare against None with "is", not "==")
    if no_conversion is None:
        no_conversion = []
    # stream path as a string to report exceptions:
    streampath = filename
    if not isinstance(streampath, str):
        streampath = '/'.join(streampath)

    fp = self.openstream(filename)

    data = []

    # header
    s = fp.read(28)
    clsid = _clsid(s[8:24])

    # PropertySetStream.cSections (4 bytes starts at 1c): number of property sets in this stream
    sections_count = i32(s, 24)

    section_file_pointers = []

    try:
        for i in range(sections_count):
            # format id
            s = fp.read(20)
            fmtid = _clsid(s[:16])

            if fmtid == FMTID_USERDEFINED_PROPERTIES:
                file_pointer = i32(s, 16)
                fp.seek(file_pointer)
                # read saved sections; prepend 4 filler bytes so that the
                # offsets stored in the stream (relative to the section
                # start) can be used directly as indexes into s:
                s = b"****" + fp.read(i32(fp.read(4)) - 4)
                # number of properties:
                num_props = i32(s, 4)

                PropertyIdentifierAndOffset = s[8: 8+8*num_props]

                # property names (dictionary)
                # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/99127b7f-c440-4697-91a4-c853086d6b33
                index = 8+8*num_props
                entry_count = i32(s[index: index+4])
                index += 4
                for i in range(entry_count):
                    identifier = s[index: index+4]
                    str_size = i32(s[index+4: index+8])
                    string = s[index+8: index+8+str_size].decode('utf_8').strip('\0')
                    data.append({'property_name': string, 'value': None})
                    index = index+8+str_size
                # clamp num_props based on the data length
                num_props = min(num_props, int(len(s) / 8))

                # property values
                # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/f122b9d7-e5cf-4484-8466-83f6fd94b3cc
                # NOTE(review): starts at 2, presumably to skip the dictionary
                # (id 0) and codepage (id 1) entries; data[i-2] pairs each
                # value with the name collected above — TODO confirm vs spec
                for i in iterrange(2, num_props):
                    property_id = 0  # just in case of an exception
                    try:
                        property_id = i32(s, 8 + i * 8)
                        offset = i32(s, 12 + i * 8)
                        property_type = i32(s, offset)

                        vt_name = VT.get(property_type, 'UNKNOWN')
                        log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))

                        # test for common types first (should perhaps use
                        # a dictionary instead?)
                        if property_type == VT_I2:  # 16-bit signed integer
                            value = i16(s, offset + 4)
                            if value >= 32768:
                                value = value - 65536
                        elif property_type == 1:
                            # supposed to be VT_NULL but seems it is not NULL
                            str_size = i32(s, offset + 8)
                            value = s[offset + 12:offset + 12 + str_size - 1]
                        elif property_type == VT_UI2:  # 2-byte unsigned integer
                            value = i16(s, offset + 4)
                        elif property_type in (VT_I4, VT_INT, VT_ERROR):
                            # VT_I4: 32-bit signed integer
                            # VT_ERROR: HRESULT, similar to 32-bit signed integer,
                            # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
                            value = i32(s, offset + 4)
                        elif property_type in (VT_UI4, VT_UINT):  # 4-byte unsigned integer
                            value = i32(s, offset + 4)  # FIXME
                        elif property_type in (VT_BSTR, VT_LPSTR):
                            # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
                            # size is a 32 bits integer, including the null terminator, and
                            # possibly trailing or embedded null chars
                            # TODO: if codepage is unicode, the string should be converted as such
                            count = i32(s, offset + 4)
                            value = s[offset + 8:offset + 8 + count - 1]
                            # remove all null chars:
                            value = value.replace(b'\x00', b'')
                        elif property_type == VT_BLOB:
                            # binary large object (BLOB)
                            # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
                            count = i32(s, offset + 4)
                            value = s[offset + 8:offset + 8 + count]
                        elif property_type == VT_LPWSTR:
                            # UnicodeString
                            # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
                            # "the string should NOT contain embedded or additional trailing
                            # null characters."
                            count = i32(s, offset + 4)
                            value = self._decode_utf16_str(s[offset + 8:offset + 8 + count * 2])
                        elif property_type == VT_FILETIME:
                            value = long(i32(s, offset + 4)) + (long(i32(s, offset + 8)) << 32)
                            # FILETIME is a 64-bit int: "number of 100ns periods
                            # since Jan 1,1601".
                            if convert_time and property_id not in no_conversion:
                                log.debug('Converting property #%d to python datetime, value=%d=%fs'
                                          % (property_id, value, float(value) / 10000000))
                                # convert FILETIME to Python datetime.datetime
                                # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
                                _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
                                log.debug('timedelta days=%d' % (value // (10 * 1000000 * 3600 * 24)))
                                value = _FILETIME_null_date + datetime.timedelta(microseconds=value // 10)
                            else:
                                # legacy code kept for backward compatibility: returns a
                                # number of seconds since Jan 1,1601
                                value = value // 10000000  # seconds
                        elif property_type == VT_UI1:  # 1-byte unsigned integer
                            value = i8(s[offset + 4])
                        elif property_type == VT_CLSID:
                            value = _clsid(s[offset + 4:offset + 20])
                        elif property_type == VT_CF:
                            # PropertyIdentifier or ClipboardData??
                            # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
                            count = i32(s, offset + 4)
                            value = s[offset + 8:offset + 8 + count]
                        elif property_type == VT_BOOL:
                            # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
                            # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
                            value = bool(i16(s, offset + 4))
                        else:
                            value = None  # everything else yields "None"
                            log.debug(
                                'property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))

                        # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
                        # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
                        # see https://msdn.microsoft.com/en-us/library/dd942033.aspx

                        # FIXME: add support for VT_VECTOR
                        # VT_VECTOR is a 32 uint giving the number of items, followed by
                        # the items in sequence. The VT_VECTOR value is combined with the
                        # type of items, e.g. VT_VECTOR|VT_BSTR
                        # see https://msdn.microsoft.com/en-us/library/dd942011.aspx

                        data[i-2]['value'] = value
                    except BaseException as exc:
                        # catch exception while parsing each property, and only raise
                        # a DEFECT_INCORRECT, because parsing can go on
                        msg = 'Error while parsing property id %d in stream %s: %s' % (
                            property_id, repr(streampath), exc)
                        self._raise_defect(DEFECT_INCORRECT, msg, type(exc))

    except BaseException as exc:
        # catch exception while parsing property header, and only raise
        # a DEFECT_INCORRECT then return the partial data, because this is
        # not a fatal error when parsing the whole file
        msg = 'Error while parsing properties header in stream %s: %s' % (
            repr(streampath), exc)
        self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
        return data

    return data
2543# --------------------------------------------------------------------
2544# This script can be used to dump the directory of any OLE2 structured
2545# storage file.
def main():
    """
    Main function when olefile runs as a script from the command line.
    This will open an OLE2 file and display its structure and properties
    :return: nothing
    """
    import sys, optparse  # NOTE: optparse is deprecated, kept for compatibility

    DEFAULT_LOG_LEVEL = "warning"  # Default log level
    LOG_LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)

    parser.add_option("-c", action="store_true", dest="check_streams",
                      help='check all streams (for debugging purposes)')
    # fix: typo "propertires" in the help text
    parser.add_option("-p", action="store_true", dest="extract_customprop",
                      help='extract all user-defined properties')
    parser.add_option("-d", action="store_true", dest="debug_mode",
                      help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    print('olefile version {} {} - https://www.decalage.info/en/olefile\n'.format(__version__, __date__))

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    if options.debug_mode:
        options.loglevel = 'debug'

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')
    # also enable the module's logger:
    enable_logging()

    for filename in args:
        try:
            ole = OleFileIO(filename)  # , raise_defects=DEFECT_INCORRECT)
            print("-" * 68)
            print(filename)
            print("-" * 68)
            ole.dumpdirectory()
            for streamname in ole.listdir():
                # property streams conventionally start with a \x05 character:
                if streamname[-1][0] == "\005":
                    print("%r: properties" % streamname)
                    try:
                        props = ole.getproperties(streamname, convert_time=True)
                        props = sorted(props.items())
                        for k, v in props:
                            # [PL]: avoid to display too large or binary values:
                            if isinstance(v, (basestring, bytes)):
                                if len(v) > 50:
                                    v = v[:50]
                            if isinstance(v, bytes):
                                # quick and dirty binary check:
                                for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
                                          21,22,23,24,25,26,27,28,29,30,31):
                                    if c in bytearray(v):
                                        v = '(binary data)'
                                        break
                            print(" ", k, v)
                    except Exception:
                        log.exception('Error while parsing property stream %r' % streamname)

                    try:
                        if options.extract_customprop:
                            variables = ole.get_userdefined_properties(streamname, convert_time=True)
                            if len(variables):
                                print("%r: user-defined properties" % streamname)
                                for index, variable in enumerate(variables):
                                    print('\t{} {}: {}'.format(index, variable['property_name'], variable['value']))
                    except Exception:
                        # fix: was a bare "except:", which also swallowed
                        # KeyboardInterrupt/SystemExit; narrowed to Exception
                        # to match the other handlers in this function
                        log.exception('Error while parsing user-defined property stream %r' % streamname)

            if options.check_streams:
                # Read all streams to check if there are errors:
                print('\nChecking streams...')
                for streamname in ole.listdir():
                    # print name using repr() to convert binary chars to \xNN:
                    print('-', repr('/'.join(streamname)), '-', end=' ')
                    st_type = ole.get_type(streamname)
                    if st_type == STGTY_STREAM:
                        print('size %d' % ole.get_size(streamname))
                        # just try to read stream in memory:
                        ole.openstream(streamname)
                    else:
                        print('NOT a stream : type=%d' % st_type)
                print()

            print('Modification/Creation times of all directory entries:')
            for entry in ole.direntries:
                if entry is not None:
                    print('- {}: mtime={} ctime={}'.format(entry.name,
                                                           entry.getmtime(), entry.getctime()))
            print()

            # parse and display metadata:
            try:
                meta = ole.get_metadata()
                meta.dump()
            except Exception:
                log.exception('Error while parsing metadata')
            print()
            # [PL] Test a few new methods:
            root = ole.get_rootentry_name()
            print('Root entry name: "%s"' % root)
            if ole.exists('worddocument'):
                print("This is a Word document.")
                print("type of stream 'WordDocument':", ole.get_type('worddocument'))
                print("size :", ole.get_size('worddocument'))
                if ole.exists('macros/vba'):
                    print("This document may contain VBA macros.")

            # print parsing issues:
            print('\nNon-fatal issues raised during parsing:')
            if ole.parsing_issues:
                for exctype, msg in ole.parsing_issues:
                    print('- {}: {}'.format(exctype.__name__, msg))
            else:
                print('None')
            ole.close()
        except Exception:
            log.exception('Error while parsing file %r' % filename)
# Standard entry-point guard: run the command-line tool only when this
# module is executed directly, not when it is imported as a library.
if __name__ == "__main__":
    main()
2696# this code was developed while listening to The Wedding Present "Sea Monsters"