Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/olefile/olefile.py: 42%
1186 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:37 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:37 +0000
1"""
2olefile (formerly OleFileIO_PL)
4Module to read/write Microsoft OLE2 files (also called Structured Storage or
5Microsoft Compound Document File Format), such as Microsoft Office 97-2003
6documents, Image Composer and FlashPix files, Outlook messages, ...
7This version is compatible with Python 2.7 and 3.5+
9Project website: https://www.decalage.info/olefile
11olefile is copyright (c) 2005-2020 Philippe Lagadec
12(https://www.decalage.info)
14olefile is based on the OleFileIO module from the PIL library v1.1.7
15See: http://www.pythonware.com/products/pil/index.htm
16and http://svn.effbot.org/public/tags/pil-1.1.7/PIL/OleFileIO.py
18The Python Imaging Library (PIL) is
19Copyright (c) 1997-2009 by Secret Labs AB
20Copyright (c) 1995-2009 by Fredrik Lundh
22See source code and LICENSE.txt for information on usage and redistribution.
23"""
25# Since olefile v0.47, only Python 2.7 and 3.5+ are supported
26# This import enables print() as a function rather than a keyword
27# (main requirement to be compatible with Python 3.x)
28# The comment on the line below should be printed on Python 2.5 or older:
29from __future__ import print_function # This version of olefile requires Python 2.7 or 3.5+.
32#--- LICENSE ------------------------------------------------------------------
34# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2020 Philippe Lagadec
35# (https://www.decalage.info)
36#
37# All rights reserved.
38#
39# Redistribution and use in source and binary forms, with or without modification,
40# are permitted provided that the following conditions are met:
41#
42# * Redistributions of source code must retain the above copyright notice, this
43# list of conditions and the following disclaimer.
44# * Redistributions in binary form must reproduce the above copyright notice,
45# this list of conditions and the following disclaimer in the documentation
46# and/or other materials provided with the distribution.
47#
48# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
49# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
50# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
51# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
52# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
54# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
55# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
56# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
57# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59# ----------
60# PIL License:
61#
62# olefile is based on source code from the OleFileIO module of the Python
63# Imaging Library (PIL) published by Fredrik Lundh under the following license:
65# The Python Imaging Library (PIL) is
66# Copyright (c) 1997-2009 by Secret Labs AB
67# Copyright (c) 1995-2009 by Fredrik Lundh
68#
69# By obtaining, using, and/or copying this software and/or its associated
70# documentation, you agree that you have read, understood, and will comply with
71# the following terms and conditions:
72#
73# Permission to use, copy, modify, and distribute this software and its
74# associated documentation for any purpose and without fee is hereby granted,
75# provided that the above copyright notice appears in all copies, and that both
76# that copyright notice and this permission notice appear in supporting
77# documentation, and that the name of Secret Labs AB or the author(s) not be used
78# in advertising or publicity pertaining to distribution of the software
79# without specific, written prior permission.
80#
81# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
82# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
83# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL,
84# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
85# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
86# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
87# PERFORMANCE OF THIS SOFTWARE.
89__date__ = "2020-10-07"
90__version__ = '0.47.dev4'
91__author__ = "Philippe Lagadec"
93__all__ = ['isOleFile', 'OleFileIO', 'OleMetadata', 'enable_logging',
94 'MAGIC', 'STGTY_EMPTY',
95 'STGTY_STREAM', 'STGTY_STORAGE', 'STGTY_ROOT', 'STGTY_PROPERTY',
96 'STGTY_LOCKBYTES', 'MINIMAL_OLEFILE_SIZE',
97 'DEFECT_UNSURE', 'DEFECT_POTENTIAL', 'DEFECT_INCORRECT',
98 'DEFECT_FATAL', 'DEFAULT_PATH_ENCODING',
99 'MAXREGSECT', 'DIFSECT', 'FATSECT', 'ENDOFCHAIN', 'FREESECT',
100 'MAXREGSID', 'NOSTREAM', 'UNKNOWN_SIZE', 'WORD_CLSID',
101 'OleFileIONotClosed'
102]
104import io
105import sys
106import struct, array, os.path, datetime, logging, warnings, traceback
#=== COMPATIBILITY WORKAROUNDS ================================================

# For Python 3.x, need to redefine long as int:
# (on Python 3, str is not bytes, which distinguishes it from Python 2)
if str is not bytes:
    long = int

# Need to make sure we use xrange both on Python 2 and 3.x:
try:
    # on Python 2 we need xrange:
    iterrange = xrange
except Exception:
    # no xrange, for Python 3 it was renamed as range:
    iterrange = range

# [PL] workaround to fix an issue with array item size on 64 bits systems:
# pick whichever array typecode actually stores 32-bit items on this platform,
# so that FAT sector indexes are always read as 32-bit values:
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
elif array.array('i').itemsize == 4:
    # On 64 bit Jython, signed integers ('i') are the only way to store our 32
    # bit values in an array in a *somewhat* reasonable way, as the otherwise
    # perfectly suited 'H' (unsigned int, 32 bits) results in a completely
    # unusable behaviour. This is most likely caused by the fact that Java
    # doesn't have unsigned values, and thus Jython's "array" implementation,
    # which is based on "jarray", doesn't have them either.
    # NOTE: to trick Jython into converting the values it would normally
    # interpret as "signed" into "unsigned", a binary-and operation with
    # 0xFFFFFFFF can be used. This way it is possible to use the same comparing
    # operations on all platforms / implementations. The corresponding code
    # lines are flagged with a 'JYTHON-WORKAROUND' tag below.
    UINT32 = 'i'
else:
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')


# [PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
# TODO: remove the use of basestring, as it was removed in Python 3
# On Python 3, basestring does not exist, so alias it to str:
try:
    basestring
except NameError:
    basestring = str

if sys.version_info[0] < 3:
    # On Python 2.x, the default encoding for path names is UTF-8:
    DEFAULT_PATH_ENCODING = 'utf-8'
else:
    # On Python 3.x, the default encoding for path names is Unicode (None):
    DEFAULT_PATH_ENCODING = None
162# === LOGGING =================================================================
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger object suitable for this module, without touching the
    settings of the root logger (so other modules' logs do not show up on
    the screen).

    If a logger with the same name already exists it is reused, because
    creating it again would attach a second handler and every message
    would then appear twice.

    The default level is CRITICAL+1, which effectively disables all logging
    until the application (or enable_logging) lowers it.

    :param name: name of the logger (usually the module name)
    :param level: logging level for the returned logger
    :returns: logging.Logger instance
    """
    # NOTE: another less intrusive but more "hackish" way to detect an
    # existing logger would be getLogger + testing its effective level.
    already_registered = name in logging.Logger.manager.loggerDict
    new_logger = logging.getLogger(name)
    if not already_registered:
        # a brand new logger only gets a NullHandler: it is up to the
        # application to configure its own logging handlers:
        new_logger.addHandler(logging.NullHandler())
    # make sure the requested level is applied in both cases:
    new_logger.setLevel(level)
    return new_logger
# a global logger object used for debugging:
# (silent by default because of the CRITICAL+1 level; see enable_logging)
log = get_logger('olefile')
def enable_logging():
    """
    Enable logging for this module (disabled by default).

    This will set the module-specific logger level to NOTSET, which
    means the main application controls the actual logging level
    through its own logging configuration.
    """
    log.setLevel(logging.NOTSET)
#=== CONSTANTS ===============================================================

#: magic bytes that should be at the beginning of every OLE file:
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'

# [PL]: added constants for Sector IDs (from AAF specifications)
MAXREGSECT = 0xFFFFFFFA #: (-6) maximum SECT
DIFSECT = 0xFFFFFFFC #: (-4) denotes a DIFAT sector in a FAT
FATSECT = 0xFFFFFFFD #: (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE #: (-2) end of a virtual stream chain
FREESECT = 0xFFFFFFFF #: (-1) unallocated sector

# [PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID = 0xFFFFFFFA #: (-6) maximum directory entry ID
NOSTREAM = 0xFFFFFFFF #: (-1) unallocated directory entry

# [PL] object types in storage (from AAF specifications)
STGTY_EMPTY = 0 #: empty directory entry
STGTY_STORAGE = 1 #: element is a storage object
STGTY_STREAM = 2 #: element is a stream object
STGTY_LOCKBYTES = 3 #: element is an ILockBytes object
STGTY_PROPERTY = 4 #: element is an IPropertyStorage object
STGTY_ROOT = 5 #: element is a root storage

# Unknown size for a stream (used by OleStream):
UNKNOWN_SIZE = 0x7FFFFFFF

#
# --------------------------------------------------------------------
# property types (VT_* codes used in SummaryInformation property sets)

VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6;
VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11;
VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17;
VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23;
VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28;
VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64;
VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68;
VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72;
VT_VECTOR=0x1000;

# map property id to name (for debugging purposes)
# Note: vars() here is the module namespace; list() makes a snapshot so the
# dict is not modified while iterating (VT itself is being added to it).
VT = {}
for keyword, var in list(vars().items()):
    if keyword[:3] == "VT_":
        VT[var] = keyword

#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
# TODO: check Excel, PPT, ...

# [PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20 # a potential defect
DEFECT_INCORRECT = 30 # an error according to specifications, but parsing
                      # can go on
DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is
                  # impossible

# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFileIO.open)
MINIMAL_OLEFILE_SIZE = 1536
270#=== FUNCTIONS ===============================================================
def isOleFile(filename):
    """
    Test if a file is an OLE container (according to the magic bytes in its header).

    .. note::
        This function only checks the first 8 bytes of the file, not the
        rest of the OLE structure.

    .. versionadded:: 0.16

    :param filename: filename, contents or file-like object of the OLE file (string-like or file-like object)

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read and seek methods),
          it is parsed as-is.

    :type filename: bytes or str or unicode or file
    :returns: True if OLE, False otherwise.
    :rtype: bool
    """
    # check if filename is a string-like or file-like object:
    if hasattr(filename, 'read'):
        # file-like object: use it directly
        header = filename.read(len(MAGIC))
        # just in case, seek back to start of file:
        filename.seek(0)
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # filename is a bytes string containing the OLE file to be parsed:
        header = filename[:len(MAGIC)]
    else:
        # string-like object: filename of file on disk
        with open(filename, 'rb') as fp:
            header = fp.read(len(MAGIC))
    # idiomatic: return the comparison result directly instead of an
    # if/else that returns True/False explicitly:
    return header == MAGIC
# i8 extracts a single byte value (0-255) from a bytes string.
# The implementation differs between Python 2 and 3 because indexing a
# Python 2 str yields a 1-char string, while indexing Python 3 bytes
# yields an int directly.
if bytes is str:
    # version for Python 2.x
    def i8(c):
        # c is a 1-character string: convert it to its byte value
        return ord(c)
else:
    # version for Python 3.x
    def i8(c):
        # c may already be an int (from indexing bytes), else take its
        # first byte:
        return c if c.__class__ is int else c[0]
def i16(c, o = 0):
    """
    Convert 2 bytes (16 bits) at offset *o* in *c* to an unsigned integer,
    using little-endian byte order.

    :param c: string/bytes containing the bytes to convert
    :param o: offset of the bytes to convert in c
    :returns: int
    """
    raw = c[o:o+2]
    (value,) = struct.unpack("<H", raw)
    return value
def i32(c, o = 0):
    """
    Convert 4 bytes (32 bits) at offset *o* in *c* to an unsigned integer,
    using little-endian byte order.

    :param c: string/bytes containing the bytes to convert
    :param o: offset of the bytes to convert in c
    :returns: int
    """
    chunk = c[o:o+4]
    (value,) = struct.unpack("<I", chunk)
    return value
def _clsid(clsid):
    """
    Convert a 16-byte CLSID to its human-readable GUID string form
    (e.g. "00020900-0000-0000-C000-000000000046").

    :param clsid: string of length 16.
    :returns: str, empty string if the CLSID is only null bytes.
    """
    assert len(clsid) == 16
    # a CLSID made only of null bytes is rendered as an empty string:
    # (PL: why not simply return the string with zeroes?)
    if not clsid.strip(b"\0"):
        return ""
    # first three fields are little-endian integers, the last 8 bytes are
    # printed as-is:
    fmt = "%08X-%04X-%04X-%02X%02X-" + "%02X" * 6
    fields = (i32(clsid, 0), i16(clsid, 4), i16(clsid, 6))
    fields += tuple(map(i8, clsid[8:16]))
    return fmt % fields
def filetime2datetime(filetime):
    """
    Convert a FILETIME value (64 bits int, number of 100ns units since
    1601-01-01) to a Python datetime.datetime object.
    """
    # TODO: manage exception when microseconds is too large
    # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
    # the FILETIME epoch is January 1st, 1601:
    epoch = datetime.datetime(1601, 1, 1, 0, 0, 0)
    # one FILETIME unit is 100ns, i.e. 1/10 of a microsecond:
    return epoch + datetime.timedelta(microseconds=filetime // 10)
373#=== CLASSES ==================================================================
class OleFileError(IOError):
    """
    Base class for all errors raised by this module.
    Inherits from IOError for backward compatibility with older callers.
    """
    pass
class NotOleFileError(OleFileError):
    """
    Raised when the opened file is not an OLE container.
    """
    pass
class OleMetadata:
    """
    Parse and store metadata from the standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd942545.aspx
    - https://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - https://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/articles/summary-information-stream/
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    New in version 0.25
    """

    # attribute names for SummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
                       'template', 'last_saved_by', 'revision_number', 'total_edit_time',
                       'last_printed', 'create_time', 'last_saved_time', 'num_pages',
                       'num_words', 'num_chars', 'thumbnail', 'creating_application',
                       'security']

    # attribute names for DocumentSummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
                      'slides', 'notes', 'hidden_slides', 'mm_clips',
                      'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
                      'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
                      'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
                      'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Constructor for OleMetadata.
        All attributes are set to None by default.
        """
        # every known property attribute starts out as None until
        # parse_properties fills it in:
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)

    def parse_properties(self, ole_file):
        """
        Parse standard properties of an OLE file, from the streams
        ``\\x05SummaryInformation`` and ``\\x05DocumentSummaryInformation``,
        if present.
        Properties are converted to strings, integers or python datetime objects.
        If a property is not present, its value is set to None.

        :param ole_file: OleFileIO object from which to parse properties
        """
        # first reset all attributes to None:
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)
        if ole_file.exists("\x05SummaryInformation"):
            # get properties from the stream:
            # (converting timestamps to python datetime, except total_edit_time,
            # which is property #10)
            props = ole_file.getproperties("\x05SummaryInformation",
                                           convert_time=True, no_conversion=[10])
            # ids for standard properties start at 0x01:
            for prop_id, attrib in enumerate(self.SUMMARY_ATTRIBS, start=1):
                setattr(self, attrib, props.get(prop_id, None))
        if ole_file.exists("\x05DocumentSummaryInformation"):
            # get properties from the stream:
            props = ole_file.getproperties("\x05DocumentSummaryInformation",
                                           convert_time=True)
            # ids for standard properties start at 0x01:
            for prop_id, attrib in enumerate(self.DOCSUM_ATTRIBS, start=1):
                setattr(self, attrib, props.get(prop_id, None))

    def dump(self):
        """
        Dump all metadata, for debugging purposes.
        """
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            print('- {}: {}'.format(prop, repr(getattr(self, prop))))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            print('- {}: {}'.format(prop, repr(getattr(self, prop))))
class OleFileIONotClosed(RuntimeWarning):
    """
    Warning emitted when an OleFileIO instance is destructed while its
    file handle is still open.
    """

    def __init__(self, stack_of_open=None):
        """
        :param stack_of_open: optional stack summary (with a format() method)
            captured when open() was called; included in the message if set.
        """
        super(OleFileIONotClosed, self).__init__()
        self.stack_of_open = stack_of_open

    def __str__(self):
        msg = ('Deleting OleFileIO instance with open file handle. '
               'You should ensure that OleFileIO is never deleted '
               'without calling close() first. Consider using '
               '"with OleFileIO(...) as ole: ...".')
        if not self.stack_of_open:
            return msg
        # append the stacktrace of the original open() call for debugging:
        parts = [msg, '\n', 'Stacktrace of open() call:\n']
        parts.extend(self.stack_of_open.format())
        return ''.join(parts)
561# --- OleStream ---------------------------------------------------------------
class OleStream(io.BytesIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the BytesIO class).
    To open a stream, use the openstream method in the OleFileIO class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:

        - size: actual size of data stream, after it was opened.
    """
    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize, olefileio):
        """
        Constructor for OleStream class: reads the whole stream into memory
        by following the FAT (or MiniFAT) sector chain starting at sect.

        :param fp: file object, the OLE container or the MiniFAT stream
        :param sect: sector index of first sector in the stream
        :param size: total size of the stream (may be UNKNOWN_SIZE)
        :param offset: offset in bytes for the first FAT or MiniFAT sector
        :param sectorsize: size of one sector
        :param fat: array/list of sector indexes (FAT or MiniFAT)
        :param filesize: size of OLE file (for debugging)
        :param olefileio: OleFileIO object containing this stream
        :returns: a BytesIO instance containing the OLE stream
        """
        log.debug('OleStream.__init__:')
        log.debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        self.ole = olefileio
        # this check is necessary, otherwise when attempting to open a stream
        # from a closed OleFileIO, a stream of size zero is returned without
        # raising an exception. (see issue #81)
        if self.ole.fp.closed:
            raise OSError('Attempting to open a stream from a closed OLE File')
        # [PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size == UNKNOWN_SIZE:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            log.debug(' stream with UNKNOWN SIZE')
        # number of sectors = size rounded up to a whole number of sectors:
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            self.ole._raise_defect(DEFECT_INCORRECT, 'malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            log.debug('size == 0 and sect != ENDOFCHAIN:')
            self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE sector index for empty stream')
        # [PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks: a FAT loop can never read more than
        # nb_sectors sectors.
        for i in range(nb_sectors):
            log.debug('Reading stream sector[%d] = %Xh' % (i, sect))
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    log.debug('Reached ENDOFCHAIN sector for stream with unknown size')
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE stream')
                    log.debug('sect=ENDOFCHAIN before expected size')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
##                tmp_data = b"".join(data)
##                f = open('test_debug.bin', 'wb')
##                f.write(tmp_data)
##                f.close()
##                log.debug('data read so far: %d bytes' % len(tmp_data))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
            # TODO: merge this code with OleFileIO.getsect() ?
            # TODO: check if this works with 4K sectors:
            try:
                # seek to the start of the sector within the container:
                fp.seek(offset + sectorsize * sect)
            except Exception:
                log.debug('sect=%d, seek=%d, filesize=%d' %
                          (sect, offset+sectorsize*sect, filesize))
                self.ole._raise_defect(DEFECT_INCORRECT, 'OLE sector index out of range')
                # stop reading here if the exception is ignored:
                break
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                          (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                sect = fat[sect] & 0xFFFFFFFF  # JYTHON-WORKAROUND
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
        # [PL] Last sector should be a "end of chain" marker:
        # if sect != ENDOFCHAIN:
        #     raise IOError('incorrect last sector index in OLE stream')
        data = b"".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            log.debug('Read data of length %d, truncated to stream size %d' % (len(data), size))
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            log.debug('Read data of length %d, the stream size was unknown' % len(data))
            self.size = len(data)
        else:
            # read data is less than expected:
            log.debug('Read data of length %d, less than expected stream size %d' % (len(data), size))
            # TODO: provide details in exception message
            self.size = len(data)
            self.ole._raise_defect(DEFECT_INCORRECT, 'OLE stream size is less than declared')
        # when all data is read in memory, BytesIO constructor is called
        io.BytesIO.__init__(self, data)
        # Then the OleStream object can be used as a read-only file object.
711# --- OleDirectoryEntry -------------------------------------------------------
713class OleDirectoryEntry:
714 """
715 OLE2 Directory Entry pointing to a stream or a storage
716 """
717 # struct to parse directory entries:
718 # <: little-endian byte order, standard sizes
719 # (note: this should guarantee that Q returns a 64 bits int)
720 # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
721 # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
722 # B: uint8, dir entry type (between 0 and 5)
723 # B: uint8, color: 0=black, 1=red
724 # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
725 # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
726 # I: uint32, index of child root node if it is a storage, else NOSTREAM
727 # 16s: CLSID, unique identifier (only used if it is a storage)
728 # I: uint32, user flags
729 # Q (was 8s): uint64, creation timestamp or zero
730 # Q (was 8s): uint64, modification timestamp or zero
731 # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
732 # of stream containing ministreams if root entry, 0 otherwise
733 # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
734 # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
735 STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII'
736 # size of a directory entry: 128 bytes
737 DIRENTRY_SIZE = 128
738 assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE
def __init__(self, entry, sid, ole_file):
    """
    Constructor for an OleDirectoryEntry object.
    Parses a 128-bytes entry from the OLE Directory stream.

    :param bytes entry: bytes string (must be 128 bytes long)
    :param int sid: index of this directory entry in the OLE file directory
    :param OleFileIO ole_file: OleFileIO object containing this directory entry
    """
    self.sid = sid
    # ref to ole_file is stored for future use
    self.olefile = ole_file
    # kids is a list of children entries, if this entry is a storage:
    # (list of OleDirectoryEntry objects)
    self.kids = []
    # kids_dict is a dictionary of children entries, indexed by their
    # name in lowercase: used to quickly find an entry, and to detect
    # duplicates
    self.kids_dict = {}
    # flag used to detect if the entry is referenced more than once in
    # directory:
    self.used = False
    # decode DirEntry (see STRUCT_DIRENTRY for the field layout)
    (
        self.name_raw,   # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
        self.namelength, # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
        self.entry_type,
        self.color,
        self.sid_left,
        self.sid_right,
        self.sid_child,
        clsid,
        self.dwUserFlags,
        self.createTime,
        self.modifyTime,
        self.isectStart,
        self.sizeLow,
        self.sizeHigh
    ) = struct.unpack(OleDirectoryEntry.STRUCT_DIRENTRY, entry)
    if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
        ole_file._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
    # only first directory entry can (and should) be root:
    if self.entry_type == STGTY_ROOT and sid != 0:
        ole_file._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
    if sid == 0 and self.entry_type != STGTY_ROOT:
        ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
    # log.debug(struct.unpack(fmt_entry, entry[:len_entry]))
    # name should be at most 31 unicode characters + null character,
    # so 64 bytes in total (31*2 + 2):
    if self.namelength > 64:
        ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes')
        # if exception not raised, namelength is set to the maximum value:
        self.namelength = 64
    # only characters without ending null char are kept:
    # (namelength includes the terminating null, hence the -2)
    self.name_utf16 = self.name_raw[:(self.namelength-2)]
    # TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
    # TODO: check if the name does not contain forbidden characters:
    # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
    # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
    self.name = ole_file._decode_utf16_str(self.name_utf16)

    log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
    log.debug(' - type: %d' % self.entry_type)
    log.debug(' - sect: %Xh' % self.isectStart)
    log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
        self.sid_right, self.sid_child))

    # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
    # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
    # or some other value so it cannot be raised as a defect in general:
    if ole_file.sectorsize == 512:
        if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF:
            log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                (ole_file.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh))
            ole_file._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
        self.size = self.sizeLow
    else:
        # NOTE(review): long() appears to be a py2/py3 compatibility alias
        # defined earlier in this module -- TODO confirm
        self.size = self.sizeLow + (long(self.sizeHigh)<<32)
    log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh))

    self.clsid = _clsid(clsid)
    # a storage should have a null size, BUT some implementations such as
    # Word 8 for Mac seem to allow non-null values => Potential defect:
    if self.entry_type == STGTY_STORAGE and self.size != 0:
        ole_file._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
    # check if stream is not already referenced elsewhere:
    self.is_minifat = False
    if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
        if self.size < ole_file.minisectorcutoff \
        and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
            # ministream object
            self.is_minifat = True
        else:
            self.is_minifat = False
        ole_file._check_duplicate_stream(self.isectStart, self.is_minifat)
    # sector chain is built lazily by build_sect_chain():
    self.sect_chain = None
def build_sect_chain(self, ole_file):
    """
    Build the sector chain for a stream (from the FAT or the MiniFAT)

    :param OleFileIO ole_file: OleFileIO object containing this directory entry
    :return: nothing
    """
    # TODO: seems to be used only from _write_mini_stream, is it useful?
    # TODO: use self.olefile instead of ole_file
    # nothing to do if the chain was already built:
    if self.sect_chain:
        return
    # only root and stream entries with actual data own a sector chain:
    if self.size == 0 or self.entry_type not in (STGTY_ROOT, STGTY_STREAM):
        return

    self.sect_chain = []

    # make sure the MiniFAT is loaded before walking it:
    if self.is_minifat and not ole_file.minifat:
        ole_file.loadminifat()

    # pick the relevant allocation table once, then follow the chain
    # of sector indexes until the end-of-chain marker:
    alloc_table = ole_file.minifat if self.is_minifat else ole_file.fat
    sect = self.isectStart
    while sect != ENDOFCHAIN:
        self.sect_chain.append(sect)
        sect = alloc_table[sect]
def build_storage_tree(self):
    """
    Read and build the red-black tree attached to this OleDirectoryEntry
    object, if it is a storage.
    Note that this method builds a tree of all subentries, so it should
    only be called for the root object once.
    """
    log.debug('build_storage_tree: SID=%d - %s - sid_child=%d'
        % (self.sid, repr(self.name), self.sid_child))
    # NOSTREAM as child SID means this entry has no children at all
    # (i.e. it is not a storage): nothing to build.
    if self.sid_child == NOSTREAM:
        return
    # walk the red-black tree of children and collect them into self.kids:
    self.append_kids(self.sid_child)

    # Note from OpenOffice documentation: the safest way is to
    # recreate the tree because some implementations may store broken
    # red-black trees...

    # in the OLE file, entries are sorted on (length, name).
    # for convenience, we re-sort them by name only, using the rich
    # comparison methods defined on this class:
    self.kids.sort()
def append_kids(self, child_sid):
    """
    Walk through red-black tree of children of this directory entry to add
    all of them to the kids list. (recursive method)

    :param child_sid: index of child directory entry to use, or None when called
        first time for the root. (only used during recursion)
    """
    log.debug('append_kids: child_sid=%d' % child_sid)
    # [PL] this method was added to use simple recursion instead of a complex
    # algorithm.
    # NOTE(review): recursion depth is bounded by the tree depth of the
    # directory; a deeply degenerate tree could hit Python's recursion limit.
    # if this is not a storage or a leaf of the tree, nothing to do:
    if child_sid == NOSTREAM:
        return
    # check if child SID is in the proper range:
    if child_sid<0 or child_sid>=len(self.olefile.direntries):
        self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE DirEntry index out of range')
    else:
        # get child direntry:
        child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
        log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
            % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
        # Check if kid was not already referenced in a storage:
        if child.used:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                'OLE Entry referenced more than once')
            return
        child.used = True
        # the directory entries are organized as a red-black tree.
        # (cf. Wikipedia for details)
        # First walk through left side of the tree:
        self.append_kids(child.sid_left)
        # Check if its name is not already used (case-insensitive):
        name_lower = child.name.lower()
        if name_lower in self.kids_dict:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                "Duplicate filename in OLE storage")
        # Then the child_sid OleDirectoryEntry object is appended to the
        # kids list and dictionary:
        self.kids.append(child)
        self.kids_dict[name_lower] = child
        # Finally walk through right side of the tree:
        self.append_kids(child.sid_right)
        # Afterwards build kid's own tree if it's also a storage:
        child.build_storage_tree()
def __eq__(self, other):
    "Compare entries by name"
    # used by build_storage_tree() when re-sorting kids by name
    return self.name == other.name
def __lt__(self, other):
    "Compare entries by name"
    # used by list.sort() when ordering kids by name
    return self.name < other.name
def __ne__(self, other):
    # negation of __eq__ (needed for Python 2 compatibility)
    return not self.__eq__(other)
def __le__(self, other):
    # defined in terms of __eq__ and __lt__ above
    return self.__eq__(other) or self.__lt__(other)

# Reflected __lt__() and __le__() will be used for __gt__() and __ge__()

# TODO: replace by the same function as MS implementation ?
# (order by name length first, then case-insensitive order)
def dump(self, tab = 0):
    "Dump this entry, and all its subentries (for debug purposes only)"
    # display names indexed by entry_type value:
    TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
             "(property)", "(root)"]
    try:
        type_name = TYPES[self.entry_type]
    except IndexError:
        type_name = '(UNKNOWN)'
    print(" "*tab + repr(self.name), type_name, end=' ')
    # only streams and the root entry have a meaningful size:
    if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
        print(self.size, "bytes", end=' ')
    print()
    if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
        print(" "*tab + "{%s}" % self.clsid)

    # recurse into children with increased indentation:
    for kid in self.kids:
        kid.dump(tab + 2)
def getmtime(self):
    """
    Return modification time of a directory entry.

    :returns: None if modification time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    # a null FILETIME value means the timestamp was never set:
    return filetime2datetime(self.modifyTime) if self.modifyTime else None
def getctime(self):
    """
    Return creation time of a directory entry.

    :returns: None if creation time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    # a null FILETIME value means the timestamp was never set:
    return filetime2datetime(self.createTime) if self.createTime else None
998#--- OleFileIO ----------------------------------------------------------------
class OleFileIO:
    """
    OLE container object

    This class encapsulates the interface to an OLE 2 structured
    storage file. Use the listdir and openstream methods to
    access the contents of this file.

    Object names are given as a list of strings, one for each subentry
    level. The root entry should be omitted. For example, the following
    code extracts all image streams from a Microsoft Image Composer file::

        with OleFileIO("fan.mic") as ole:

            for entry in ole.listdir():
                if entry[1:2] == "Image":
                    fin = ole.openstream(entry)
                    fout = open(entry[0:1], "wb")
                    while True:
                        s = fin.read(8192)
                        if not s:
                            break
                        fout.write(s)

    You can use the viewer application provided with the Python Imaging
    Library to view the resulting files (which happens to be standard
    TIFF files).
    """
def __init__(self, filename=None, raise_defects=DEFECT_FATAL,
             write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING):
    """
    Constructor for the OleFileIO class.

    :param filename: file to open.

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read, seek and tell methods),
          it is parsed as-is. The caller is responsible for closing it when done.

    :param raise_defects: minimal level for defects to be raised as exceptions.
        (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
        security-oriented application, see source code for details)

    :param write_mode: bool, if True the file is opened in read/write mode instead
        of read-only by default.

    :param debug: bool, set debug mode (deprecated, not used anymore)

    :param path_encoding: None or str, name of the codec to use for path
        names (streams and storages), or None for Unicode.
        Unicode by default on Python 3+, UTF-8 on Python 2.x.
        (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
    """
    # minimal level for defects to be raised as exceptions:
    self._raise_defects_level = raise_defects
    #: list of defects/issues not raised as exceptions:
    #: tuples of (exception type, message)
    self.parsing_issues = []
    self.write_mode = write_mode
    self.path_encoding = path_encoding
    # initialize all attributes to default values, so that they exist
    # even if open() fails or is never called:
    self._filesize = None
    self.ministream = None
    self._used_streams_fat = []
    self._used_streams_minifat = []
    self.byte_order = None
    self.directory_fp = None
    self.direntries = None
    self.dll_version = None
    self.fat = None
    self.first_difat_sector = None
    self.first_dir_sector = None
    self.first_mini_fat_sector = None
    self.fp = None
    self.header_clsid = None
    self.header_signature = None
    self.metadata = None
    self.mini_sector_shift = None
    self.mini_sector_size = None
    self.mini_stream_cutoff_size = None
    self.minifat = None
    self.minifatsect = None
    # TODO: duplicates?
    self.minisectorcutoff = None
    self.minisectorsize = None
    self.ministream = None
    self.minor_version = None
    self.nb_sect = None
    self.num_difat_sectors = None
    self.num_dir_sectors = None
    self.num_fat_sectors = None
    self.num_mini_fat_sectors = None
    self.reserved1 = None
    self.reserved2 = None
    self.root = None
    self.sector_shift = None
    self.sector_size = None
    self.transaction_signature_number = None
    # whether this object opened self.fp itself (and must close it):
    self._we_opened_fp = False
    # traceback of the open() call, kept for the not-closed warning:
    self._open_stack = None
    if filename:
        # try opening, ensure fp is closed if that fails
        try:
            self.open(filename, write_mode=write_mode)
        except Exception:
            # caller has no chance of calling close() now
            self._close(warn=False)
            raise
def __del__(self):
    """Destructor, ensures all file handles are closed that we opened."""
    # warn=True: emit a warning pointing at where open() was called,
    # since relying on the destructor means close() was never called.
    self._close(warn=True)
    # super(OleFileIO, self).__del__()  # there's no super-class destructor
def __enter__(self):
    # context manager entry: the file is already opened by __init__/open()
    return self
def __exit__(self, *args):
    # context manager exit: close our file handle (exceptions propagate)
    self._close(warn=False)
def _raise_defect(self, defect_level, message, exception_type=OleFileError):
    """
    This method should be called for any defect found during file parsing.
    It may raise an OleFileError exception according to the minimal level chosen
    for the OleFileIO object.

    :param defect_level: defect level, possible values are:

        - DEFECT_UNSURE    : a case which looks weird, but not sure it's a defect
        - DEFECT_POTENTIAL : a potential defect
        - DEFECT_INCORRECT : an error according to specifications, but parsing can go on
        - DEFECT_FATAL     : an error which cannot be ignored, parsing is impossible

    :param message: string describing the defect, used with raised exception.
    :param exception_type: exception class to be raised, OleFileError by default
    """
    # below the configured threshold: just record the issue and keep parsing
    if defect_level < self._raise_defects_level:
        self.parsing_issues.append((exception_type, message))
        log.warning(message)
        return
    # at or above the threshold: log and raise
    log.error(message)
    raise exception_type(message)
def _decode_utf16_str(self, utf16_str, errors='replace'):
    """
    Decode a string encoded in UTF-16 LE format, as found in the OLE
    directory or in property streams. Return a string encoded
    according to the path_encoding specified for the OleFileIO object.

    :param bytes utf16_str: bytes string encoded in UTF-16 LE format
    :param str errors: str, see python documentation for str.decode()
    :return: str, encoded according to path_encoding
    :rtype: str
    """
    decoded = utf16_str.decode('UTF-16LE', errors)
    # path_encoding=None: callers get the Unicode string as-is
    if not self.path_encoding:
        return decoded
    # otherwise re-encode to the codec configured for path names:
    return decoded.encode(self.path_encoding, errors)
def open(self, filename, write_mode=False):
    """
    Open an OLE2 file in read-only or read/write mode.
    Read and parse the header, FAT and directory.

    :param filename: string-like or file-like object, OLE file to parse

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read, seek and tell methods),
          it is parsed as-is. The caller is responsible for closing it when done

    :param write_mode: bool, if True the file is opened in read/write mode instead
        of read-only by default. (ignored if filename is not a path)
    """
    self.write_mode = write_mode
    # [PL] check if filename is a string-like or file-like object:
    # (it is better to check for a read() method)
    if hasattr(filename, 'read'):
        # TODO: also check seek and tell methods?
        # file-like object: use it directly
        self.fp = filename
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # filename is a bytes string containing the OLE file to be parsed:
        # convert it to BytesIO
        self.fp = io.BytesIO(filename)
    else:
        # string-like object: filename of file on disk
        if self.write_mode:
            # open file in mode 'read with update, binary'
            # According to https://docs.python.org/library/functions.html#open
            # 'w' would truncate the file, 'a' may only append on some Unixes
            mode = 'r+b'
        else:
            # read-only mode by default
            mode = 'rb'
        self.fp = open(filename, mode)
        # remember that we opened the handle (so close() must release it),
        # and where, for the warning emitted if it is never closed:
        self._we_opened_fp = True
        self._open_stack = traceback.extract_stack() # remember for warning
    # obtain the filesize by using seek and tell, which should work on most
    # file-like objects:
    # TODO: do it above, using getsize with filename when possible?
    # TODO: fix code to fail with clear exception when filesize cannot be obtained
    filesize = 0
    self.fp.seek(0, os.SEEK_END)
    try:
        filesize = self.fp.tell()
    finally:
        self.fp.seek(0)
    self._filesize = filesize
    log.debug('File size: %d bytes (%Xh)' % (self._filesize, self._filesize))

    # lists of streams in FAT and MiniFAT, to detect duplicate references
    # (list of indexes of first sectors of each stream)
    self._used_streams_fat = []
    self._used_streams_minifat = []

    header = self.fp.read(512)

    if len(header) != 512 or header[:8] != MAGIC:
        log.debug('Magic = {!r} instead of {!r}'.format(header[:8], MAGIC))
        self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file", NotOleFileError)

    # [PL] header structure according to AAF specifications:
    ##Header
    ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
    ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
    ## // 0x1a, 0xe1} for current version
    ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/
    ## // GetClassFile uses root directory class id)
    ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
    ## // written by reference implementation
    ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for
    ## // 512-byte sectors, 4 for 4 KB sectors
    ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
    ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two;
    ## // typically 9 indicating 512-byte sectors
    ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
    ## // typically 6 indicating 64-byte mini-sectors
    ##USHORT _usReserved; // [22H,02] reserved, must be zero
    ##ULONG _ulReserved1; // [24H,04] reserved, must be zero
    ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
    ## // number of SECTs in directory chain for 4 KB
    ## // sectors
    ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
    ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
    ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
    ## // be zero. The reference implementation
    ## // does not support transactions
    ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
    ## // typically 4096 bytes
    ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
    ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
    ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
    ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
    ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
    ##};

    # [PL] header decoding:
    # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
    fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
    header_size = struct.calcsize(fmt_header)
    log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
    header1 = header[:header_size]
    (
        self.header_signature,
        self.header_clsid,
        self.minor_version,
        self.dll_version,
        self.byte_order,
        self.sector_shift,
        self.mini_sector_shift,
        self.reserved1,
        self.reserved2,
        self.num_dir_sectors,
        self.num_fat_sectors,
        self.first_dir_sector,
        self.transaction_signature_number,
        self.mini_stream_cutoff_size,
        self.first_mini_fat_sector,
        self.num_mini_fat_sectors,
        self.first_difat_sector,
        self.num_difat_sectors
    ) = struct.unpack(fmt_header, header1)
    log.debug( struct.unpack(fmt_header, header1))

    if self.header_signature != MAGIC:
        # OLE signature should always be present
        self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
    if self.header_clsid != bytearray(16):
        # according to AAF specs, CLSID should always be zero
        self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
    log.debug( "Minor Version = %d" % self.minor_version )
    # TODO: according to MS-CFB, minor version should be 0x003E
    log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version )
    if self.dll_version not in [3, 4]:
        # version 3: usual format, 512 bytes per sector
        # version 4: large format, 4K per sector
        self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
    log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order )
    if self.byte_order != 0xFFFE:
        # For now only common little-endian documents are handled correctly
        self._raise_defect(DEFECT_INCORRECT, "incorrect ByteOrder in OLE header")
        # TODO: add big-endian support for documents created on Mac ?
        # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE.
    self.sector_size = 2**self.sector_shift
    log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size )
    if self.sector_size not in [512, 4096]:
        self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header")
    if (self.dll_version==3 and self.sector_size!=512) \
    or (self.dll_version==4 and self.sector_size!=4096):
        self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header")
    self.mini_sector_size = 2**self.mini_sector_shift
    log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size )
    if self.mini_sector_size not in [64]:
        self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header")
    if self.reserved1 != 0 or self.reserved2 != 0:
        self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
    log.debug( "Number of Directory sectors = %d" % self.num_dir_sectors )
    # Number of directory sectors (only allowed if DllVersion != 3)
    if self.sector_size==512 and self.num_dir_sectors!=0:
        self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header")
    log.debug( "Number of FAT sectors = %d" % self.num_fat_sectors )
    # num_fat_sectors = number of FAT sectors in the file
    log.debug( "First Directory sector = %Xh" % self.first_dir_sector )
    # first_dir_sector = 1st sector containing the directory
    log.debug( "Transaction Signature Number = %d" % self.transaction_signature_number )
    # Signature should be zero, BUT some implementations do not follow this
    # rule => only a potential defect:
    # (according to MS-CFB, may be != 0 for applications supporting file
    # transactions)
    if self.transaction_signature_number != 0:
        self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)")
    log.debug( "Mini Stream cutoff size = %Xh (expected: 1000h)" % self.mini_stream_cutoff_size )
    # MS-CFB: This integer field MUST be set to 0x00001000. This field
    # specifies the maximum size of a user-defined data stream allocated
    # from the mini FAT and mini stream, and that cutoff is 4096 bytes.
    # Any user-defined data stream larger than or equal to this cutoff size
    # must be allocated as normal sectors from the FAT.
    if self.mini_stream_cutoff_size != 0x1000:
        self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header")
        # if no exception is raised, the cutoff size is fixed to 0x1000
        log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' %
                    self.mini_stream_cutoff_size)
        self.mini_stream_cutoff_size = 0x1000
    # TODO: check if these values are OK
    log.debug( "First MiniFAT sector = %Xh" % self.first_mini_fat_sector )
    log.debug( "Number of MiniFAT sectors = %d" % self.num_mini_fat_sectors )
    log.debug( "First DIFAT sector = %Xh" % self.first_difat_sector )
    log.debug( "Number of DIFAT sectors = %d" % self.num_difat_sectors )

    # calculate the number of sectors in the file
    # (-1 because header doesn't count)
    self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1
    log.debug( "Maximum number of sectors in the file: %d (%Xh)" % (self.nb_sect, self.nb_sect))
    # TODO: change this test, because an OLE file MAY contain other data
    # after the last sector.

    # file clsid
    self.header_clsid = _clsid(header[8:24])

    # TODO: remove redundant attributes, and fix the code which uses them?
    self.sectorsize = self.sector_size #1 << i16(header, 30)
    self.minisectorsize = self.mini_sector_size #1 << i16(header, 32)
    self.minisectorcutoff = self.mini_stream_cutoff_size # i32(header, 56)

    # check known streams for duplicate references (these are always in FAT,
    # never in MiniFAT):
    self._check_duplicate_stream(self.first_dir_sector)
    # check MiniFAT only if it is not empty:
    if self.num_mini_fat_sectors:
        self._check_duplicate_stream(self.first_mini_fat_sector)
    # check DIFAT only if it is not empty:
    if self.num_difat_sectors:
        self._check_duplicate_stream(self.first_difat_sector)

    # Load file allocation tables
    self.loadfat(header)
    # Load directory. This sets both the direntries list (ordered by sid)
    # and the root (ordered by hierarchy) members.
    self.loaddirectory(self.first_dir_sector)
    self.minifatsect = self.first_mini_fat_sector
def close(self):
    """
    close the OLE file, release the file object if we created it ourselves.

    Leaves the file handle open if it was provided by the caller.
    """
    # warn=False: an explicit close() is the expected usage, no warning needed
    self._close(warn=False)
def _close(self, warn=False):
    """Implementation of close() with internal arg `warn`."""
    # only release file handles that this object opened itself;
    # handles supplied by the caller are left untouched.
    if not self._we_opened_fp:
        return
    if warn:
        # point the user at where open() was originally called
        warnings.warn(OleFileIONotClosed(self._open_stack))
    self.fp.close()
    # avoid double-close on repeated calls:
    self._we_opened_fp = False
def _check_duplicate_stream(self, first_sect, minifat=False):
    """
    Checks if a stream has not been already referenced elsewhere.
    This method should only be called once for each known stream, and only
    if stream size is not null.

    :param first_sect: int, index of first sector of the stream in FAT
    :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT
    """
    if minifat:
        log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect)
        used_streams = self._used_streams_minifat
    else:
        log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect)
        # some values can be safely ignored (not a real stream):
        if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT):
            return
        used_streams = self._used_streams_fat
    # TODO: would it be more efficient using a dict or hash values, instead
    # of a list of long ?
    if first_sect in used_streams:
        # same first sector seen twice => two directory entries (or header
        # fields) claim the same sector chain, which is a malformed file:
        self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice')
    else:
        used_streams.append(first_sect)
def dumpfat(self, fat, firstindex=0):
    """
    Display a part of FAT in human-readable form for debugging purposes
    """
    # dictionary to convert special FAT values in human-readable strings
    VPL = 8 # values per line (8+1 * 8+1 = 81)
    fatnames = {
        FREESECT: "..free..",
        ENDOFCHAIN: "[ END. ]",
        FATSECT: "FATSECT ",
        DIFSECT: "DIFSECT "
    }
    nbsect = len(fat)
    nlines = (nbsect+VPL-1)//VPL
    # header row: column offsets in hex
    print("index", end=" ")
    for i in range(VPL):
        print("%8X" % i, end=" ")
    print()
    for l in range(nlines):
        index = l*VPL
        print("%6X:" % (firstindex+index), end=" ")
        for i in range(index, index+VPL):
            if i>=nbsect:
                break
            sect = fat[i]
            aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND
            if aux in fatnames:
                name = fatnames[aux]
            else:
                # an entry pointing at the next sector is a simple chain:
                if sect == i+1:
                    name = "    --->"
                else:
                    name = "%8X" % sect
            print(name, end=" ")
        print()
def dumpsect(self, sector, firstindex=0):
    """
    Display a sector in a human-readable form, for debugging purposes
    """
    VPL=8 # number of values per line (8+1 * 8+1 = 81)
    tab = array.array(UINT32, sector)
    # stored values are little-endian; swap on big-endian hosts:
    if sys.byteorder == 'big':
        tab.byteswap()
    nbsect = len(tab)
    nlines = (nbsect+VPL-1)//VPL
    # header row: column offsets in hex
    print("index", end=" ")
    for i in range(VPL):
        print("%8X" % i, end=" ")
    print()
    for l in range(nlines):
        index = l*VPL
        print("%6X:" % (firstindex+index), end=" ")
        for i in range(index, index+VPL):
            if i>=nbsect:
                break
            sect = tab[i]
            name = "%8X" % sect
            print(name, end=" ")
        print()
def sect2array(self, sect):
    """
    convert a sector to an array of 32 bits unsigned integers,
    swapping bytes on big endian CPUs such as PowerPC (old Macs)
    """
    # TODO: make this a static function
    arr = array.array(UINT32, sect)
    # on-disk format is little-endian; swap on big-endian hosts
    # (sys.byteorder is either 'little' or 'big'):
    if sys.byteorder != 'little':
        arr.byteswap()
    return arr
def loadfat_sect(self, sect):
    """
    Adds the indexes of the given sector to the FAT

    :param sect: string containing the first FAT sector, or array of long integers
    :returns: index of last FAT sector.
    """
    # a FAT sector is an array of ulong integers.
    if isinstance(sect, array.array):
        # if sect is already an array it is directly used
        fat1 = sect
    else:
        # if it's a raw sector, it is parsed in an array
        fat1 = self.sect2array(sect)
        # Display the sector contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            self.dumpsect(sect)
    # The FAT is a sector chain starting at the first index of itself.
    # initialize isect, just in case the loop body never runs:
    isect = None
    for isect in fat1:
        isect = isect & 0xFFFFFFFF # JYTHON-WORKAROUND
        log.debug("isect = %X" % isect)
        if isect == ENDOFCHAIN or isect == FREESECT:
            # the end of the sector chain has been reached
            log.debug("found end of sector chain")
            break
        # read the FAT sector
        s = self.getsect(isect)
        # parse it as an array of 32 bits integers, and add it to the
        # global FAT array
        nextfat = self.sect2array(s)
        self.fat = self.fat + nextfat
    return isect
def loadfat(self, header):
    """
    Load the FAT table.

    :param header: bytes, the 512-byte OLE header sector. Bytes 76:512 hold
        the first 109 FAT sector indexes; any further FAT sectors are reached
        through the DIFAT chain (see below).
    """
    # The 1st sector of the file contains sector numbers for the first 109
    # FAT sectors, right after the header which is 76 bytes long.
    # (always 109, whatever the sector size: 512 bytes = 76+4*109)
    # Additional sectors are described by DIF blocks
    log.debug('Loading the FAT table, starting with the 1st sector after the header')
    sect = header[76:512]
    log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )
    # fat = []
    # FAT is an array of 32 bits unsigned ints, it's more effective
    # to use an array than a list in Python.
    # It's initialized as empty first:
    self.fat = array.array(UINT32)
    self.loadfat_sect(sect)
    # self.dumpfat(self.fat)
    # for i in range(0, len(sect), 4):
    #     ix = i32(sect, i)
    #     # [PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
    #     if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
    #         break
    #     s = self.getsect(ix)
    #     # fat = fat + [i32(s, i) for i in range(0, len(s), 4)]
    #     fat = fat + array.array(UINT32, s)
    if self.num_difat_sectors != 0:
        log.debug('DIFAT is used, because file size > 6.8MB.')
        # [PL] There's a DIFAT because file is larger than 6.8MB
        # some checks just in case:
        if self.num_fat_sectors <= 109:
            # there must be at least 109 blocks in header and the rest in
            # DIFAT, so number of sectors must be >109.
            self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
        if self.first_difat_sector >= self.nb_sect:
            # initial DIFAT block index must be valid
            self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
        log.debug( "DIFAT analysis..." )
        # We compute the necessary number of DIFAT sectors :
        # Number of pointers per DIFAT sector = (sectorsize/4)-1
        # (-1 because the last pointer is the next DIFAT sector number)
        nb_difat_sectors = (self.sectorsize//4)-1
        # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
        # ceiling division: number of DIFAT sectors needed for the FAT sectors
        # beyond the 109 listed in the header:
        nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors
        log.debug( "nb_difat = %d" % nb_difat )
        if self.num_difat_sectors != nb_difat:
            raise IOError('incorrect DIFAT')
        isect_difat = self.first_difat_sector
        for i in iterrange(nb_difat):
            log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
            # TODO: check if corresponding FAT SID = DIFSECT
            sector_difat = self.getsect(isect_difat)
            difat = self.sect2array(sector_difat)
            # Display the sector contents only if the logging level is debug:
            if log.isEnabledFor(logging.DEBUG):
                self.dumpsect(sector_difat)
            # all but the last index of a DIFAT sector point to FAT sectors:
            self.loadfat_sect(difat[:nb_difat_sectors])
            # last DIFAT pointer is next DIFAT sector:
            isect_difat = difat[nb_difat_sectors]
            log.debug( "next DIFAT sector: %X" % isect_difat )
        # checks:
        if isect_difat not in [ENDOFCHAIN, FREESECT]:
            # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
            raise IOError('incorrect end of DIFAT')
        # if len(self.fat) != self.num_fat_sectors:
        #     # FAT should contain num_fat_sectors blocks
        #     print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors))
        #     raise IOError('incorrect DIFAT')
    else:
        log.debug('No DIFAT, because file size < 6.8MB.')
    # since FAT is read from fixed-size sectors, it may contain more values
    # than the actual number of sectors in the file.
    # Keep only the relevant sector indexes:
    if len(self.fat) > self.nb_sect:
        log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
        self.fat = self.fat[:self.nb_sect]
    log.debug('FAT references %d sectors / Maximum %d sectors in file' % (len(self.fat), self.nb_sect))
    # Display the FAT contents only if the logging level is debug:
    if log.isEnabledFor(logging.DEBUG):
        log.debug('\nFAT:')
        self.dumpfat(self.fat)
def loadminifat(self):
    """
    Load the MiniFAT table.
    """
    # The MiniFAT lives in a standard sub-stream, located via a header field.
    # Two distinct sizes matter here:
    # 1) allocated size: number of MiniFAT sectors declared in the OLE
    #    header times the sector size (may exceed what is actually used);
    # 2) used size: one 32-bit index per mini sector of the MiniStream,
    #    whose total size comes from the root directory entry.
    stream_size = self.num_mini_fat_sectors * self.sector_size
    nb_minisectors = (self.root.size + self.mini_sector_size - 1) // self.mini_sector_size
    used_size = nb_minisectors * 4
    log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
        (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors))
    if used_size > stream_size:
        # Not necessarily fatal, but hints at a non-conformant writer:
        self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
    # Read the whole allocated stream first, then parse it as an array of
    # 32-bit unsigned integers:
    raw_minifat = self._open(self.minifatsect, stream_size, force_FAT=True).read()
    self.minifat = self.sect2array(raw_minifat)
    # Drop trailing indexes not backed by the MiniStream, so no index can
    # point past its end:
    log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
    self.minifat = self.minifat[:nb_minisectors]
    log.debug('loadminifat(): len=%d' % len(self.minifat))
    # Dump the MiniFAT contents only when debug logging is enabled:
    if log.isEnabledFor(logging.DEBUG):
        log.debug('\nMiniFAT:')
        self.dumpfat(self.minifat)
def getsect(self, sect):
    """
    Read given sector from file on disk.

    :param sect: int, sector index
    :returns: a string containing the sector data.
    """
    # Per [MS-CFB], sector n starts at byte offset (n+1) * sector size,
    # because the 512-byte header occupies the first "sector" of the file.
    # (the original PIL code hardcoded 512 and broke with 4KB sectors)
    offset = self.sectorsize * (sect + 1)
    try:
        self.fp.seek(offset)
    except Exception:
        log.debug('getsect(): sect=%X, seek=%d, filesize=%d' %
            (sect, self.sectorsize*(sect+1), self._filesize))
        self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
    sector = self.fp.read(self.sectorsize)
    # a short read means the sector index points past the end of file:
    if len(sector) != self.sectorsize:
        log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
            (sect, len(sector), self.sectorsize))
        self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
    return sector
def write_sect(self, sect, data, padding=b'\x00'):
    """
    Write given sector to file on disk.

    :param sect: int, sector index
    :param data: bytes, sector data
    :param padding: single byte, padding character if data < sector size
    :raises TypeError: if data or padding has the wrong type
    :raises ValueError: if data is larger than one sector
    """
    if not isinstance(data, bytes):
        raise TypeError("write_sect: data must be a bytes string")
    if not isinstance(padding, bytes) or len(padding)!=1:
        raise TypeError("write_sect: padding must be a bytes string of 1 char")
    # TODO: we could allow padding=None for no padding at all
    # sector n starts at byte offset (n+1) * sectorsize (header = sector -1):
    try:
        self.fp.seek(self.sectorsize * (sect+1))
    except Exception:
        log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' %
            (sect, self.sectorsize*(sect+1), self._filesize))
        self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
    if len(data) < self.sectorsize:
        # add padding
        data += padding * (self.sectorsize - len(data))
    elif len(data) > self.sectorsize:
        # BUGFIX: this condition was "< self.sectorsize" (unreachable after
        # the first branch), so oversized data was silently written and
        # overflowed into the following sector.
        raise ValueError("Data is larger than sector size")
    self.fp.write(data)
1723 def _write_mini_sect(self, fp_pos, data, padding = b'\x00'):
1724 """
1725 Write given sector to file on disk.
1727 :param fp_pos: int, file position
1728 :param data: bytes, sector data
1729 :param padding: single byte, padding character if data < sector size
1730 """
1731 if not isinstance(data, bytes):
1732 raise TypeError("write_mini_sect: data must be a bytes string")
1733 if not isinstance(padding, bytes) or len(padding) != 1:
1734 raise TypeError("write_mini_sect: padding must be a bytes string of 1 char")
1736 try:
1737 self.fp.seek(fp_pos)
1738 except Exception:
1739 log.debug('write_mini_sect(): fp_pos=%d, filesize=%d' %
1740 (fp_pos, self._filesize))
1741 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1742 len_data = len(data)
1743 if len_data < self.mini_sector_size:
1744 data += padding * (self.mini_sector_size - len_data)
1745 if self.mini_sector_size < len_data:
1746 raise ValueError("Data is larger than sector size")
1747 self.fp.write(data)
def loaddirectory(self, sect):
    """
    Load the directory.

    :param sect: sector index of directory stream.
    """
    log.debug('Loading the Directory:')
    # The directory is a standard stream read through the FAT; its actual
    # size is not known in advance, so it is opened with force_FAT:
    self.directory_fp = self._open(sect, force_FAT=True)

    # Bound the number of directory entries by the stream size (one entry is
    # 128 bytes), to resist malformed documents and DoS attacks:
    max_entries = self.directory_fp.size // 128
    log.debug('loaddirectory: size=%d, max_entries=%d' %
        (self.directory_fp.size, max_entries))

    # Entries are loaded lazily; None marks a slot not read yet:
    self.direntries = [None] * max_entries
    # The root entry always has SID 0; load it first:
    root_entry = self._load_direntry(0)
    self.root = self.direntries[0]
    # Then build the whole storage tree by walking from the root:
    self.root.build_storage_tree()
1789 def _load_direntry (self, sid):
1790 """
1791 Load a directory entry from the directory.
1792 This method should only be called once for each storage/stream when
1793 loading the directory.
1795 :param sid: index of storage/stream in the directory.
1796 :returns: a OleDirectoryEntry object
1798 :exception OleFileError: if the entry has always been referenced.
1799 """
1800 # check if SID is OK:
1801 if sid<0 or sid>=len(self.direntries):
1802 self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
1803 # check if entry was already referenced:
1804 if self.direntries[sid] is not None:
1805 self._raise_defect(DEFECT_INCORRECT,
1806 "double reference for OLE stream/storage")
1807 # if exception not raised, return the object
1808 return self.direntries[sid]
1809 self.directory_fp.seek(sid * 128)
1810 entry = self.directory_fp.read(128)
1811 self.direntries[sid] = OleDirectoryEntry(entry, sid, self)
1812 return self.direntries[sid]
def dumpdirectory(self):
    """
    Dump the directory tree (for debugging only).
    """
    # the root entry recursively dumps all of its kids:
    self.root.dump()
def _open(self, start, size = UNKNOWN_SIZE, force_FAT=False):
    """
    Open a stream, either in FAT or MiniFAT according to its size.
    (openstream helper)

    :param start: index of first sector
    :param size: size of stream (or nothing if size is unknown)
    :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT
        according to size. If True, it will always be opened in FAT.
    :returns: an OleStream file-like object (read-only)
    """
    log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' %
        (start, size, str(force_FAT)))
    # stream size is compared to the mini_stream_cutoff_size threshold:
    if size < self.minisectorcutoff and not force_FAT:
        # ministream object
        if not self.ministream:
            # load MiniFAT if it wasn't already done:
            self.loadminifat()
            # The first sector index of the miniFAT stream is stored in the
            # root directory entry:
            size_ministream = self.root.size
            log.debug('Opening MiniStream: sect=%Xh, size=%d' %
                (self.root.isectStart, size_ministream))
            # NOTE: the MiniStream itself lives in regular FAT sectors, so it
            # is opened recursively with force_FAT=True, and cached on self
            # for all subsequent small-stream opens:
            self.ministream = self._open(self.root.isectStart,
                size_ministream, force_FAT=True)
        # small stream: sectors are mini sectors inside the MiniStream
        # (offset=0 because mini sector 0 starts at the MiniStream start):
        return OleStream(fp=self.ministream, sect=start, size=size,
                         offset=0, sectorsize=self.minisectorsize,
                         fat=self.minifat, filesize=self.ministream.size,
                         olefileio=self)
    else:
        # standard stream: regular sectors read straight from the file,
        # offset by one sector size to skip the OLE header:
        return OleStream(fp=self.fp, sect=start, size=size,
                         offset=self.sectorsize,
                         sectorsize=self.sectorsize, fat=self.fat,
                         filesize=self._filesize,
                         olefileio=self)
def _list(self, files, prefix, node, streams=True, storages=False):
    """
    listdir helper: recursively collect stream/storage paths under *node*.

    :param files: list of files to fill in
    :param prefix: current location in storage tree (list of names)
    :param node: current node (OleDirectoryEntry object)
    :param streams: bool, include streams if True (True by default) - new in v0.26
    :param storages: bool, include storages if True (False by default) - new in v0.26
        (note: the root storage is never included)
    """
    path = prefix + [node.name]
    for kid in node.kids:
        kind = kid.entry_type
        if kind == STGTY_STORAGE:
            if storages:
                # record the storage itself; path[1:] strips the root name
                files.append(path[1:] + [kid.name])
            # always descend into the storage's children:
            self._list(files, path, kid, streams, storages)
        elif kind == STGTY_STREAM:
            if streams:
                files.append(path[1:] + [kid.name])
        else:
            # anything else in the tree is a structural defect:
            self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an entry which is not a stream nor a storage.')
def listdir(self, streams=True, storages=False):
    """
    Return a list of streams and/or storages stored in this file

    :param streams: bool, include streams if True (True by default) - new in v0.26
    :param storages: bool, include storages if True (False by default) - new in v0.26
        (note: the root storage is never included)
    :returns: list of stream and/or storage paths
    """
    # delegate the recursive walk to _list, starting at the root:
    paths = []
    self._list(paths, [], self.root, streams, storages)
    return paths
def _find(self, filename):
    """
    Returns directory entry of given filename. (openstream helper)
    Note: this method is case-insensitive.

    :param filename: path of stream in storage tree (except root entry), either:

        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - or a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']

    :returns: sid of requested filename
    :exception IOError: if file not found
    """
    # normalize a slash-separated string into a list of path components
    # (basestring is an alias covering both str types on Python 2 and 3):
    if isinstance(filename, basestring):
        filename = filename.split('/')
    # descend the storage tree one component at a time, matching names
    # case-insensitively:
    node = self.root
    for name in filename:
        for kid in node.kids:
            if kid.name.lower() == name.lower():
                node = kid
                break
        else:
            # no kid matched this component:
            raise IOError("file not found")
    return node.sid
def openstream(self, filename):
    """
    Open a stream as a read-only file object (BytesIO).
    Note: filename is case-insensitive.

    :param filename: path of stream in storage tree (except root entry), either:

        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - or a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']

    :returns: file object (read-only)
    :exception IOError: if filename not found, or if this is not a stream.
    """
    entry = self.direntries[self._find(filename)]
    # only leaf streams can be opened, not storages:
    if entry.entry_type != STGTY_STREAM:
        raise IOError("this file is not a stream")
    return self._open(entry.isectStart, entry.size)
def _write_mini_stream(self, entry, data_to_write):
    """
    Overwrite an existing small stream stored in the MiniStream.
    (write_stream helper)

    :param entry: OleDirectoryEntry of the stream to overwrite
    :param data_to_write: bytes, same size as the existing stream
    """
    # make sure the mini-sector chain of the stream is known:
    if not entry.sect_chain:
        entry.build_sect_chain(self)
    nb_sectors = len(entry.sect_chain)

    # the root entry's chain lists the regular sectors holding the MiniStream:
    if not self.root.sect_chain:
        self.root.build_sect_chain(self)
    # number of mini sectors that fit in one regular sector:
    block_size = self.sector_size // self.mini_sector_size
    for idx, sect in enumerate(entry.sect_chain):
        # map the mini sector index to (regular sector, offset within it),
        # then to an absolute file position (+1 sector skips the OLE header):
        sect_base = sect // block_size
        sect_offset = sect % block_size
        fp_pos = (self.root.sect_chain[sect_base] + 1)*self.sector_size + sect_offset*self.mini_sector_size
        # slice one mini sector of data; the last slice may be shorter and
        # is padded by _write_mini_sect:
        if idx < (nb_sectors - 1):
            data_per_sector = data_to_write[idx * self.mini_sector_size: (idx + 1) * self.mini_sector_size]
        else:
            data_per_sector = data_to_write[idx * self.mini_sector_size:]
        self._write_mini_sect(fp_pos, data_per_sector)
def write_stream(self, stream_name, data):
    """
    Write a stream to disk. For now, it is only possible to replace an
    existing stream by data of the same size.

    :param stream_name: path of stream in storage tree (except root entry), either:

        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - or a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']

    :param data: bytes, data to be written, must be the same size as the original
        stream.
    :raises TypeError: if data is not bytes
    :raises IOError: if the path is not a stream, or the FAT chain is corrupt
    :raises ValueError: if data size differs from the existing stream
    """
    if not isinstance(data, bytes):
        raise TypeError("write_stream: data must be a bytes string")
    sid = self._find(stream_name)
    entry = self.direntries[sid]
    if entry.entry_type != STGTY_STREAM:
        raise IOError("this is not a stream")
    size = entry.size
    if size != len(data):
        raise ValueError("write_stream: data must be the same size as the existing stream")
    # small streams are stored in the MiniStream and written through MiniFAT:
    if size < self.minisectorcutoff and entry.entry_type != STGTY_ROOT:
        return self._write_mini_stream(entry = entry, data_to_write = data)

    # large stream: follow the FAT chain, overwriting one sector at a time
    sect = entry.isectStart
    # number of sectors to write
    nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize
    log.debug('nb_sectors = %d' % nb_sectors)
    for i in range(nb_sectors):
        # try:
        #     self.fp.seek(offset + self.sectorsize * sect)
        # except Exception:
        #     log.debug('sect=%d, seek=%d' %
        #         (sect, offset+self.sectorsize*sect))
        #     raise IOError('OLE sector index out of range')
        # extract one sector from data, the last one being smaller:
        if i<(nb_sectors-1):
            data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize]
            # TODO: comment this if it works
            assert(len(data_sector)==self.sectorsize)
        else:
            data_sector = data [i*self.sectorsize:]
            # TODO: comment this if it works
            log.debug('write_stream: size=%d sectorsize=%d data_sector=%Xh size%%sectorsize=%d'
                % (size, self.sectorsize, len(data_sector), size % self.sectorsize))
            assert(len(data_sector) % self.sectorsize==size % self.sectorsize)
        self.write_sect(sect, data_sector)
        # self.fp.write(data_sector)
        # jump to next sector in the FAT:
        try:
            sect = self.fat[sect]
        except IndexError:
            # [PL] if pointer is out of the FAT an exception is raised
            raise IOError('incorrect OLE FAT, sector index out of range')
    # [PL] Last sector should be a "end of chain" marker:
    if sect != ENDOFCHAIN:
        raise IOError('incorrect last sector index in OLE stream')
def get_type(self, filename):
    """
    Test if given filename exists as a stream or a storage in the OLE
    container, and return its type.

    :param filename: path of stream in storage tree. (see openstream for syntax)
    :returns: False if object does not exist, its entry type (>0) otherwise:

        - STGTY_STREAM: a stream
        - STGTY_STORAGE: a storage
        - STGTY_ROOT: the root entry
    """
    # any lookup failure (not found, malformed entry) means "does not exist":
    try:
        sid = self._find(filename)
        return self.direntries[sid].entry_type
    except Exception:
        return False
def getclsid(self, filename):
    """
    Return clsid of a stream/storage.

    :param filename: path of stream/storage in storage tree. (see openstream for
        syntax)
    :returns: Empty string if clsid is null, a printable representation of the clsid otherwise

    new in version 0.44
    """
    # resolve the path to a SID, then read the clsid off the entry:
    return self.direntries[self._find(filename)].clsid
def getmtime(self, filename):
    """
    Return modification time of a stream/storage.

    :param filename: path of stream/storage in storage tree. (see openstream for
        syntax)
    :returns: None if modification time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    # the directory entry itself knows how to decode its timestamp:
    return self.direntries[self._find(filename)].getmtime()
def getctime(self, filename):
    """
    Return creation time of a stream/storage.

    :param filename: path of stream/storage in storage tree. (see openstream for
        syntax)
    :returns: None if creation time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    # the directory entry itself knows how to decode its timestamp:
    return self.direntries[self._find(filename)].getctime()
def exists(self, filename):
    """
    Test if given filename exists as a stream or a storage in the OLE
    container.
    Note: filename is case-insensitive.

    :param filename: path of stream in storage tree. (see openstream for syntax)
    :returns: True if object exist, else False.
    """
    # existence is defined by whether the path lookup succeeds:
    try:
        self._find(filename)
    except Exception:
        return False
    return True
def get_size(self, filename):
    """
    Return size of a stream in the OLE container, in bytes.

    :param filename: path of stream in storage tree (see openstream for syntax)
    :returns: size in bytes (long integer)
    :exception IOError: if file not found
    :exception TypeError: if this is not a stream.
    """
    entry = self.direntries[self._find(filename)]
    # storages and the root have no meaningful size here:
    if entry.entry_type != STGTY_STREAM:
        # TODO: Should it return zero instead of raising an exception ?
        raise TypeError('object is not an OLE stream')
    return entry.size
def get_rootentry_name(self):
    """
    Return root entry name. Should usually be 'Root Entry' or 'R' in most
    implementations.
    """
    root = self.root
    return root.name
def getproperties(self, filename, convert_time=False, no_conversion=None):
    """
    Return properties described in substream.

    :param filename: path of stream in storage tree (see openstream for syntax)
    :param convert_time: bool, if True timestamps will be converted to Python datetime
    :param no_conversion: None or list of int, timestamps not to be converted
        (for example total editing time is not a real timestamp)

    :returns: a dictionary of values indexed by id (integer)
    """
    #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
    # make sure no_conversion is a list, just to simplify code below:
    # ([fix] identity test "is None" instead of "== None", which invokes
    # __eq__ and is not the idiomatic None check)
    if no_conversion is None:
        no_conversion = []
    # stream path as a string to report exceptions:
    streampath = filename
    if not isinstance(streampath, str):
        streampath = '/'.join(streampath)
    fp = self.openstream(filename)
    data = {}
    try:
        # header
        s = fp.read(28)
        clsid = _clsid(s[8:24])
        # format id
        s = fp.read(20)
        fmtid = _clsid(s[:16])
        fp.seek(i32(s, 16))
        # get section
        s = b"****" + fp.read(i32(fp.read(4))-4)
        # number of properties:
        num_props = i32(s, 4)
    except BaseException as exc:
        # catch exception while parsing property header, and only raise
        # a DEFECT_INCORRECT then return an empty dict, because this is not
        # a fatal error when parsing the whole file
        msg = 'Error while parsing properties header in stream {}: {}'.format(
            repr(streampath), exc)
        self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
        return data
    # clamp num_props based on the data length, to avoid reading past the
    # section buffer ([fix] integer floor division instead of int(float)):
    num_props = min(num_props, len(s) // 8)
    for i in iterrange(num_props):
        property_id = 0  # just in case of an exception
        try:
            property_id = i32(s, 8+i*8)
            offset = i32(s, 12+i*8)
            property_type = i32(s, offset)

            vt_name = VT.get(property_type, 'UNKNOWN')
            log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))

            # the value itself starts right after its 32-bit type field:
            value = self._parse_property(s, offset+4, property_id, property_type, convert_time, no_conversion)
            data[property_id] = value
        except BaseException as exc:
            # catch exception while parsing each property, and only raise
            # a DEFECT_INCORRECT, because parsing can go on
            msg = 'Error while parsing property id %d in stream %s: %s' % (
                property_id, repr(streampath), exc)
            self._raise_defect(DEFECT_INCORRECT, msg, type(exc))

    return data
def _parse_property(self, s, offset, property_id, property_type, convert_time, no_conversion):
    """
    Parse one property value from a property set section.
    (getproperties helper)

    :param s: bytes, the whole property section buffer
    :param offset: int, offset of the value (its 32-bit type field already skipped)
    :param property_id: int, property identifier (used for logging/conversion)
    :param property_type: int, VT_* type code of the property
    :param convert_time: bool, convert FILETIME values to datetime if True
    :param no_conversion: list of int, property ids not to be converted
    :returns: the parsed value, a list of values for vectors, or None if the
        type is not supported
    """
    v = None
    if property_type <= VT_BLOB or property_type in (VT_CLSID, VT_CF):
        # simple scalar value:
        v, _ = self._parse_property_basic(s, offset, property_id, property_type, convert_time, no_conversion)
    elif property_type == VT_VECTOR | VT_VARIANT:
        # vector of variants: each element carries its own 32-bit type code
        # before its value
        log.debug('property_type == VT_VECTOR | VT_VARIANT')
        off = 4
        count = i32(s, offset)
        values = []
        for _ in range(count):
            property_type = i32(s, offset + off)
            v, sz = self._parse_property_basic(s, offset + off + 4, property_id, property_type, convert_time, no_conversion)
            values.append(v)
            # advance past the value and its type field:
            off += sz + 4
        v = values

    elif property_type & VT_VECTOR:
        # vector of a single scalar type (type code given once, elements packed):
        property_type_base = property_type & ~VT_VECTOR
        log.debug('property_type == VT_VECTOR | %s' % VT.get(property_type_base, 'UNKNOWN'))
        off = 4
        count = i32(s, offset)
        values = []
        for _ in range(count):
            v, sz = self._parse_property_basic(s, offset + off, property_id, property_type & ~VT_VECTOR, convert_time, no_conversion)
            values.append(v)
            off += sz
        v = values
    else:
        log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
    return v
def _parse_property_basic(self, s, offset, property_id, property_type, convert_time, no_conversion):
    """
    Parse one scalar property value. (_parse_property helper)

    :param s: bytes, the whole property section buffer
    :param offset: int, offset of the value itself (type field already skipped)
    :param property_id: int, property identifier (for logging/time conversion)
    :param property_type: int, VT_* type code of the value
    :param convert_time: bool, convert FILETIME values to datetime if True
    :param no_conversion: list of int, property ids not to be converted
    :returns: tuple (value, size) where size is the number of bytes consumed,
        so vector parsers can advance to the next element.
    """
    value = None
    size = 0
    # test for common types first (should perhaps use
    # a dictionary instead?)

    if property_type == VT_I2: # 16-bit signed integer
        value = i16(s, offset)
        if value >= 32768:
            # manual sign extension, since i16 decodes unsigned:
            value = value - 65536
        size = 2
    elif property_type == VT_UI2: # 2-byte unsigned integer
        value = i16(s, offset)
        size = 2
    elif property_type in (VT_I4, VT_INT, VT_ERROR):
        # VT_I4: 32-bit signed integer
        # VT_ERROR: HRESULT, similar to 32-bit signed integer,
        # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
        value = i32(s, offset)
        size = 4
    elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
        value = i32(s, offset) # FIXME
        size = 4
    elif property_type in (VT_BSTR, VT_LPSTR):
        # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
        # size is a 32 bits integer, including the null terminator, and
        # possibly trailing or embedded null chars
        #TODO: if codepage is unicode, the string should be converted as such
        count = i32(s, offset)
        # count-1 drops the null terminator:
        value = s[offset+4:offset+4+count-1]
        # remove all null chars:
        value = value.replace(b'\x00', b'')
        size = 4 + count
    elif property_type == VT_BLOB:
        # binary large object (BLOB)
        # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
        count = i32(s, offset)
        value = s[offset+4:offset+4+count]
        size = 4 + count
    elif property_type == VT_LPWSTR:
        # UnicodeString
        # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
        # "the string should NOT contain embedded or additional trailing
        # null characters."
        count = i32(s, offset+4)
        value = self._decode_utf16_str(s[offset+4:offset+4+count*2])
        size = 4 + count * 2
    elif property_type == VT_FILETIME:
        value = long(i32(s, offset)) + (long(i32(s, offset+4))<<32)
        # FILETIME is a 64-bit int: "number of 100ns periods
        # since Jan 1,1601".
        if convert_time and property_id not in no_conversion:
            log.debug('Converting property #%d to python datetime, value=%d=%fs'
                    %(property_id, value, float(value)/10000000))
            # convert FILETIME to Python datetime.datetime
            # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
            _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
            log.debug('timedelta days=%d' % (value//(10*1000000*3600*24)))
            value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10)
        else:
            # legacy code kept for backward compatibility: returns a
            # number of seconds since Jan 1,1601
            value = value // 10000000 # seconds
        size = 8
    elif property_type == VT_UI1: # 1-byte unsigned integer
        value = i8(s[offset])
        size = 1
    elif property_type == VT_CLSID:
        value = _clsid(s[offset:offset+16])
        size = 16
    elif property_type == VT_CF:
        # PropertyIdentifier or ClipboardData??
        # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
        count = i32(s, offset)
        value = s[offset+4:offset+4+count]
        size = 4 + count
    elif property_type == VT_BOOL:
        # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
        # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
        value = bool(i16(s, offset))
        size = 2
    else:
        value = None # everything else yields "None"
        log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))

    # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
    # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
    # see https://msdn.microsoft.com/en-us/library/dd942033.aspx

    #print("%08x" % property_id, repr(value), end=" ")
    #print("(%s)" % VT[i32(s, offset) & 0xFFF])
    return value, size
def get_metadata(self):
    """
    Parse standard properties streams, return an OleMetadata object
    containing all the available metadata.
    (also stored in the metadata attribute of the OleFileIO object)

    new in version 0.25
    """
    # cache the object on self before parsing, then fill it in:
    meta = OleMetadata()
    self.metadata = meta
    meta.parse_properties(self)
    return meta
2331 def get_userdefined_properties(self, filename, convert_time=False, no_conversion=None):
2332 """
2333 Return properties described in substream.
2335 :param filename: path of stream in storage tree (see openstream for syntax)
2336 :param convert_time: bool, if True timestamps will be converted to Python datetime
2337 :param no_conversion: None or list of int, timestamps not to be converted
2338 (for example total editing time is not a real timestamp)
2340 :returns: a dictionary of values indexed by id (integer)
2341 """
2342 # REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
2343 # REFERENCE: https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-oshared/2ea8be67-a4a0-4e2e-b42f-49a182645562
2344 #'D5CDD502-2E9C-101B-9397-08002B2CF9AE'
2345 # TODO: testing the code more rigorously
2346 # TODO: adding exception handeling
2347 FMTID_USERDEFINED_PROPERTIES = _clsid(b'\x05\xD5\xCD\xD5\x9C\x2E\x1B\x10\x93\x97\x08\x00\x2B\x2C\xF9\xAE')
2349 # make sure no_conversion is a list, just to simplify code below:
2350 if no_conversion == None:
2351 no_conversion = []
2352 # stream path as a string to report exceptions:
2353 streampath = filename
2354 if not isinstance(streampath, str):
2355 streampath = '/'.join(streampath)
2357 fp = self.openstream(filename)
2359 data = []
2361 # header
2362 s = fp.read(28)
2363 clsid = _clsid(s[8:24])
2365 # PropertySetStream.cSections (4 bytes starts at 1c): number of property sets in this stream
2366 sections_count = i32(s, 24)
2368 section_file_pointers = []
2370 try:
2371 for i in range(sections_count):
2372 # format id
2373 s = fp.read(20)
2374 fmtid = _clsid(s[:16])
2376 if fmtid == FMTID_USERDEFINED_PROPERTIES:
2377 file_pointer = i32(s, 16)
2378 fp.seek(file_pointer)
2379 # read saved sections
2380 s = b"****" + fp.read(i32(fp.read(4)) - 4)
2381 # number of properties:
2382 num_props = i32(s, 4)
2384 PropertyIdentifierAndOffset = s[8: 8+8*num_props]
2386 # property names (dictionary)
2387 # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/99127b7f-c440-4697-91a4-c853086d6b33
2388 index = 8+8*num_props
2389 entry_count = i32(s[index: index+4])
2390 index += 4
2391 for i in range(entry_count):
2392 identifier = s[index: index +4]
2393 str_size = i32(s[index+4: index + 8])
2394 string = s[index+8: index+8+str_size].decode('utf_8').strip('\0')
2395 data.append({'property_name':string, 'value':None})
2396 index = index+8+str_size
2397 # clamp num_props based on the data length
2398 num_props = min(num_props, int(len(s) / 8))
2400 # property values
2401 # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/f122b9d7-e5cf-4484-8466-83f6fd94b3cc
2402 for i in iterrange(2, num_props):
2403 property_id = 0 # just in case of an exception
2404 try:
2405 property_id = i32(s, 8 + i * 8)
2406 offset = i32(s, 12 + i * 8)
2407 property_type = i32(s, offset)
2409 vt_name = VT.get(property_type, 'UNKNOWN')
2410 log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))
2412 # test for common types first (should perhaps use
2413 # a dictionary instead?)
2415 if property_type == VT_I2: # 16-bit signed integer
2416 value = i16(s, offset + 4)
2417 if value >= 32768:
2418 value = value - 65536
2419 elif property_type == 1:
2420 # supposed to be VT_NULL but seems it is not NULL
2421 str_size = i32(s, offset + 8)
2422 value = s[offset + 12:offset + 12 + str_size - 1]
2424 elif property_type == VT_UI2: # 2-byte unsigned integer
2425 value = i16(s, offset + 4)
2426 elif property_type in (VT_I4, VT_INT, VT_ERROR):
2427 # VT_I4: 32-bit signed integer
2428 # VT_ERROR: HRESULT, similar to 32-bit signed integer,
2429 # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
2430 value = i32(s, offset + 4)
2431 elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
2432 value = i32(s, offset + 4) # FIXME
2433 elif property_type in (VT_BSTR, VT_LPSTR):
2434 # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
2435 # size is a 32 bits integer, including the null terminator, and
2436 # possibly trailing or embedded null chars
2437 # TODO: if codepage is unicode, the string should be converted as such
2438 count = i32(s, offset + 4)
2439 value = s[offset + 8:offset + 8 + count - 1]
2440 # remove all null chars:
2441 value = value.replace(b'\x00', b'')
2442 elif property_type == VT_BLOB:
2443 # binary large object (BLOB)
2444 # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
2445 count = i32(s, offset + 4)
2446 value = s[offset + 8:offset + 8 + count]
2447 elif property_type == VT_LPWSTR:
2448 # UnicodeString
2449 # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
2450 # "the string should NOT contain embedded or additional trailing
2451 # null characters."
2452 count = i32(s, offset + 4)
2453 value = self._decode_utf16_str(s[offset + 8:offset + 8 + count * 2])
2454 elif property_type == VT_FILETIME:
2455 value = long(i32(s, offset + 4)) + (long(i32(s, offset + 8)) << 32)
2456 # FILETIME is a 64-bit int: "number of 100ns periods
2457 # since Jan 1,1601".
2458 if convert_time and property_id not in no_conversion:
2459 log.debug('Converting property #%d to python datetime, value=%d=%fs'
2460 % (property_id, value, float(value) / 10000000))
2461 # convert FILETIME to Python datetime.datetime
2462 # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
2463 _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
2464 log.debug('timedelta days=%d' % (value // (10 * 1000000 * 3600 * 24)))
2465 value = _FILETIME_null_date + datetime.timedelta(microseconds=value // 10)
2466 else:
2467 # legacy code kept for backward compatibility: returns a
2468 # number of seconds since Jan 1,1601
2469 value = value // 10000000 # seconds
2470 elif property_type == VT_UI1: # 1-byte unsigned integer
2471 value = i8(s[offset + 4])
2472 elif property_type == VT_CLSID:
2473 value = _clsid(s[offset + 4:offset + 20])
2474 elif property_type == VT_CF:
2475 # PropertyIdentifier or ClipboardData??
2476 # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
2477 count = i32(s, offset + 4)
2478 value = s[offset + 8:offset + 8 + count]
2479 elif property_type == VT_BOOL:
2480 # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True
2481 # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
2482 value = bool(i16(s, offset + 4))
2483 else:
2484 value = None # everything else yields "None"
2485 log.debug(
2486 'property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
2488 # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
2489 # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
2490 # see https://msdn.microsoft.com/en-us/library/dd942033.aspx
2492 # FIXME: add support for VT_VECTOR
2493 # VT_VECTOR is a 32 uint giving the number of items, followed by
2494 # the items in sequence. The VT_VECTOR value is combined with the
2495 # type of items, e.g. VT_VECTOR|VT_BSTR
2496 # see https://msdn.microsoft.com/en-us/library/dd942011.aspx
2498 # print("%08x" % property_id, repr(value), end=" ")
2499 # print("(%s)" % VT[i32(s, offset) & 0xFFF])
2501 data[i-2]['value']=value
2502 except BaseException as exc:
2503 # catch exception while parsing each property, and only raise
2504 # a DEFECT_INCORRECT, because parsing can go on
2505 msg = 'Error while parsing property id %d in stream %s: %s' % (
2506 property_id, repr(streampath), exc)
2507 self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
2509 except BaseException as exc:
2510 # catch exception while parsing property header, and only raise
2511 # a DEFECT_INCORRECT then return an empty dict, because this is not
2512 # a fatal error when parsing the whole file
2513 msg = 'Error while parsing properties header in stream %s: %s' % (
2514 repr(streampath), exc)
2515 self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
2516 return data
2518 return data
2521 def get_document_variables(self):
2522 """
2523 Extract the document variables from Microsft Word docs
2524 :return: it returns a list of dictionaries, each of them contains var_name and value keys
2525 """
2526 # TODO: testing the code more rigorously
2527 # TODO: adding exception handeling
2528 data = []
2529 word_fp = self.openstream(['WordDocument'])
2531 # Read fcStwUser from the WordDocument stream
2532 # fcStwUser (4 bytes): An unsigned integer which is an offset in 1Table Stream that StwUser locates.
2533 # fcStwUser is the 121th field in fibRgFcLcb97 (index 120)
2534 fib_base = word_fp.read(32)
2535 nfib = i16(fib_base[2:4])
2536 if nfib == 0x00C1: # fibRgFcLcb97
2537 csw = i16(word_fp.read(2))
2538 fibRgW = word_fp.read(csw * 2)
2539 cslw = i16(word_fp.read(2))
2540 fibRgLw = word_fp.read(cslw * 4)
2541 cbRgFcLcb = i16(word_fp.read(2))
2542 fibRgFcLcbBlob = word_fp.read(cbRgFcLcb * 4)
2543 fcStwUser = i32(fibRgFcLcbBlob[120*4:121*4])
2544 lcbStwUser = i32(fibRgFcLcbBlob[121 * 4:122 * 4])
2546 if lcbStwUser > 0:
2547 # Read StwUser from 1Table stream (WordDocument.fcStwUser points to this structure)
2548 # this structure contains variable names and assigned values
2549 table_fp = self.openstream(['1Table'])
2550 table_fp.seek(fcStwUser)
2552 # SttbNames (array, contain variable names)
2553 ss = table_fp.read(6)
2555 char_size = 1
2556 if ss[:2] == b'\xff\xff':
2557 char_size = 2
2559 cdata = i16(ss[2:])
2561 cbExtra = i16(ss[4:])
2563 # SttbNames (array, contains variable names)
2564 for i in range(cdata):
2565 cchData = i16(table_fp.read(2))
2566 data_str = table_fp.read(cchData *char_size )
2567 if char_size == 2:
2568 data_str = self._decode_utf16_str(data_str)
2569 data.append({'var_name':data_str, 'value':''})
2570 extra = table_fp.read(cbExtra)
2572 # rgxchNames (array, contains values corresponding to variable names in SttbNames)
2573 for i in range(cdata):
2574 cchData = i16(table_fp.read(2))
2575 data_str = table_fp.read(cchData *char_size)
2576 if char_size == 2:
2577 data_str = self._decode_utf16_str(data_str)
2578 data[i]['value'] = data_str
2580 return data
2582# --------------------------------------------------------------------
2583# This script can be used to dump the directory of any OLE2 structured
2584# storage file.
def main():
    """
    Main function when olefile is run as a script from the command line.
    This will open an OLE2 file and display its structure and properties
    :return: nothing
    """
    import sys, optparse

    DEFAULT_LOG_LEVEL = "warning"  # Default log level
    # map of the -l/--loglevel option values to logging constants:
    LOG_LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)

    parser.add_option("-c", action="store_true", dest="check_streams",
                      help='check all streams (for debugging purposes)')
    parser.add_option("-v", action="store_true", dest="extract_customvar",
                      help='extract all document variables')
    parser.add_option("-p", action="store_true", dest="extract_customprop",
                      help='extract all user-defined properties')
    parser.add_option("-d", action="store_true", dest="debug_mode",
                      help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    print('olefile version {} {} - https://www.decalage.info/en/olefile\n'.format(__version__, __date__))

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    if options.debug_mode:
        options.loglevel = 'debug'
    # report a bad -l value with a clean usage error instead of a KeyError:
    if options.loglevel not in LOG_LEVELS:
        parser.error('Invalid log level: %r' % options.loglevel)

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')

    # also enable the module's logger:
    enable_logging()

    for filename in args:
        try:
            ole = OleFileIO(filename)  # , raise_defects=DEFECT_INCORRECT)
            print("-" * 68)
            print(filename)
            print("-" * 68)
            ole.dumpdirectory()
            for streamname in ole.listdir():
                # stream names starting with \x05 contain property sets:
                if streamname[-1][0] == "\005":
                    print("%r: properties" % streamname)
                    try:
                        props = ole.getproperties(streamname, convert_time=True)
                        props = sorted(props.items())
                        for k, v in props:
                            # [PL]: avoid to display too large or binary values:
                            if isinstance(v, (basestring, bytes)):
                                if len(v) > 50:
                                    v = v[:50]
                            if isinstance(v, bytes):
                                # quick and dirty binary check:
                                for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
                                          21,22,23,24,25,26,27,28,29,30,31):
                                    if c in bytearray(v):
                                        v = '(binary data)'
                                        break
                            print(" ", k, v)
                    except Exception:
                        log.exception('Error while parsing property stream %r' % streamname)

                    try:
                        if options.extract_customprop:
                            variables = ole.get_userdefined_properties(streamname, convert_time=True)
                            if len(variables):
                                print("%r: user-defined properties" % streamname)
                                for index, variable in enumerate(variables):
                                    print('\t{} {}: {}'.format(index, variable['property_name'],variable['value']))
                    # catch only real errors: a bare except here would also
                    # swallow KeyboardInterrupt/SystemExit
                    except Exception:
                        log.exception('Error while parsing user-defined property stream %r' % streamname)
                elif options.extract_customvar and streamname[-1]=="WordDocument":
                    print("%r: document variables" % streamname)
                    variables = ole.get_document_variables()

                    for index, var in enumerate(variables):
                        print('\t{} {}: {}'.format(index, var['var_name'], var['value'][:50]))
                    print("")

            if options.check_streams:
                # Read all streams to check if there are errors:
                print('\nChecking streams...')
                for streamname in ole.listdir():
                    # print name using repr() to convert binary chars to \xNN:
                    print('-', repr('/'.join(streamname)),'-', end=' ')
                    st_type = ole.get_type(streamname)
                    if st_type == STGTY_STREAM:
                        print('size %d' % ole.get_size(streamname))
                        # just try to read stream in memory:
                        ole.openstream(streamname)
                    else:
                        print('NOT a stream : type=%d' % st_type)
                print()

            print('Modification/Creation times of all directory entries:')
            for entry in ole.direntries:
                if entry is not None:
                    print('- {}: mtime={} ctime={}'.format(entry.name,
                                                           entry.getmtime(), entry.getctime()))
            print()

            # parse and display metadata:
            try:
                meta = ole.get_metadata()
                meta.dump()
            except Exception:
                log.exception('Error while parsing metadata')
            print()
            # [PL] Test a few new methods:
            root = ole.get_rootentry_name()
            print('Root entry name: "%s"' % root)
            if ole.exists('worddocument'):
                print("This is a Word document.")
                print("type of stream 'WordDocument':", ole.get_type('worddocument'))
                print("size :", ole.get_size('worddocument'))
                if ole.exists('macros/vba'):
                    print("This document may contain VBA macros.")

            # print parsing issues:
            print('\nNon-fatal issues raised during parsing:')
            if ole.parsing_issues:
                for exctype, msg in ole.parsing_issues:
                    print('- {}: {}'.format(exctype.__name__, msg))
            else:
                print('None')
            ole.close()
        except Exception:
            log.exception('Error while parsing file %r' % filename)
# Run the command-line interface only when this file is executed as a
# script (e.g. "python olefile.py <file>"), not when imported as a module.
if __name__ == "__main__":
    main()
2744# this code was developed while listening to The Wedding Present "Sea Monsters"