Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/olefile/olefile.py: 44%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2olefile (formerly OleFileIO_PL)
4Module to read/write Microsoft OLE2 files (also called Structured Storage or
5Microsoft Compound Document File Format), such as Microsoft Office 97-2003
6documents, Image Composer and FlashPix files, Outlook messages, ...
7This version is compatible with Python 2.7 and 3.5+
9Project website: https://www.decalage.info/olefile
11olefile is copyright (c) 2005-2023 Philippe Lagadec
12(https://www.decalage.info)
14olefile is based on the OleFileIO module from the PIL library v1.1.7
15See: http://www.pythonware.com/products/pil/index.htm
16and http://svn.effbot.org/public/tags/pil-1.1.7/PIL/OleFileIO.py
18The Python Imaging Library (PIL) is
19Copyright (c) 1997-2009 by Secret Labs AB
20Copyright (c) 1995-2009 by Fredrik Lundh
22See source code and LICENSE.txt for information on usage and redistribution.
23"""
25# Since olefile v0.47, only Python 2.7 and 3.5+ are supported
26# This import enables print() as a function rather than a keyword
27# (main requirement to be compatible with Python 3.x)
28# The comment on the line below should be printed on Python 2.5 or older:
29from __future__ import print_function # This version of olefile requires Python 2.7 or 3.5+.
32#--- LICENSE ------------------------------------------------------------------
34# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2023 Philippe Lagadec
35# (https://www.decalage.info)
36#
37# All rights reserved.
38#
39# Redistribution and use in source and binary forms, with or without modification,
40# are permitted provided that the following conditions are met:
41#
42# * Redistributions of source code must retain the above copyright notice, this
43# list of conditions and the following disclaimer.
44# * Redistributions in binary form must reproduce the above copyright notice,
45# this list of conditions and the following disclaimer in the documentation
46# and/or other materials provided with the distribution.
47#
48# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
49# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
50# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
51# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
52# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
54# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
55# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
56# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
57# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59# ----------
60# PIL License:
61#
62# olefile is based on source code from the OleFileIO module of the Python
63# Imaging Library (PIL) published by Fredrik Lundh under the following license:
65# The Python Imaging Library (PIL) is
66# Copyright (c) 1997-2009 by Secret Labs AB
67# Copyright (c) 1995-2009 by Fredrik Lundh
68#
69# By obtaining, using, and/or copying this software and/or its associated
70# documentation, you agree that you have read, understood, and will comply with
71# the following terms and conditions:
72#
73# Permission to use, copy, modify, and distribute this software and its
74# associated documentation for any purpose and without fee is hereby granted,
75# provided that the above copyright notice appears in all copies, and that both
76# that copyright notice and this permission notice appear in supporting
77# documentation, and that the name of Secret Labs AB or the author(s) not be used
78# in advertising or publicity pertaining to distribution of the software
79# without specific, written prior permission.
80#
81# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
82# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
83# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL,
84# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
85# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
86# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
87# PERFORMANCE OF THIS SOFTWARE.
# Module metadata: release date, version and author of olefile
__date__ = "2023-12-01"
__version__ = '0.47'
__author__ = "Philippe Lagadec"

# Public API of this module: names exported by "from olefile import *"
__all__ = ['isOleFile', 'OleFileIO', 'OleMetadata', 'enable_logging',
           'MAGIC', 'STGTY_EMPTY',
           'STGTY_STREAM', 'STGTY_STORAGE', 'STGTY_ROOT', 'STGTY_PROPERTY',
           'STGTY_LOCKBYTES', 'MINIMAL_OLEFILE_SIZE',
           'DEFECT_UNSURE', 'DEFECT_POTENTIAL', 'DEFECT_INCORRECT',
           'DEFECT_FATAL', 'DEFAULT_PATH_ENCODING',
           'MAXREGSECT', 'DIFSECT', 'FATSECT', 'ENDOFCHAIN', 'FREESECT',
           'MAXREGSID', 'NOSTREAM', 'UNKNOWN_SIZE', 'WORD_CLSID',
           'OleFileIONotClosed'
]
104import io
105import sys
106import struct, array, os.path, datetime, logging, warnings, traceback
#=== COMPATIBILITY WORKAROUNDS ================================================

# For Python 3.x, need to redefine long as int:
# (on Python 2, str and bytes are the same type, so this condition is only
# true on Python 3, where the long type no longer exists)
if str is not bytes:
    long = int

# Need to make sure we use xrange both on Python 2 and 3.x:
try:
    # on Python 2 we need xrange:
    iterrange = xrange
except Exception:
    # no xrange, for Python 3 it was renamed as range:
    iterrange = range

# [PL] workaround to fix an issue with array item size on 64 bits systems:
# probe the array typecodes to find one that stores exactly 4-byte values,
# so that FAT entries (32-bit sector indexes) are stored correctly:
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
elif array.array('i').itemsize == 4:
    # On 64 bit Jython, signed integers ('i') are the only way to store our 32
    # bit values in an array in a *somewhat* reasonable way, as the otherwise
    # perfectly suited 'H' (unsigned int, 32 bits) results in a completely
    # unusable behaviour. This is most likely caused by the fact that Java
    # doesn't have unsigned values, and thus Jython's "array" implementation,
    # which is based on "jarray", doesn't have them either.
    # NOTE: to trick Jython into converting the values it would normally
    # interpret as "signed" into "unsigned", a binary-and operation with
    # 0xFFFFFFFF can be used. This way it is possible to use the same comparing
    # operations on all platforms / implementations. The corresponding code
    # lines are flagged with a 'JYTHON-WORKAROUND' tag below.
    UINT32 = 'i'
else:
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')


# [PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
# TODO: remove the use of basestring, as it was removed in Python 3
try:
    basestring
except NameError:
    basestring = str

if sys.version_info[0] < 3:
    # On Python 2.x, the default encoding for path names is UTF-8:
    DEFAULT_PATH_ENCODING = 'utf-8'
else:
    # On Python 3.x, the default encoding for path names is Unicode (None):
    DEFAULT_PATH_ENCODING = None
162# === LOGGING =================================================================
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger object suitable for this module.

    The goal is not to change settings of the root logger, to avoid getting
    other modules' logs on the screen. A logger with the given name is
    reused when it already exists (creating it twice would duplicate
    handlers and therefore messages); otherwise a new one is created with a
    single NullHandler, leaving the actual logging configuration up to the
    application. In both cases the logger level is (re)set to the requested
    value, which defaults to CRITICAL+1 so that nothing is logged at all.

    :param name: str, name of the logger (usually the module name)
    :param level: int, logging level (default CRITICAL+1 to disable logging)
    :returns: logging.Logger object
    """
    # Check for a pre-existing logger BEFORE calling getLogger, because
    # getLogger itself registers the name in loggerDict:
    already_registered = name in logging.Logger.manager.loggerDict
    # NOTE: another less intrusive but more "hackish" way to detect reuse
    # would be to call getLogger then test its effective level.
    logger = logging.getLogger(name)
    if not already_registered:
        # fresh logger: only attach a NullHandler, it is up to the
        # application to configure its own logging:
        logger.addHandler(logging.NullHandler())
    # in all cases, make sure the level is the requested one:
    logger.setLevel(level)
    return logger
# a global logger object used for debugging:
# (silent by default, since get_logger sets its level to CRITICAL+1;
# call enable_logging() to let the application control the level)
log = get_logger('olefile')
def enable_logging():
    """
    Enable logging for this module (disabled by default).

    This sets the module-specific logger level to NOTSET, which means
    the main application controls the actual logging level (the level
    is then inherited from the logging configuration of the application).
    """
    log.setLevel(logging.NOTSET)
#=== CONSTANTS ===============================================================

#: magic bytes that should be at the beginning of every OLE file:
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'

# [PL]: added constants for Sector IDs (from AAF specifications)
# (special values stored in the FAT instead of a real sector index)
MAXREGSECT = 0xFFFFFFFA  #: (-6) maximum SECT
DIFSECT    = 0xFFFFFFFC  #: (-4) denotes a DIFAT sector in a FAT
FATSECT    = 0xFFFFFFFD  #: (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE  #: (-2) end of a virtual stream chain
FREESECT   = 0xFFFFFFFF  #: (-1) unallocated sector

# [PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID = 0xFFFFFFFA  #: (-6) maximum directory entry ID
NOSTREAM  = 0xFFFFFFFF  #: (-1) unallocated directory entry

# [PL] object types in storage (from AAF specifications)
STGTY_EMPTY     = 0  #: empty directory entry
STGTY_STORAGE   = 1  #: element is a storage object
STGTY_STREAM    = 2  #: element is a stream object
STGTY_LOCKBYTES = 3  #: element is an ILockBytes object
STGTY_PROPERTY  = 4  #: element is an IPropertyStorage object
STGTY_ROOT      = 5  #: element is a root storage

# Unknown size for a stream (used by OleStream):
UNKNOWN_SIZE = 0x7FFFFFFF

#
# --------------------------------------------------------------------
# property types

VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6;
VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11;
VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17;
VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23;
VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28;
VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64;
VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68;
VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72;
VT_VECTOR=0x1000;

# map property id to name (for debugging purposes)
# (built by scanning the module namespace for the VT_* constants above)
VT = {}
for keyword, var in list(vars().items()):
    if keyword[:3] == "VT_":
        VT[var] = keyword

#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
# TODO: check Excel, PPT, ...

# [PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE    = 10  # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20  # a potential defect
DEFECT_INCORRECT = 30  # an error according to specifications, but parsing
                       # can go on
DEFECT_FATAL     = 40  # an error which cannot be ignored, parsing is
                       # impossible

# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFileIO.open)
MINIMAL_OLEFILE_SIZE = 1536
270#=== FUNCTIONS ===============================================================
def isOleFile (filename=None, data=None):
    """
    Test if a file is an OLE container (according to the magic bytes in its header).

    .. note::
        This function only checks the first 8 bytes of the file, not the
        rest of the OLE structure.
        If data is provided, it also checks if the file size is above
        the minimal size of an OLE file (1536 bytes).
        If filename is provided with the path of the file on disk, the file is
        open only to read the first 8 bytes, then closed.

    .. versionadded:: 0.16

    :param filename: filename, contents or file-like object of the OLE file (string-like or file-like object)

        - if data is provided, filename is ignored.
        - if filename is a unicode string, it is used as path of the file to open on disk.
        - if filename is a bytes string smaller than 1536 bytes, it is used as path
          of the file to open on disk.
        - [deprecated] if filename is a bytes string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
          Note that this use case is deprecated and should be replaced by the new data parameter
        - if filename is a file-like object (with read and seek methods),
          it is parsed as-is.
    :type filename: bytes, str, unicode or file-like object

    :param data: bytes string with the contents of the file to be checked, when the file is in memory
                 (added in olefile 0.47)
    :type data: bytes

    :returns: True if OLE, False otherwise.
    :rtype: bool
    """
    header = None
    # first check if data is provided and large enough
    if data is not None:
        if len(data) >= MINIMAL_OLEFILE_SIZE:
            header = data[:len(MAGIC)]
        else:
            # the file is too small, cannot be OLE
            return False
    # check if filename is a string-like or file-like object:
    elif hasattr(filename, 'read') and hasattr(filename, 'seek'):
        # file-like object: use it directly
        header = filename.read(len(MAGIC))
        # just in case, seek back to start of file:
        filename.seek(0)
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # filename is a bytes string containing the OLE file to be parsed:
        # (deprecated use case, kept for backward compatibility)
        header = filename[:len(MAGIC)]
    else:
        # string-like object: filename of file on disk
        # (the file is opened only to read the 8-byte header, then closed)
        with open(filename, 'rb') as fp:
            header = fp.read(len(MAGIC))
    # the file is OLE if and only if the header matches the magic bytes:
    # (simplified from the original if/else returning True/False)
    return header == MAGIC
if bytes is str:
    # version for Python 2.x: indexing a bytes (=str) object yields a
    # 1-character string, which must be converted with ord():
    def i8(c):
        """Return the byte c (1-char str) as an unsigned 8-bit integer."""
        return ord(c)
else:
    # version for Python 3.x: indexing a bytes object already yields an int,
    # so only a 1-byte bytes string needs conversion:
    def i8(c):
        """Return c as an unsigned 8-bit integer (int passthrough, else c[0])."""
        if c.__class__ is int:
            return c
        return c[0]
def i16(c, o = 0):
    """
    Decode a 16-bit unsigned integer stored as 2 little-endian bytes.

    :param c: bytes string containing the bytes to convert
    :param o: int, offset of the 2 bytes to convert within c (default 0)
    :returns: int, decoded unsigned 16-bit value
    """
    (value,) = struct.unpack("<H", c[o:o+2])
    return value
def i32(c, o = 0):
    """
    Decode a 32-bit unsigned integer stored as 4 little-endian bytes.

    :param c: bytes string containing the bytes to convert
    :param o: int, offset of the 4 bytes to convert within c (default 0)
    :returns: int, decoded unsigned 32-bit value
    """
    (value,) = struct.unpack("<I", c[o:o+4])
    return value
def _clsid(clsid):
    """
    Convert a 16-byte binary CLSID into its human-readable string form,
    e.g. "00020900-0000-0000-C000-000000000046".

    :param clsid: bytes string of length 16 (binary CLSID)
    :returns: str, formatted CLSID, or an empty string when clsid is all zeroes
    """
    assert len(clsid) == 16
    # a CLSID made only of null bytes is displayed as an empty string:
    # (PL: why not simply return the string with zeroes?)
    if not clsid.strip(b"\0"):
        return ""
    # the first three fields are little-endian (uint32, uint16, uint16),
    # the remaining 8 bytes are shown in storage order:
    fields = struct.unpack('<IHH8B', clsid)
    return '%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X' % fields
def filetime2datetime(filetime):
    """
    Convert a FILETIME value (64-bit int, number of 100ns units since
    1601-01-01) to a Python datetime.datetime object (naive, no timezone).

    :param filetime: int, FILETIME timestamp
    :returns: datetime.datetime
    """
    # TODO: manage exception when microseconds is too large
    # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
    # FILETIME counts 100ns units, i.e. 1/10 of a microsecond:
    epoch_1601 = datetime.datetime(1601, 1, 1, 0, 0, 0)
    # log.debug('timedelta days=%d' % (filetime//(10*1000000*3600*24)))
    return epoch_1601 + datetime.timedelta(microseconds=filetime // 10)
392#=== CLASSES ==================================================================
class OleFileError(IOError):
    """
    Generic base error for this module.

    (inherits from IOError so existing callers catching IOError still work)
    """
    pass
class NotOleFileError(OleFileError):
    """
    Error raised when the opened file is not an OLE file.
    """
    pass
class OleMetadata:
    """
    Class to parse and store metadata from standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd942545.aspx
    - https://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - https://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/articles/summary-information-stream/
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    New in version 0.25
    """

    # attribute names for SummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
                       'template', 'last_saved_by', 'revision_number', 'total_edit_time',
                       'last_printed', 'create_time', 'last_saved_time', 'num_pages',
                       'num_words', 'num_chars', 'thumbnail', 'creating_application',
                       'security']

    # attribute names for DocumentSummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
                      'slides', 'notes', 'hidden_slides', 'mm_clips',
                      'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
                      'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
                      'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
                      'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Constructor for OleMetadata.
        All attributes are set to None by default.
        """
        # Set every known property attribute to None, driven by the two
        # attribute lists above. This replaces 47 repeated manual
        # assignments and guarantees __init__ stays consistent with
        # SUMMARY_ATTRIBS/DOCSUM_ATTRIBS and parse_properties.
        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
            setattr(self, attrib, None)

    def parse_properties(self, ole_file):
        """
        Parse standard properties of an OLE file, from the streams
        ``\\x05SummaryInformation`` and ``\\x05DocumentSummaryInformation``,
        if present.
        Properties are converted to strings, integers or python datetime objects.
        If a property is not present, its value is set to None.

        :param ole_file: OleFileIO object from which to parse properties
        """
        # first set all attributes to None:
        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
            setattr(self, attrib, None)
        if ole_file.exists("\x05SummaryInformation"):
            # get properties from the stream:
            # (converting timestamps to python datetime, except total_edit_time,
            # which is property #10)
            props = ole_file.getproperties("\x05SummaryInformation",
                                           convert_time=True, no_conversion=[10])
            # store them into this object's attributes:
            for i in range(len(self.SUMMARY_ATTRIBS)):
                # ids for standard properties start at 0x01:
                # SUMMARY_ATTRIBS[i] corresponds to property id i+1
                value = props.get(i+1, None)
                setattr(self, self.SUMMARY_ATTRIBS[i], value)
        if ole_file.exists("\x05DocumentSummaryInformation"):
            # get properties from the stream:
            props = ole_file.getproperties("\x05DocumentSummaryInformation",
                                           convert_time=True)
            # store them into this object's attributes:
            for i in range(len(self.DOCSUM_ATTRIBS)):
                # DOCSUM_ATTRIBS[i] corresponds to property id i+1
                value = props.get(i+1, None)
                setattr(self, self.DOCSUM_ATTRIBS[i], value)

    def dump(self):
        """
        Dump all metadata, for debugging purposes.
        """
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            value = getattr(self, prop)
            print('- {}: {}'.format(prop, repr(value)))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            value = getattr(self, prop)
            print('- {}: {}'.format(prop, repr(value)))
class OleFileIONotClosed(RuntimeWarning):
    """
    Warning type used when OleFileIO is destructed but has open file handle.
    """
    def __init__(self, stack_of_open=None):
        """
        :param stack_of_open: optional stack summary of the open() call
            (e.g. a traceback.StackSummary), used to locate the leak
        """
        super(OleFileIONotClosed, self).__init__()
        self.stack_of_open = stack_of_open

    def __str__(self):
        # base warning message, identical whether or not a stack is known:
        msg = ('Deleting OleFileIO instance with open file handle. '
               'You should ensure that OleFileIO is never deleted '
               'without calling close() first. Consider using '
               '"with OleFileIO(...) as ole: ...".')
        if not self.stack_of_open:
            return msg
        # when available, append the stacktrace of the open() call:
        parts = [msg, '\n', 'Stacktrace of open() call:\n']
        parts.extend(self.stack_of_open.format())
        return ''.join(parts)
# --- OleStream ---------------------------------------------------------------

class OleStream(io.BytesIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the BytesIO class).
    To open a stream, use the openstream method in the OleFileIO class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:

        - size: actual size of data stream, after it was opened.
    """
    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize, olefileio):
        """
        Constructor for OleStream class.

        Reads the whole stream into memory at once, by following the chain
        of sector indexes in the given FAT starting from sect, then passes
        the collected bytes to the BytesIO constructor.

        :param fp: file object, the OLE container or the MiniFAT stream
        :param sect: sector index of first sector in the stream
        :param size: total size of the stream (may be UNKNOWN_SIZE)
        :param offset: offset in bytes for the first FAT or MiniFAT sector
        :param sectorsize: size of one sector
        :param fat: array/list of sector indexes (FAT or MiniFAT)
        :param filesize: size of OLE file (for debugging)
        :param olefileio: OleFileIO object containing this stream
        :returns: a BytesIO instance containing the OLE stream
        :raises OSError: if the OleFileIO's file object is already closed
        """
        log.debug('OleStream.__init__:')
        log.debug('  sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        self.ole = olefileio
        # this check is necessary, otherwise when attempting to open a stream
        # from a closed OleFileIO, a stream of size zero is returned without
        # raising an exception. (see issue #81)
        if self.ole.fp.closed:
            raise OSError('Attempting to open a stream from a closed OLE File')
        # [PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size == UNKNOWN_SIZE:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            log.debug('  stream with UNKNOWN SIZE')
        # number of sectors = size rounded up to a whole number of sectors:
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            self.ole._raise_defect(DEFECT_INCORRECT, 'malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            log.debug('size == 0 and sect != ENDOFCHAIN:')
            self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE sector index for empty stream')
        # [PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks (a FAT loop would otherwise never end):
        for i in range(nb_sectors):
            log.debug('Reading stream sector[%d] = %Xh' % (i, sect))
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    log.debug('Reached ENDOFCHAIN sector for stream with unknown size')
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    log.debug('sect=ENDOFCHAIN before expected size')
                    self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE stream')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
##                tmp_data = b"".join(data)
##                f = open('test_debug.bin', 'wb')
##                f.write(tmp_data)
##                f.close()
##                log.debug('data read so far: %d bytes' % len(tmp_data))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
            # TODO: merge this code with OleFileIO.getsect() ?
            # TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            except Exception:
                log.debug('sect=%d, seek=%d, filesize=%d' %
                          (sect, offset+sectorsize*sect, filesize))
                self.ole._raise_defect(DEFECT_INCORRECT, 'OLE sector index out of range')
                # stop reading here if the exception is ignored:
                break
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                # mask to 32 bits so signed array values compare correctly:
                sect = fat[sect] & 0xFFFFFFFF  # JYTHON-WORKAROUND
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
        # [PL] Last sector should be a "end of chain" marker:
        # if sect != ENDOFCHAIN:
        #     raise IOError('incorrect last sector index in OLE stream')
        data = b"".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            log.debug('Read data of length %d, truncated to stream size %d' % (len(data), size))
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            log.debug('Read data of length %d, the stream size was unknown' % len(data))
            self.size = len(data)
        else:
            # read data is less than expected:
            log.debug('Read data of length %d, less than expected stream size %d' % (len(data), size))
            # TODO: provide details in exception message
            self.size = len(data)
            self.ole._raise_defect(DEFECT_INCORRECT, 'OLE stream size is less than declared')
        # when all data is read in memory, BytesIO constructor is called
        io.BytesIO.__init__(self, data)
        # Then the OleStream object can be used as a read-only file object.
730# --- OleDirectoryEntry -------------------------------------------------------
732class OleDirectoryEntry:
733 """
734 OLE2 Directory Entry pointing to a stream or a storage
735 """
736 # struct to parse directory entries:
737 # <: little-endian byte order, standard sizes
738 # (note: this should guarantee that Q returns a 64 bits int)
739 # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
740 # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
741 # B: uint8, dir entry type (between 0 and 5)
742 # B: uint8, color: 0=black, 1=red
743 # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
744 # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
745 # I: uint32, index of child root node if it is a storage, else NOSTREAM
746 # 16s: CLSID, unique identifier (only used if it is a storage)
747 # I: uint32, user flags
748 # Q (was 8s): uint64, creation timestamp or zero
749 # Q (was 8s): uint64, modification timestamp or zero
750 # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
751 # of stream containing ministreams if root entry, 0 otherwise
752 # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
753 # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
754 STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII'
755 # size of a directory entry: 128 bytes
756 DIRENTRY_SIZE = 128
757 assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE
    def __init__(self, entry, sid, ole_file):
        """
        Constructor for an OleDirectoryEntry object.
        Parses a 128-bytes entry from the OLE Directory stream.

        Defects found during parsing are reported through
        ole_file._raise_defect, which may or may not raise depending on the
        minimal defect level chosen by the caller.

        :param bytes entry: bytes string (must be 128 bytes long)
        :param int sid: index of this directory entry in the OLE file directory
        :param OleFileIO ole_file: OleFileIO object containing this directory entry
        """
        self.sid = sid
        # ref to ole_file is stored for future use
        self.olefile = ole_file
        # kids is a list of children entries, if this entry is a storage:
        # (list of OleDirectoryEntry objects)
        self.kids = []
        # kids_dict is a dictionary of children entries, indexed by their
        # name in lowercase: used to quickly find an entry, and to detect
        # duplicates
        self.kids_dict = {}
        # flag used to detect if the entry is referenced more than once in
        # directory:
        self.used = False
        # decode DirEntry
        (
            self.name_raw,      # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
            self.namelength,    # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
            self.entry_type,
            self.color,
            self.sid_left,
            self.sid_right,
            self.sid_child,
            clsid,
            self.dwUserFlags,
            self.createTime,
            self.modifyTime,
            self.isectStart,
            self.sizeLow,
            self.sizeHigh
        ) = struct.unpack(OleDirectoryEntry.STRUCT_DIRENTRY, entry)
        if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
            ole_file._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
        # only first directory entry can (and should) be root:
        if self.entry_type == STGTY_ROOT and sid != 0:
            ole_file._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
        if sid == 0 and self.entry_type != STGTY_ROOT:
            ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
        # log.debug(struct.unpack(fmt_entry, entry[:len_entry]))
        # name should be at most 31 unicode characters + null character,
        # so 64 bytes in total (31*2 + 2):
        if self.namelength > 64:
            ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes')
            # if exception not raised, namelength is clamped to the maximum value:
            self.namelength = 64
        # only characters without ending null char are kept:
        self.name_utf16 = self.name_raw[:(self.namelength-2)]
        # TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
        # TODO: check if the name does not contain forbidden characters:
        # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
        # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
        self.name = ole_file._decode_utf16_str(self.name_utf16)

        log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
        log.debug(' - type: %d' % self.entry_type)
        log.debug(' - sect: %Xh' % self.isectStart)
        log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
            self.sid_right, self.sid_child))

        # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
        # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
        # or some other value so it cannot be raised as a defect in general:
        if ole_file.sectorsize == 512:
            if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF:
                log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                    (ole_file.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh))
                ole_file._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
            self.size = self.sizeLow
        else:
            # 4K sectors: combine low and high 32-bit halves into a 64-bit size
            self.size = self.sizeLow + (long(self.sizeHigh)<<32)
        log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh))

        self.clsid = _clsid(clsid)
        # a storage should have a null size, BUT some implementations such as
        # Word 8 for Mac seem to allow non-null values => Potential defect:
        if self.entry_type == STGTY_STORAGE and self.size != 0:
            ole_file._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
        # check if stream is not already referenced elsewhere:
        self.is_minifat = False
        if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
            if self.size < ole_file.minisectorcutoff \
            and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
                # ministream object
                self.is_minifat = True
            else:
                self.is_minifat = False
            ole_file._check_duplicate_stream(self.isectStart, self.is_minifat)
        # sector chain is built lazily by build_sect_chain():
        self.sect_chain = None
856 def build_sect_chain(self, ole_file):
857 """
858 Build the sector chain for a stream (from the FAT or the MiniFAT)
860 :param OleFileIO ole_file: OleFileIO object containing this directory entry
861 :return: nothing
862 """
863 # TODO: seems to be used only from _write_mini_stream, is it useful?
864 # TODO: use self.olefile instead of ole_file
865 if self.sect_chain:
866 return
867 if self.entry_type not in (STGTY_ROOT, STGTY_STREAM) or self.size == 0:
868 return
870 self.sect_chain = list()
872 if self.is_minifat and not ole_file.minifat:
873 ole_file.loadminifat()
875 next_sect = self.isectStart
876 while next_sect != ENDOFCHAIN:
877 self.sect_chain.append(next_sect)
878 if self.is_minifat:
879 next_sect = ole_file.minifat[next_sect]
880 else:
881 next_sect = ole_file.fat[next_sect]
883 def build_storage_tree(self):
884 """
885 Read and build the red-black tree attached to this OleDirectoryEntry
886 object, if it is a storage.
887 Note that this method builds a tree of all subentries, so it should
888 only be called for the root object once.
889 """
890 log.debug('build_storage_tree: SID=%d - %s - sid_child=%d'
891 % (self.sid, repr(self.name), self.sid_child))
892 if self.sid_child != NOSTREAM:
893 # if child SID is not NOSTREAM, then this entry is a storage.
894 # Let's walk through the tree of children to fill the kids list:
895 self.append_kids(self.sid_child)
897 # Note from OpenOffice documentation: the safest way is to
898 # recreate the tree because some implementations may store broken
899 # red-black trees...
901 # in the OLE file, entries are sorted on (length, name).
902 # for convenience, we sort them on name instead:
903 # (see rich comparison methods in this class)
904 self.kids.sort()
    def append_kids(self, child_sid):
        """
        Walk through red-black tree of children of this directory entry to add
        all of them to the kids list. (recursive method)

        Performs an in-order traversal (left subtree, node, right subtree) so
        kids are visited in tree order; duplicate references and duplicate
        names are reported as defects.

        :param child_sid: index of child directory entry to use, or None when called
            first time for the root. (only used during recursion)
        """
        log.debug('append_kids: child_sid=%d' % child_sid)
        # [PL] this method was added to use simple recursion instead of a complex
        # algorithm.
        # if this is not a storage or a leaf of the tree, nothing to do:
        if child_sid == NOSTREAM:
            return
        # check if child SID is in the proper range:
        if child_sid<0 or child_sid>=len(self.olefile.direntries):
            self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE DirEntry index out of range')
        else:
            # get child direntry:
            child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
            log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
                % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
            # Check if kid was not already referenced in a storage:
            if child.used:
                self.olefile._raise_defect(DEFECT_INCORRECT,
                    'OLE Entry referenced more than once')
                # if defect was not raised, stop here to avoid infinite recursion
                return
            child.used = True
            # the directory entries are organized as a red-black tree.
            # (cf. Wikipedia for details)
            # First walk through left side of the tree:
            self.append_kids(child.sid_left)
            # Check if its name is not already used (case-insensitive):
            name_lower = child.name.lower()
            if name_lower in self.kids_dict:
                self.olefile._raise_defect(DEFECT_INCORRECT,
                    "Duplicate filename in OLE storage")
            # Then the child_sid OleDirectoryEntry object is appended to the
            # kids list and dictionary:
            self.kids.append(child)
            self.kids_dict[name_lower] = child
            # Finally walk through right side of the tree:
            self.append_kids(child.sid_right)
            # Afterwards build kid's own tree if it's also a storage:
            child.build_storage_tree()
952 def __eq__(self, other):
953 "Compare entries by name"
954 return self.name == other.name
956 def __lt__(self, other):
957 "Compare entries by name"
958 return self.name < other.name
960 def __ne__(self, other):
961 return not self.__eq__(other)
963 def __le__(self, other):
964 return self.__eq__(other) or self.__lt__(other)
966 # Reflected __lt__() and __le__() will be used for __gt__() and __ge__()
968 # TODO: replace by the same function as MS implementation ?
969 # (order by name length first, then case-insensitive order)
971 def dump(self, tab = 0):
972 "Dump this entry, and all its subentries (for debug purposes only)"
973 TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
974 "(property)", "(root)"]
975 try:
976 type_name = TYPES[self.entry_type]
977 except IndexError:
978 type_name = '(UNKNOWN)'
979 print(" "*tab + repr(self.name), type_name, end=' ')
980 if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
981 print(self.size, "bytes", end=' ')
982 print()
983 if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
984 print(" "*tab + "{%s}" % self.clsid)
986 for kid in self.kids:
987 kid.dump(tab + 2)
989 def getmtime(self):
990 """
991 Return modification time of a directory entry.
993 :returns: None if modification time is null, a python datetime object
994 otherwise (UTC timezone)
996 new in version 0.26
997 """
998 if self.modifyTime == 0:
999 return None
1000 return filetime2datetime(self.modifyTime)
1003 def getctime(self):
1004 """
1005 Return creation time of a directory entry.
1007 :returns: None if modification time is null, a python datetime object
1008 otherwise (UTC timezone)
1010 new in version 0.26
1011 """
1012 if self.createTime == 0:
1013 return None
1014 return filetime2datetime(self.createTime)
1017#--- OleFileIO ----------------------------------------------------------------
class OleFileIO:
    """
    OLE container object

    This class encapsulates the interface to an OLE 2 structured
    storage file. Use the listdir and openstream methods to
    access the contents of this file.

    Object names are given as a list of strings, one for each subentry
    level. The root entry should be omitted. For example, the following
    code extracts all image streams from a Microsoft Image Composer file::

        with OleFileIO("fan.mic") as ole:

            for entry in ole.listdir():
                if entry[1:2] == ["Image"]:
                    fin = ole.openstream(entry)
                    fout = open(entry[0], "wb")
                    while True:
                        s = fin.read(8192)
                        if not s:
                            break
                        fout.write(s)

    You can use the viewer application provided with the Python Imaging
    Library to view the resulting files (which happens to be standard
    TIFF files).
    """
1048 def __init__(self, filename=None, raise_defects=DEFECT_FATAL,
1049 write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING):
1050 """
1051 Constructor for the OleFileIO class.
1053 :param filename: file to open.
1055 - if filename is a string smaller than 1536 bytes, it is the path
1056 of the file to open. (bytes or unicode string)
1057 - if filename is a string longer than 1535 bytes, it is parsed
1058 as the content of an OLE file in memory. (bytes type only)
1059 - if filename is a file-like object (with read, seek and tell methods),
1060 it is parsed as-is. The caller is responsible for closing it when done.
1062 :param raise_defects: minimal level for defects to be raised as exceptions.
1063 (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
1064 security-oriented application, see source code for details)
1066 :param write_mode: bool, if True the file is opened in read/write mode instead
1067 of read-only by default.
1069 :param debug: bool, set debug mode (deprecated, not used anymore)
1071 :param path_encoding: None or str, name of the codec to use for path
1072 names (streams and storages), or None for Unicode.
1073 Unicode by default on Python 3+, UTF-8 on Python 2.x.
1074 (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
1075 """
1076 # minimal level for defects to be raised as exceptions:
1077 self._raise_defects_level = raise_defects
1078 #: list of defects/issues not raised as exceptions:
1079 #: tuples of (exception type, message)
1080 self.parsing_issues = []
1081 self.write_mode = write_mode
1082 self.path_encoding = path_encoding
1083 # initialize all attributes to default values:
1084 self._filesize = None
1085 self.ministream = None
1086 self._used_streams_fat = []
1087 self._used_streams_minifat = []
1088 self.byte_order = None
1089 self.directory_fp = None
1090 self.direntries = None
1091 self.dll_version = None
1092 self.fat = None
1093 self.first_difat_sector = None
1094 self.first_dir_sector = None
1095 self.first_mini_fat_sector = None
1096 self.fp = None
1097 self.header_clsid = None
1098 self.header_signature = None
1099 self.metadata = None
1100 self.mini_sector_shift = None
1101 self.mini_sector_size = None
1102 self.mini_stream_cutoff_size = None
1103 self.minifat = None
1104 self.minifatsect = None
1105 # TODO: duplicates?
1106 self.minisectorcutoff = None
1107 self.minisectorsize = None
1108 self.ministream = None
1109 self.minor_version = None
1110 self.nb_sect = None
1111 self.num_difat_sectors = None
1112 self.num_dir_sectors = None
1113 self.num_fat_sectors = None
1114 self.num_mini_fat_sectors = None
1115 self.reserved1 = None
1116 self.reserved2 = None
1117 self.root = None
1118 self.sector_shift = None
1119 self.sector_size = None
1120 self.transaction_signature_number = None
1121 self.warn_if_not_closed = False
1122 self._we_opened_fp = False
1123 self._open_stack = None
1124 if filename:
1125 # try opening, ensure fp is closed if that fails
1126 try:
1127 self.open(filename, write_mode=write_mode)
1128 except Exception:
1129 # caller has no chance of calling close() now
1130 self._close(warn=False)
1131 raise
    def __del__(self):
        """Destructor, ensures all file handles are closed that we opened (may warn if enabled)."""
        self._close(warn=True)
        # super(OleFileIO, self).__del__()  # there's no super-class destructor
    def __enter__(self):
        """Context manager entry: enables `with OleFileIO(...) as ole:`."""
        return self
    def __exit__(self, *args):
        """Context manager exit: close the file without emitting a warning."""
        self._close(warn=False)
1147 def _raise_defect(self, defect_level, message, exception_type=OleFileError):
1148 """
1149 This method should be called for any defect found during file parsing.
1150 It may raise an OleFileError exception according to the minimal level chosen
1151 for the OleFileIO object.
1153 :param defect_level: defect level, possible values are:
1155 - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect
1156 - DEFECT_POTENTIAL : a potential defect
1157 - DEFECT_INCORRECT : an error according to specifications, but parsing can go on
1158 - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible
1160 :param message: string describing the defect, used with raised exception.
1161 :param exception_type: exception class to be raised, OleFileError by default
1162 """
1163 # added by [PL]
1164 if defect_level >= self._raise_defects_level:
1165 log.error(message)
1166 raise exception_type(message)
1167 else:
1168 # just record the issue, no exception raised:
1169 self.parsing_issues.append((exception_type, message))
1170 log.warning(message)
1173 def _decode_utf16_str(self, utf16_str, errors='replace'):
1174 """
1175 Decode a string encoded in UTF-16 LE format, as found in the OLE
1176 directory or in property streams. Return a string encoded
1177 according to the path_encoding specified for the OleFileIO object.
1179 :param bytes utf16_str: bytes string encoded in UTF-16 LE format
1180 :param str errors: str, see python documentation for str.decode()
1181 :return: str, encoded according to path_encoding
1182 :rtype: str
1183 """
1184 unicode_str = utf16_str.decode('UTF-16LE', errors)
1185 if self.path_encoding:
1186 # an encoding has been specified for path names:
1187 return unicode_str.encode(self.path_encoding, errors)
1188 else:
1189 # path_encoding=None, return the Unicode string as-is:
1190 return unicode_str
    def open(self, filename, write_mode=False):
        """
        Open an OLE2 file in read-only or read/write mode.
        Read and parse the header, FAT and directory.

        :param filename: string-like or file-like object, OLE file to parse

            - if filename is a string smaller than 1536 bytes, it is the path
              of the file to open. (bytes or unicode string)
            - if filename is a string longer than 1535 bytes, it is parsed
              as the content of an OLE file in memory. (bytes type only)
            - if filename is a file-like object (with read, seek and tell methods),
              it is parsed as-is. The caller is responsible for closing it when done

        :param write_mode: bool, if True the file is opened in read/write mode instead
            of read-only by default. (ignored if filename is not a path)
        """
        self.write_mode = write_mode
        # [PL] check if filename is a string-like or file-like object:
        # (it is better to check for a read() method)
        if hasattr(filename, 'read'):
            # TODO: also check seek and tell methods?
            # file-like object: use it directly
            self.fp = filename
        elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
            # filename is a bytes string containing the OLE file to be parsed:
            # convert it to BytesIO
            self.fp = io.BytesIO(filename)
        else:
            # string-like object: filename of file on disk
            if self.write_mode:
                # open file in mode 'read with update, binary'
                # According to https://docs.python.org/library/functions.html#open
                # 'w' would truncate the file, 'a' may only append on some Unixes
                mode = 'r+b'
            else:
                # read-only mode by default
                mode = 'rb'
            self.fp = open(filename, mode)
            self._we_opened_fp = True
            # the stack is saved so the not-closed warning can show where
            # the file was opened:
            self._open_stack = traceback.extract_stack()    # remember for warning
        # obtain the filesize by using seek and tell, which should work on most
        # file-like objects:
        # TODO: do it above, using getsize with filename when possible?
        # TODO: fix code to fail with clear exception when filesize cannot be obtained
        filesize = 0
        self.fp.seek(0, os.SEEK_END)
        try:
            filesize = self.fp.tell()
        finally:
            self.fp.seek(0)
        self._filesize = filesize
        log.debug('File size: %d bytes (%Xh)' % (self._filesize, self._filesize))

        # lists of streams in FAT and MiniFAT, to detect duplicate references
        # (list of indexes of first sectors of each stream)
        self._used_streams_fat = []
        self._used_streams_minifat = []

        header = self.fp.read(512)

        if len(header) != 512 or header[:8] != MAGIC:
            log.debug('Magic = {!r} instead of {!r}'.format(header[:8], MAGIC))
            self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file", NotOleFileError)

        # [PL] header structure according to AAF specifications:
        ##Header
        ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
        ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
        ## // 0x1a, 0xe1} for current version
        ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/
        ## // GetClassFile uses root directory class id)
        ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
        ## // written by reference implementation
        ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for
        ## // 512-byte sectors, 4 for 4 KB sectors
        ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
        ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two;
        ## // typically 9 indicating 512-byte sectors
        ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
        ## // typically 6 indicating 64-byte mini-sectors
        ##USHORT _usReserved; // [22H,02] reserved, must be zero
        ##ULONG _ulReserved1; // [24H,04] reserved, must be zero
        ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
        ## // number of SECTs in directory chain for 4 KB
        ## // sectors
        ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
        ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
        ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
        ## // be zero. The reference implementation
        ## // does not support transactions
        ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
        ## // typically 4096 bytes
        ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
        ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
        ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
        ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
        ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
        ##};

        # [PL] header decoding:
        # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
        fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
        header_size = struct.calcsize(fmt_header)
        log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
        header1 = header[:header_size]
        (
            self.header_signature,
            self.header_clsid,
            self.minor_version,
            self.dll_version,
            self.byte_order,
            self.sector_shift,
            self.mini_sector_shift,
            self.reserved1,
            self.reserved2,
            self.num_dir_sectors,
            self.num_fat_sectors,
            self.first_dir_sector,
            self.transaction_signature_number,
            self.mini_stream_cutoff_size,
            self.first_mini_fat_sector,
            self.num_mini_fat_sectors,
            self.first_difat_sector,
            self.num_difat_sectors
        ) = struct.unpack(fmt_header, header1)
        log.debug( struct.unpack(fmt_header, header1))

        if self.header_signature != MAGIC:
            # OLE signature should always be present
            self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
        if self.header_clsid != bytearray(16):
            # according to AAF specs, CLSID should always be zero
            self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
        log.debug( "Minor Version = %d" % self.minor_version )
        # TODO: according to MS-CFB, minor version should be 0x003E
        log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version )
        if self.dll_version not in [3, 4]:
            # version 3: usual format, 512 bytes per sector
            # version 4: large format, 4K per sector
            self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
        log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order )
        if self.byte_order != 0xFFFE:
            # For now only common little-endian documents are handled correctly
            self._raise_defect(DEFECT_INCORRECT, "incorrect ByteOrder in OLE header")
            # TODO: add big-endian support for documents created on Mac ?
            # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE.
        self.sector_size = 2**self.sector_shift
        log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size )
        if self.sector_size not in [512, 4096]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header")
        if (self.dll_version==3 and self.sector_size!=512) \
        or (self.dll_version==4 and self.sector_size!=4096):
            self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header")
        self.mini_sector_size = 2**self.mini_sector_shift
        log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size )
        if self.mini_sector_size not in [64]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header")
        if self.reserved1 != 0 or self.reserved2 != 0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
        log.debug( "Number of Directory sectors = %d" % self.num_dir_sectors )
        # Number of directory sectors (only allowed if DllVersion != 3)
        if self.sector_size==512 and self.num_dir_sectors!=0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header")
        log.debug( "Number of FAT sectors = %d" % self.num_fat_sectors )
        # num_fat_sectors = number of FAT sectors in the file
        log.debug( "First Directory sector = %Xh" % self.first_dir_sector )
        # first_dir_sector = 1st sector containing the directory
        log.debug( "Transaction Signature Number = %d" % self.transaction_signature_number )
        # Signature should be zero, BUT some implementations do not follow this
        # rule => only a potential defect:
        # (according to MS-CFB, may be != 0 for applications supporting file
        # transactions)
        if self.transaction_signature_number != 0:
            self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)")
        log.debug( "Mini Stream cutoff size = %Xh (expected: 1000h)" % self.mini_stream_cutoff_size )
        # MS-CFB: This integer field MUST be set to 0x00001000. This field
        # specifies the maximum size of a user-defined data stream allocated
        # from the mini FAT and mini stream, and that cutoff is 4096 bytes.
        # Any user-defined data stream larger than or equal to this cutoff size
        # must be allocated as normal sectors from the FAT.
        if self.mini_stream_cutoff_size != 0x1000:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header")
            # if no exception is raised, the cutoff size is fixed to 0x1000
            log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' %
                        self.mini_stream_cutoff_size)
            self.mini_stream_cutoff_size = 0x1000
        # TODO: check if these values are OK
        log.debug( "First MiniFAT sector = %Xh" % self.first_mini_fat_sector )
        log.debug( "Number of MiniFAT sectors = %d" % self.num_mini_fat_sectors )
        log.debug( "First DIFAT sector = %Xh" % self.first_difat_sector )
        log.debug( "Number of DIFAT sectors = %d" % self.num_difat_sectors )

        # calculate the number of sectors in the file
        # (-1 because header doesn't count)
        self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1
        log.debug( "Maximum number of sectors in the file: %d (%Xh)" % (self.nb_sect, self.nb_sect))
        # TODO: change this test, because an OLE file MAY contain other data
        # after the last sector.

        # file clsid
        self.header_clsid = _clsid(header[8:24])

        # TODO: remove redundant attributes, and fix the code which uses them?
        self.sectorsize = self.sector_size      #1 << i16(header, 30)
        self.minisectorsize = self.mini_sector_size     #1 << i16(header, 32)
        self.minisectorcutoff = self.mini_stream_cutoff_size    # i32(header, 56)

        # check known streams for duplicate references (these are always in FAT,
        # never in MiniFAT):
        self._check_duplicate_stream(self.first_dir_sector)
        # check MiniFAT only if it is not empty:
        if self.num_mini_fat_sectors:
            self._check_duplicate_stream(self.first_mini_fat_sector)
        # check DIFAT only if it is not empty:
        if self.num_difat_sectors:
            self._check_duplicate_stream(self.first_difat_sector)

        # Load file allocation tables
        self.loadfat(header)
        # Load directory. This sets both the direntries list (ordered by sid)
        # and the root (ordered by hierarchy) members.
        self.loaddirectory(self.first_dir_sector)
        self.minifatsect = self.first_mini_fat_sector
    def close(self):
        """
        close the OLE file, release the file object if we created it ourselves.

        Leaves the file handle open if it was provided by the caller.
        """
        self._close(warn=False)
1426 def _close(self, warn=False):
1427 """Implementation of close() with internal arg `warn`."""
1428 if self._we_opened_fp:
1429 if warn and self.warn_if_not_closed:
1430 # we only raise a warning if the file was not explicitly closed,
1431 # and if the option warn_if_not_closed is enabled
1432 warnings.warn(OleFileIONotClosed(self._open_stack))
1433 self.fp.close()
1434 self._we_opened_fp = False
1436 def _check_duplicate_stream(self, first_sect, minifat=False):
1437 """
1438 Checks if a stream has not been already referenced elsewhere.
1439 This method should only be called once for each known stream, and only
1440 if stream size is not null.
1442 :param first_sect: int, index of first sector of the stream in FAT
1443 :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT
1444 """
1445 if minifat:
1446 log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect)
1447 used_streams = self._used_streams_minifat
1448 else:
1449 log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect)
1450 # some values can be safely ignored (not a real stream):
1451 if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT):
1452 return
1453 used_streams = self._used_streams_fat
1454 # TODO: would it be more efficient using a dict or hash values, instead
1455 # of a list of long ?
1456 if first_sect in used_streams:
1457 self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice')
1458 else:
1459 used_streams.append(first_sect)
1461 def dumpfat(self, fat, firstindex=0):
1462 """
1463 Display a part of FAT in human-readable form for debugging purposes
1464 """
1465 # dictionary to convert special FAT values in human-readable strings
1466 VPL = 8 # values per line (8+1 * 8+1 = 81)
1467 fatnames = {
1468 FREESECT: "..free..",
1469 ENDOFCHAIN: "[ END. ]",
1470 FATSECT: "FATSECT ",
1471 DIFSECT: "DIFSECT "
1472 }
1473 nbsect = len(fat)
1474 nlines = (nbsect+VPL-1)//VPL
1475 print("index", end=" ")
1476 for i in range(VPL):
1477 print("%8X" % i, end=" ")
1478 print()
1479 for l in range(nlines):
1480 index = l*VPL
1481 print("%6X:" % (firstindex+index), end=" ")
1482 for i in range(index, index+VPL):
1483 if i>=nbsect:
1484 break
1485 sect = fat[i]
1486 aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND
1487 if aux in fatnames:
1488 name = fatnames[aux]
1489 else:
1490 if sect == i+1:
1491 name = " --->"
1492 else:
1493 name = "%8X" % sect
1494 print(name, end=" ")
1495 print()
1497 def dumpsect(self, sector, firstindex=0):
1498 """
1499 Display a sector in a human-readable form, for debugging purposes
1500 """
1501 VPL=8 # number of values per line (8+1 * 8+1 = 81)
1502 tab = array.array(UINT32, sector)
1503 if sys.byteorder == 'big':
1504 tab.byteswap()
1505 nbsect = len(tab)
1506 nlines = (nbsect+VPL-1)//VPL
1507 print("index", end=" ")
1508 for i in range(VPL):
1509 print("%8X" % i, end=" ")
1510 print()
1511 for l in range(nlines):
1512 index = l*VPL
1513 print("%6X:" % (firstindex+index), end=" ")
1514 for i in range(index, index+VPL):
1515 if i>=nbsect:
1516 break
1517 sect = tab[i]
1518 name = "%8X" % sect
1519 print(name, end=" ")
1520 print()
1522 def sect2array(self, sect):
1523 """
1524 convert a sector to an array of 32 bits unsigned integers,
1525 swapping bytes on big endian CPUs such as PowerPC (old Macs)
1526 """
1527 # TODO: make this a static function
1528 a = array.array(UINT32, sect)
1529 # if CPU is big endian, swap bytes:
1530 if sys.byteorder == 'big':
1531 a.byteswap()
1532 return a
    def loadfat_sect(self, sect):
        """
        Adds the indexes of the given sector to the FAT.

        :param sect: string containing the first FAT sector, or array of long integers
        :returns: index of last FAT sector, or None if the input was empty.
        """
        # a FAT sector is an array of ulong integers.
        if isinstance(sect, array.array):
            # if sect is already an array it is directly used
            fat1 = sect
        else:
            # if it's a raw sector, it is parsed in an array
            fat1 = self.sect2array(sect)
            # Display the sector contents only if the logging level is debug:
            if log.isEnabledFor(logging.DEBUG):
                self.dumpsect(sect)
        # The FAT is a sector chain starting at the first index of itself.
        # initialize isect, just in case the loop below runs zero times:
        isect = None
        for isect in fat1:
            isect = isect & 0xFFFFFFFF  # JYTHON-WORKAROUND (mask to unsigned 32 bits)
            log.debug("isect = %X" % isect)
            if isect == ENDOFCHAIN or isect == FREESECT:
                # the end of the sector chain has been reached
                log.debug("found end of sector chain")
                break
            # read the FAT sector
            s = self.getsect(isect)
            # parse it as an array of 32 bits integers, and add it to the
            # global FAT array (concatenation creates a new array each time)
            nextfat = self.sect2array(s)
            self.fat = self.fat + nextfat
        return isect
    def loadfat(self, header):
        """
        Load the FAT table.

        :param header: bytes, the first (header) sector of the file; the FAT
            sector indexes start at offset 76 within it.
        """
        # The 1st sector of the file contains sector numbers for the first 109
        # FAT sectors, right after the header which is 76 bytes long.
        # (always 109, whatever the sector size: 512 bytes = 76+4*109)
        # Additional sectors are described by DIF blocks

        log.debug('Loading the FAT table, starting with the 1st sector after the header')
        sect = header[76:512]
        log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )
        # fat = []
        # FAT is an array of 32 bits unsigned ints, it's more effective
        # to use an array than a list in Python.
        # It's initialized as empty first:
        self.fat = array.array(UINT32)
        self.loadfat_sect(sect)
        # self.dumpfat(self.fat)
        # for i in range(0, len(sect), 4):
        #     ix = i32(sect, i)
        #     # [PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
        #     if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
        #         break
        #     s = self.getsect(ix)
        #     # fat = fat + [i32(s, i) for i in range(0, len(s), 4)]
        #     fat = fat + array.array(UINT32, s)
        if self.num_difat_sectors != 0:
            log.debug('DIFAT is used, because file size > 6.8MB.')
            # [PL] There's a DIFAT because file is larger than 6.8MB
            # some checks just in case:
            if self.num_fat_sectors <= 109:
                # there must be at least 109 blocks in header and the rest in
                # DIFAT, so number of sectors must be >109.
                self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
            if self.first_difat_sector >= self.nb_sect:
                # initial DIFAT block index must be valid
                self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
            log.debug( "DIFAT analysis..." )
            # We compute the necessary number of DIFAT sectors :
            # Number of pointers per DIFAT sector = (sectorsize/4)-1
            # (-1 because the last pointer is the next DIFAT sector number)
            nb_difat_sectors = (self.sectorsize//4)-1
            # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
            nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors
            log.debug( "nb_difat = %d" % nb_difat )
            if self.num_difat_sectors != nb_difat:
                raise IOError('incorrect DIFAT')
            # walk the DIFAT sector chain; each sector lists FAT sector indexes
            # and ends with a pointer to the next DIFAT sector:
            isect_difat = self.first_difat_sector
            for i in iterrange(nb_difat):
                log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
                # TODO: check if corresponding FAT SID = DIFSECT
                sector_difat = self.getsect(isect_difat)
                difat = self.sect2array(sector_difat)
                # Display the sector contents only if the logging level is debug:
                if log.isEnabledFor(logging.DEBUG):
                    self.dumpsect(sector_difat)
                self.loadfat_sect(difat[:nb_difat_sectors])
                # last DIFAT pointer is next DIFAT sector:
                isect_difat = difat[nb_difat_sectors]
                log.debug( "next DIFAT sector: %X" % isect_difat )
            # checks:
            if isect_difat not in [ENDOFCHAIN, FREESECT]:
                # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
                raise IOError('incorrect end of DIFAT')
            # if len(self.fat) != self.num_fat_sectors:
            #     # FAT should contain num_fat_sectors blocks
            #     print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors))
            #     raise IOError('incorrect DIFAT')
        else:
            log.debug('No DIFAT, because file size < 6.8MB.')
        # since FAT is read from fixed-size sectors, it may contain more values
        # than the actual number of sectors in the file.
        # Keep only the relevant sector indexes:
        if len(self.fat) > self.nb_sect:
            log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
            self.fat = self.fat[:self.nb_sect]
        log.debug('FAT references %d sectors / Maximum %d sectors in file' % (len(self.fat), self.nb_sect))
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nFAT:')
            self.dumpfat(self.fat)
    def loadminifat(self):
        """
        Load the MiniFAT table.
        """
        # MiniFAT is stored in a standard sub-stream, pointed to by a header
        # field.
        # NOTE: there are two sizes to take into account for this stream:
        # 1) Stream size is calculated according to the number of sectors
        #    declared in the OLE header. This allocated stream may be more than
        #    needed to store the actual sector indexes.
        # (self.num_mini_fat_sectors is the number of sectors of size self.sector_size)
        stream_size = self.num_mini_fat_sectors * self.sector_size
        # 2) Actually used size is calculated by dividing the MiniStream size
        #    (given by root entry size) by the size of mini sectors, *4 for
        #    32 bits indexes:
        nb_minisectors = (self.root.size + self.mini_sector_size-1) // self.mini_sector_size
        used_size = nb_minisectors * 4
        log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
            (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors))
        if used_size > stream_size:
            # This is not really a problem, but may indicate a wrong implementation:
            self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
        # In any case, first read stream_size:
        s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
        # [PL] Old code replaced by an array:
        #self.minifat = [i32(s, i) for i in range(0, len(s), 4)]
        self.minifat = self.sect2array(s)
        # Then shrink the array to used size, to avoid indexes out of MiniStream:
        log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
        self.minifat = self.minifat[:nb_minisectors]
        log.debug('loadminifat(): len=%d' % len(self.minifat))
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nMiniFAT:')
            self.dumpfat(self.minifat)
1688 def getsect(self, sect):
1689 """
1690 Read given sector from file on disk.
1692 :param sect: int, sector index
1693 :returns: a string containing the sector data.
1694 """
1695 # From [MS-CFB]: A sector number can be converted into a byte offset
1696 # into the file by using the following formula:
1697 # (sector number + 1) x Sector Size.
1698 # This implies that sector #0 of the file begins at byte offset Sector
1699 # Size, not at 0.
1701 # [PL] the original code in PIL was wrong when sectors are 4KB instead of
1702 # 512 bytes:
1703 #self.fp.seek(512 + self.sectorsize * sect)
1704 # [PL]: added safety checks:
1705 #print("getsect(%X)" % sect)
1706 try:
1707 self.fp.seek(self.sectorsize * (sect+1))
1708 except Exception:
1709 log.debug('getsect(): sect=%X, seek=%d, filesize=%d' %
1710 (sect, self.sectorsize*(sect+1), self._filesize))
1711 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1712 sector = self.fp.read(self.sectorsize)
1713 if len(sector) != self.sectorsize:
1714 log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
1715 (sect, len(sector), self.sectorsize))
1716 self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
1717 return sector
1719 def write_sect(self, sect, data, padding=b'\x00'):
1720 """
1721 Write given sector to file on disk.
1723 :param sect: int, sector index
1724 :param data: bytes, sector data
1725 :param padding: single byte, padding character if data < sector size
1726 """
1727 if not isinstance(data, bytes):
1728 raise TypeError("write_sect: data must be a bytes string")
1729 if not isinstance(padding, bytes) or len(padding)!=1:
1730 raise TypeError("write_sect: padding must be a bytes string of 1 char")
1731 # TODO: we could allow padding=None for no padding at all
1732 try:
1733 self.fp.seek(self.sectorsize * (sect+1))
1734 except Exception:
1735 log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' %
1736 (sect, self.sectorsize*(sect+1), self._filesize))
1737 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1738 if len(data) < self.sectorsize:
1739 # add padding
1740 data += padding * (self.sectorsize - len(data))
1741 elif len(data) > self.sectorsize:
1742 raise ValueError("Data is larger than sector size")
1743 self.fp.write(data)
1745 def _write_mini_sect(self, fp_pos, data, padding = b'\x00'):
1746 """
1747 Write given sector to file on disk.
1749 :param fp_pos: int, file position
1750 :param data: bytes, sector data
1751 :param padding: single byte, padding character if data < sector size
1752 """
1753 if not isinstance(data, bytes):
1754 raise TypeError("write_mini_sect: data must be a bytes string")
1755 if not isinstance(padding, bytes) or len(padding) != 1:
1756 raise TypeError("write_mini_sect: padding must be a bytes string of 1 char")
1758 try:
1759 self.fp.seek(fp_pos)
1760 except Exception:
1761 log.debug('write_mini_sect(): fp_pos=%d, filesize=%d' %
1762 (fp_pos, self._filesize))
1763 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1764 len_data = len(data)
1765 if len_data < self.mini_sector_size:
1766 data += padding * (self.mini_sector_size - len_data)
1767 if self.mini_sector_size < len_data:
1768 raise ValueError("Data is larger than sector size")
1769 self.fp.write(data)
1771 def loaddirectory(self, sect):
1772 """
1773 Load the directory.
1775 :param sect: sector index of directory stream.
1776 """
1777 log.debug('Loading the Directory:')
1778 # The directory is stored in a standard
1779 # substream, independent of its size.
1781 # open directory stream as a read-only file:
1782 # (stream size is not known in advance)
1783 self.directory_fp = self._open(sect, force_FAT=True)
1785 # [PL] to detect malformed documents and avoid DoS attacks, the maximum
1786 # number of directory entries can be calculated:
1787 max_entries = self.directory_fp.size // 128
1788 log.debug('loaddirectory: size=%d, max_entries=%d' %
1789 (self.directory_fp.size, max_entries))
1791 # Create list of directory entries
1792 # self.direntries = []
1793 # We start with a list of "None" object
1794 self.direntries = [None] * max_entries
1795 # for sid in iterrange(max_entries):
1796 # entry = fp.read(128)
1797 # if not entry:
1798 # break
1799 # self.direntries.append(OleDirectoryEntry(entry, sid, self))
1800 # load root entry:
1801 root_entry = self._load_direntry(0)
1802 # Root entry is the first entry:
1803 self.root = self.direntries[0]
1804 # TODO: read ALL directory entries (ignore bad entries?)
1805 # TODO: adapt build_storage_tree to avoid duplicate reads
1806 # for i in range(1, max_entries):
1807 # self._load_direntry(i)
1808 # read and build all storage trees, starting from the root:
1809 self.root.build_storage_tree()
    def _load_direntry (self, sid):
        """
        Load a directory entry from the directory.
        This method should only be called once for each storage/stream when
        loading the directory.

        :param sid: index of storage/stream in the directory.
        :returns: a OleDirectoryEntry object

        :exception OleFileError: if the entry has already been referenced.
        """
        # check if SID is OK:
        if sid<0 or sid>=len(self.direntries):
            self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
        # check if entry was already referenced:
        if self.direntries[sid] is not None:
            self._raise_defect(DEFECT_INCORRECT,
                "double reference for OLE stream/storage")
            # if exception not raised (non-strict parsing mode), return the
            # already-loaded object instead of parsing it a second time:
            return self.direntries[sid]
        # each directory entry is exactly 128 bytes:
        self.directory_fp.seek(sid * 128)
        entry = self.directory_fp.read(128)
        self.direntries[sid] = OleDirectoryEntry(entry, sid, self)
        return self.direntries[sid]
1836 def dumpdirectory(self):
1837 """
1838 Dump directory (for debugging only)
1839 """
1840 self.root.dump()
    def _open(self, start, size = UNKNOWN_SIZE, force_FAT=False):
        """
        Open a stream, either in FAT or MiniFAT according to its size.
        (openstream helper)

        :param start: index of first sector
        :param size: size of stream (or nothing if size is unknown)
        :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT
            according to size. If True, it will always be opened in FAT.
        :returns: an OleStream object (read-only file-like)
        """
        log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' %
            (start, size, str(force_FAT)))
        # stream size is compared to the mini_stream_cutoff_size threshold:
        if size < self.minisectorcutoff and not force_FAT:
            # ministream object
            if not self.ministream:
                # load MiniFAT if it wasn't already done:
                self.loadminifat()
                # The first sector index of the miniFAT stream is stored in the
                # root directory entry:
                size_ministream = self.root.size
                log.debug('Opening MiniStream: sect=%Xh, size=%d' %
                    (self.root.isectStart, size_ministream))
                # the MiniStream itself is a regular FAT stream, opened once
                # and cached on self.ministream for subsequent calls:
                self.ministream = self._open(self.root.isectStart,
                    size_ministream, force_FAT=True)
            return OleStream(fp=self.ministream, sect=start, size=size,
                             offset=0, sectorsize=self.minisectorsize,
                             fat=self.minifat, filesize=self.ministream.size,
                             olefileio=self)
        else:
            # standard stream
            return OleStream(fp=self.fp, sect=start, size=size,
                             offset=self.sectorsize,
                             sectorsize=self.sectorsize, fat=self.fat,
                             filesize=self._filesize,
                             olefileio=self)
1879 def _list(self, files, prefix, node, streams=True, storages=False):
1880 """
1881 listdir helper
1883 :param files: list of files to fill in
1884 :param prefix: current location in storage tree (list of names)
1885 :param node: current node (OleDirectoryEntry object)
1886 :param streams: bool, include streams if True (True by default) - new in v0.26
1887 :param storages: bool, include storages if True (False by default) - new in v0.26
1888 (note: the root storage is never included)
1889 """
1890 prefix = prefix + [node.name]
1891 for entry in node.kids:
1892 if entry.entry_type == STGTY_STORAGE:
1893 # this is a storage
1894 if storages:
1895 # add it to the list
1896 files.append(prefix[1:] + [entry.name])
1897 # check its kids
1898 self._list(files, prefix, entry, streams, storages)
1899 elif entry.entry_type == STGTY_STREAM:
1900 # this is a stream
1901 if streams:
1902 # add it to the list
1903 files.append(prefix[1:] + [entry.name])
1904 else:
1905 self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an entry which is not a stream nor a storage.')
1907 def listdir(self, streams=True, storages=False):
1908 """
1909 Return a list of streams and/or storages stored in this file
1911 :param streams: bool, include streams if True (True by default) - new in v0.26
1912 :param storages: bool, include storages if True (False by default) - new in v0.26
1913 (note: the root storage is never included)
1914 :returns: list of stream and/or storage paths
1915 """
1916 files = []
1917 self._list(files, [], self.root, streams, storages)
1918 return files
1920 def _find(self, filename):
1921 """
1922 Returns directory entry of given filename. (openstream helper)
1923 Note: this method is case-insensitive.
1925 :param filename: path of stream in storage tree (except root entry), either:
1927 - a string using Unix path syntax, for example:
1928 'storage_1/storage_1.2/stream'
1929 - or a list of storage filenames, path to the desired stream/storage.
1930 Example: ['storage_1', 'storage_1.2', 'stream']
1932 :returns: sid of requested filename
1933 :exception IOError: if file not found
1934 """
1936 # if filename is a string instead of a list, split it on slashes to
1937 # convert to a list:
1938 if isinstance(filename, basestring):
1939 filename = filename.split('/')
1940 # walk across storage tree, following given path:
1941 node = self.root
1942 for name in filename:
1943 for kid in node.kids:
1944 if kid.name.lower() == name.lower():
1945 break
1946 else:
1947 raise IOError("file not found")
1948 node = kid
1949 return node.sid
1951 def openstream(self, filename):
1952 """
1953 Open a stream as a read-only file object (BytesIO).
1954 Note: filename is case-insensitive.
1956 :param filename: path of stream in storage tree (except root entry), either:
1958 - a string using Unix path syntax, for example:
1959 'storage_1/storage_1.2/stream'
1960 - or a list of storage filenames, path to the desired stream/storage.
1961 Example: ['storage_1', 'storage_1.2', 'stream']
1963 :returns: file object (read-only)
1964 :exception IOError: if filename not found, or if this is not a stream.
1965 """
1966 sid = self._find(filename)
1967 entry = self.direntries[sid]
1968 if entry.entry_type != STGTY_STREAM:
1969 raise IOError("this file is not a stream")
1970 return self._open(entry.isectStart, entry.size)
    def _write_mini_stream(self, entry, data_to_write):
        """
        Overwrite an existing MiniStream stream with new data of the same
        size. (write_stream helper)

        :param entry: directory entry of the stream to overwrite
            (presumably an OleDirectoryEntry; it must expose sect_chain and
            build_sect_chain)
        :param data_to_write: bytes, new stream content
        """
        # make sure the mini sector chains are built, for the stream itself
        # and for the root entry (which holds the MiniStream container):
        if not entry.sect_chain:
            entry.build_sect_chain(self)
        nb_sectors = len(entry.sect_chain)

        if not self.root.sect_chain:
            self.root.build_sect_chain(self)
        # number of mini sectors stored in one regular sector:
        block_size = self.sector_size // self.mini_sector_size
        for idx, sect in enumerate(entry.sect_chain):
            # map the mini sector index to (container sector, offset inside it):
            sect_base = sect // block_size
            sect_offset = sect % block_size
            # +1 because sector #0 starts after the header sector:
            fp_pos = (self.root.sect_chain[sect_base] + 1)*self.sector_size + sect_offset*self.mini_sector_size
            if idx < (nb_sectors - 1):
                data_per_sector = data_to_write[idx * self.mini_sector_size: (idx + 1) * self.mini_sector_size]
            else:
                # last mini sector may be shorter than mini_sector_size:
                data_per_sector = data_to_write[idx * self.mini_sector_size:]
            self._write_mini_sect(fp_pos, data_per_sector)
    def write_stream(self, stream_name, data):
        """
        Write a stream to disk. For now, it is only possible to replace an
        existing stream by data of the same size.

        :param stream_name: path of stream in storage tree (except root entry), either:

            - a string using Unix path syntax, for example:
              'storage_1/storage_1.2/stream'
            - or a list of storage filenames, path to the desired stream/storage.
              Example: ['storage_1', 'storage_1.2', 'stream']

        :param data: bytes, data to be written, must be the same size as the original
            stream.
        """
        if not isinstance(data, bytes):
            raise TypeError("write_stream: data must be a bytes string")
        sid = self._find(stream_name)
        entry = self.direntries[sid]
        if entry.entry_type != STGTY_STREAM:
            raise IOError("this is not a stream")
        size = entry.size
        # only same-size replacement is supported, so the FAT chain is reused:
        if size != len(data):
            raise ValueError("write_stream: data must be the same size as the existing stream")
        # small streams live in the MiniStream and are written separately:
        if size < self.minisectorcutoff and entry.entry_type != STGTY_ROOT:
            return self._write_mini_stream(entry = entry, data_to_write = data)

        sect = entry.isectStart
        # number of sectors to write
        nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        for i in range(nb_sectors):
            # try:
            #     self.fp.seek(offset + self.sectorsize * sect)
            # except Exception:
            #     log.debug('sect=%d, seek=%d' %
            #         (sect, offset+self.sectorsize*sect))
            #     raise IOError('OLE sector index out of range')
            # extract one sector from data, the last one being smaller:
            if i<(nb_sectors-1):
                data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize]
                # TODO: comment this if it works
                assert(len(data_sector)==self.sectorsize)
            else:
                data_sector = data [i*self.sectorsize:]
                # TODO: comment this if it works
                log.debug('write_stream: size=%d sectorsize=%d data_sector=%Xh size%%sectorsize=%d'
                    % (size, self.sectorsize, len(data_sector), size % self.sectorsize))
                assert(len(data_sector) % self.sectorsize==size % self.sectorsize)
            self.write_sect(sect, data_sector)
            # self.fp.write(data_sector)
            # jump to next sector in the FAT:
            try:
                sect = self.fat[sect]
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError('incorrect OLE FAT, sector index out of range')
        # [PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError('incorrect last sector index in OLE stream')
2051 def get_type(self, filename):
2052 """
2053 Test if given filename exists as a stream or a storage in the OLE
2054 container, and return its type.
2056 :param filename: path of stream in storage tree. (see openstream for syntax)
2057 :returns: False if object does not exist, its entry type (>0) otherwise:
2059 - STGTY_STREAM: a stream
2060 - STGTY_STORAGE: a storage
2061 - STGTY_ROOT: the root entry
2062 """
2063 try:
2064 sid = self._find(filename)
2065 entry = self.direntries[sid]
2066 return entry.entry_type
2067 except Exception:
2068 return False
2070 def getclsid(self, filename):
2071 """
2072 Return clsid of a stream/storage.
2074 :param filename: path of stream/storage in storage tree. (see openstream for
2075 syntax)
2076 :returns: Empty string if clsid is null, a printable representation of the clsid otherwise
2078 new in version 0.44
2079 """
2080 sid = self._find(filename)
2081 entry = self.direntries[sid]
2082 return entry.clsid
2084 def getmtime(self, filename):
2085 """
2086 Return modification time of a stream/storage.
2088 :param filename: path of stream/storage in storage tree. (see openstream for
2089 syntax)
2090 :returns: None if modification time is null, a python datetime object
2091 otherwise (UTC timezone)
2093 new in version 0.26
2094 """
2095 sid = self._find(filename)
2096 entry = self.direntries[sid]
2097 return entry.getmtime()
2099 def getctime(self, filename):
2100 """
2101 Return creation time of a stream/storage.
2103 :param filename: path of stream/storage in storage tree. (see openstream for
2104 syntax)
2105 :returns: None if creation time is null, a python datetime object
2106 otherwise (UTC timezone)
2108 new in version 0.26
2109 """
2110 sid = self._find(filename)
2111 entry = self.direntries[sid]
2112 return entry.getctime()
2114 def exists(self, filename):
2115 """
2116 Test if given filename exists as a stream or a storage in the OLE
2117 container.
2118 Note: filename is case-insensitive.
2120 :param filename: path of stream in storage tree. (see openstream for syntax)
2121 :returns: True if object exist, else False.
2122 """
2123 try:
2124 sid = self._find(filename)
2125 return True
2126 except Exception:
2127 return False
2129 def get_size(self, filename):
2130 """
2131 Return size of a stream in the OLE container, in bytes.
2133 :param filename: path of stream in storage tree (see openstream for syntax)
2134 :returns: size in bytes (long integer)
2135 :exception IOError: if file not found
2136 :exception TypeError: if this is not a stream.
2137 """
2138 sid = self._find(filename)
2139 entry = self.direntries[sid]
2140 if entry.entry_type != STGTY_STREAM:
2141 # TODO: Should it return zero instead of raising an exception ?
2142 raise TypeError('object is not an OLE stream')
2143 return entry.size
2145 def get_rootentry_name(self):
2146 """
2147 Return root entry name. Should usually be 'Root Entry' or 'R' in most
2148 implementations.
2149 """
2150 return self.root.name
2152 def getproperties(self, filename, convert_time=False, no_conversion=None):
2153 """
2154 Return properties described in substream.
2156 :param filename: path of stream in storage tree (see openstream for syntax)
2157 :param convert_time: bool, if True timestamps will be converted to Python datetime
2158 :param no_conversion: None or list of int, timestamps not to be converted
2159 (for example total editing time is not a real timestamp)
2161 :returns: a dictionary of values indexed by id (integer)
2162 """
2163 #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
2164 # make sure no_conversion is a list, just to simplify code below:
2165 if no_conversion == None:
2166 no_conversion = []
2167 # stream path as a string to report exceptions:
2168 streampath = filename
2169 if not isinstance(streampath, str):
2170 streampath = '/'.join(streampath)
2171 fp = self.openstream(filename)
2172 data = {}
2173 try:
2174 # header
2175 s = fp.read(28)
2176 clsid = _clsid(s[8:24])
2177 # format id
2178 s = fp.read(20)
2179 fmtid = _clsid(s[:16])
2180 fp.seek(i32(s, 16))
2181 # get section
2182 s = b"****" + fp.read(i32(fp.read(4))-4)
2183 # number of properties:
2184 num_props = i32(s, 4)
2185 except BaseException as exc:
2186 # catch exception while parsing property header, and only raise
2187 # a DEFECT_INCORRECT then return an empty dict, because this is not
2188 # a fatal error when parsing the whole file
2189 msg = 'Error while parsing properties header in stream {}: {}'.format(
2190 repr(streampath), exc)
2191 self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
2192 return data
2193 # clamp num_props based on the data length
2194 num_props = min(num_props, int(len(s) / 8))
2195 for i in iterrange(num_props):
2196 property_id = 0 # just in case of an exception
2197 try:
2198 property_id = i32(s, 8+i*8)
2199 offset = i32(s, 12+i*8)
2200 property_type = i32(s, offset)
2202 vt_name = VT.get(property_type, 'UNKNOWN')
2203 log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))
2205 value = self._parse_property(s, offset+4, property_id, property_type, convert_time, no_conversion)
2206 data[property_id] = value
2207 except BaseException as exc:
2208 # catch exception while parsing each property, and only raise
2209 # a DEFECT_INCORRECT, because parsing can go on
2210 msg = 'Error while parsing property id %d in stream %s: %s' % (
2211 property_id, repr(streampath), exc)
2212 self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
2214 return data
    def _parse_property(self, s, offset, property_id, property_type, convert_time, no_conversion):
        """
        Parse one property value from a property-set section, dispatching on
        its variant type; vector types are parsed element by element.
        (getproperties helper)

        :param s: bytes, the whole section buffer (starting with b"****")
        :param offset: int, offset of the property value within s
        :param property_id: int, property identifier (for logging)
        :param property_type: int, variant type (VT_*) of the property
        :param convert_time: bool, convert FILETIME values to datetime
        :param no_conversion: list of int, property ids not to be converted
        :returns: the parsed value, or None if the type is not implemented
        """
        v = None
        if property_type <= VT_BLOB or property_type in (VT_CLSID, VT_CF):
            # simple scalar type:
            v, _ = self._parse_property_basic(s, offset, property_id, property_type, convert_time, no_conversion)
        elif property_type == VT_VECTOR | VT_VARIANT:
            # vector of variants: each element carries its own type tag
            log.debug('property_type == VT_VECTOR | VT_VARIANT')
            off = 4
            count = i32(s, offset)
            values = []
            for _ in range(count):
                property_type = i32(s, offset + off)
                v, sz = self._parse_property_basic(s, offset + off + 4, property_id, property_type, convert_time, no_conversion)
                values.append(v)
                off += sz + 4
            v = values

        elif property_type & VT_VECTOR:
            # vector of a single scalar type:
            property_type_base = property_type & ~VT_VECTOR
            log.debug('property_type == VT_VECTOR | %s' % VT.get(property_type_base, 'UNKNOWN'))
            off = 4
            count = i32(s, offset)
            values = []
            for _ in range(count):
                v, sz = self._parse_property_basic(s, offset + off, property_id, property_type & ~VT_VECTOR, convert_time, no_conversion)
                values.append(v)
                off += sz
            v = values
        else:
            log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
        return v
    def _parse_property_basic(self, s, offset, property_id, property_type, convert_time, no_conversion):
        """
        Parse one scalar property value. (_parse_property helper)

        :param s: bytes, the whole section buffer
        :param offset: int, offset of the raw value within s
        :param property_id: int, property identifier (for logging/conversion)
        :param property_type: int, scalar variant type (VT_*)
        :param convert_time: bool, convert FILETIME values to datetime
        :param no_conversion: list of int, property ids not to be converted
        :returns: tuple (value, size) where size is the number of bytes
            consumed by the value (0 if the type is not implemented)
        """
        value = None
        size = 0
        # test for common types first (should perhaps use
        # a dictionary instead?)

        if property_type == VT_I2: # 16-bit signed integer
            value = i16(s, offset)
            if value >= 32768:
                # convert unsigned reading to signed 16-bit value:
                value = value - 65536
            size = 2
        elif property_type == VT_UI2: # 2-byte unsigned integer
            value = i16(s, offset)
            size = 2
        elif property_type in (VT_I4, VT_INT, VT_ERROR):
            # VT_I4: 32-bit signed integer
            # VT_ERROR: HRESULT, similar to 32-bit signed integer,
            # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
            value = i32(s, offset)
            size = 4
        elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
            value = i32(s, offset) # FIXME
            size = 4
        elif property_type in (VT_BSTR, VT_LPSTR):
            # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
            # size is a 32 bits integer, including the null terminator, and
            # possibly trailing or embedded null chars
            #TODO: if codepage is unicode, the string should be converted as such
            count = i32(s, offset)
            value = s[offset+4:offset+4+count-1]
            # remove all null chars:
            value = value.replace(b'\x00', b'')
            size = 4 + count
        elif property_type == VT_BLOB:
            # binary large object (BLOB)
            # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
            count = i32(s, offset)
            value = s[offset+4:offset+4+count]
            size = 4 + count
        elif property_type == VT_LPWSTR:
            # UnicodeString
            # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
            # "the string should NOT contain embedded or additional trailing
            # null characters."
            count = i32(s, offset+4)
            value = self._decode_utf16_str(s[offset+4:offset+4+count*2])
            size = 4 + count * 2
        elif property_type == VT_FILETIME:
            value = long(i32(s, offset)) + (long(i32(s, offset+4))<<32)
            # FILETIME is a 64-bit int: "number of 100ns periods
            # since Jan 1,1601".
            if convert_time and property_id not in no_conversion:
                log.debug('Converting property #%d to python datetime, value=%d=%fs'
                        %(property_id, value, float(value)/10000000))
                # convert FILETIME to Python datetime.datetime
                # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
                _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
                log.debug('timedelta days=%d' % (value//(10*1000000*3600*24)))
                value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10)
            else:
                # legacy code kept for backward compatibility: returns a
                # number of seconds since Jan 1,1601
                value = value // 10000000 # seconds
            size = 8
        elif property_type == VT_UI1: # 1-byte unsigned integer
            value = i8(s[offset])
            size = 1
        elif property_type == VT_CLSID:
            value = _clsid(s[offset:offset+16])
            size = 16
        elif property_type == VT_CF:
            # PropertyIdentifier or ClipboardData??
            # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
            count = i32(s, offset)
            value = s[offset+4:offset+4+count]
            size = 4 + count
        elif property_type == VT_BOOL:
            # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
            # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
            value = bool(i16(s, offset))
            size = 2
        else:
            value = None # everything else yields "None"
            log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))

        # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
        # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
        # see https://msdn.microsoft.com/en-us/library/dd942033.aspx

        #print("%08x" % property_id, repr(value), end=" ")
        #print("(%s)" % VT[i32(s, offset) & 0xFFF])
        return value, size
2341 def get_metadata(self):
2342 """
2343 Parse standard properties streams, return an OleMetadata object
2344 containing all the available metadata.
2345 (also stored in the metadata attribute of the OleFileIO object)
2347 new in version 0.25
2348 """
2349 self.metadata = OleMetadata()
2350 self.metadata.parse_properties(self)
2351 return self.metadata
def get_userdefined_properties(self, filename, convert_time=False, no_conversion=None):
    """
    Return user-defined properties described in the given substream.

    :param filename: path of stream in storage tree (see openstream for syntax)
    :param convert_time: bool, if True timestamps will be converted to Python datetime
    :param no_conversion: None or list of int, timestamps not to be converted
        (for example total editing time is not a real timestamp)

    :returns: list of dicts, one per named property, each with a
        'property_name' key (str) and a 'value' key (parsed value, or None
        when the type is not handled). (Fixed doc: this method returns a
        list, not "a dictionary of values indexed by id".)
    """
    # REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
    # REFERENCE: https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-oshared/2ea8be67-a4a0-4e2e-b42f-49a182645562
    # 'D5CDD502-2E9C-101B-9397-08002B2CF9AE'
    # TODO: testing the code more rigorously
    # TODO: adding exception handling
    FMTID_USERDEFINED_PROPERTIES = _clsid(b'\x05\xD5\xCD\xD5\x9C\x2E\x1B\x10\x93\x97\x08\x00\x2B\x2C\xF9\xAE')

    # make sure no_conversion is a list, just to simplify code below:
    # (fix: compare against None with "is", not "==")
    if no_conversion is None:
        no_conversion = []
    # stream path as a string to report exceptions:
    streampath = filename
    if not isinstance(streampath, str):
        streampath = '/'.join(streampath)

    fp = self.openstream(filename)

    data = []

    # header
    s = fp.read(28)
    clsid = _clsid(s[8:24])

    # PropertySetStream.cSections (4 bytes starts at 1c): number of property sets in this stream
    sections_count = i32(s, 24)

    section_file_pointers = []

    try:
        for i in range(sections_count):
            # format id
            s = fp.read(20)
            fmtid = _clsid(s[:16])

            if fmtid == FMTID_USERDEFINED_PROPERTIES:
                file_pointer = i32(s, 16)
                fp.seek(file_pointer)
                # read saved sections; prepend 4 filler bytes so that the
                # offsets stored in the stream (relative to the section
                # start) can be used directly as indexes into s:
                s = b"****" + fp.read(i32(fp.read(4)) - 4)
                # number of properties:
                num_props = i32(s, 4)

                PropertyIdentifierAndOffset = s[8: 8+8*num_props]

                # property names (dictionary)
                # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/99127b7f-c440-4697-91a4-c853086d6b33
                index = 8+8*num_props
                entry_count = i32(s[index: index+4])
                index += 4
                for i in range(entry_count):
                    identifier = s[index: index+4]
                    str_size = i32(s[index+4: index+8])
                    string = s[index+8: index+8+str_size].decode('utf_8').strip('\0')
                    data.append({'property_name': string, 'value': None})
                    index = index+8+str_size
                # clamp num_props based on the data length
                num_props = min(num_props, int(len(s) / 8))

                # property values
                # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/f122b9d7-e5cf-4484-8466-83f6fd94b3cc
                # NOTE(review): starts at 2, presumably to skip the dictionary
                # (id 0) and codepage (id 1) entries; data[i-2] pairs each
                # value with the name collected above — TODO confirm vs spec
                for i in iterrange(2, num_props):
                    property_id = 0  # just in case of an exception
                    try:
                        property_id = i32(s, 8 + i * 8)
                        offset = i32(s, 12 + i * 8)
                        property_type = i32(s, offset)

                        vt_name = VT.get(property_type, 'UNKNOWN')
                        log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))

                        # test for common types first (should perhaps use
                        # a dictionary instead?)
                        if property_type == VT_I2:  # 16-bit signed integer
                            value = i16(s, offset + 4)
                            if value >= 32768:
                                value = value - 65536
                        elif property_type == 1:
                            # supposed to be VT_NULL but seems it is not NULL
                            str_size = i32(s, offset + 8)
                            value = s[offset + 12:offset + 12 + str_size - 1]
                        elif property_type == VT_UI2:  # 2-byte unsigned integer
                            value = i16(s, offset + 4)
                        elif property_type in (VT_I4, VT_INT, VT_ERROR):
                            # VT_I4: 32-bit signed integer
                            # VT_ERROR: HRESULT, similar to 32-bit signed integer,
                            # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
                            value = i32(s, offset + 4)
                        elif property_type in (VT_UI4, VT_UINT):  # 4-byte unsigned integer
                            value = i32(s, offset + 4)  # FIXME
                        elif property_type in (VT_BSTR, VT_LPSTR):
                            # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
                            # size is a 32 bits integer, including the null terminator, and
                            # possibly trailing or embedded null chars
                            # TODO: if codepage is unicode, the string should be converted as such
                            count = i32(s, offset + 4)
                            value = s[offset + 8:offset + 8 + count - 1]
                            # remove all null chars:
                            value = value.replace(b'\x00', b'')
                        elif property_type == VT_BLOB:
                            # binary large object (BLOB)
                            # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
                            count = i32(s, offset + 4)
                            value = s[offset + 8:offset + 8 + count]
                        elif property_type == VT_LPWSTR:
                            # UnicodeString
                            # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
                            # "the string should NOT contain embedded or additional trailing
                            # null characters."
                            count = i32(s, offset + 4)
                            value = self._decode_utf16_str(s[offset + 8:offset + 8 + count * 2])
                        elif property_type == VT_FILETIME:
                            value = long(i32(s, offset + 4)) + (long(i32(s, offset + 8)) << 32)
                            # FILETIME is a 64-bit int: "number of 100ns periods
                            # since Jan 1,1601".
                            if convert_time and property_id not in no_conversion:
                                log.debug('Converting property #%d to python datetime, value=%d=%fs'
                                          % (property_id, value, float(value) / 10000000))
                                # convert FILETIME to Python datetime.datetime
                                # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
                                _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
                                log.debug('timedelta days=%d' % (value // (10 * 1000000 * 3600 * 24)))
                                value = _FILETIME_null_date + datetime.timedelta(microseconds=value // 10)
                            else:
                                # legacy code kept for backward compatibility: returns a
                                # number of seconds since Jan 1,1601
                                value = value // 10000000  # seconds
                        elif property_type == VT_UI1:  # 1-byte unsigned integer
                            value = i8(s[offset + 4])
                        elif property_type == VT_CLSID:
                            value = _clsid(s[offset + 4:offset + 20])
                        elif property_type == VT_CF:
                            # PropertyIdentifier or ClipboardData??
                            # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
                            count = i32(s, offset + 4)
                            value = s[offset + 8:offset + 8 + count]
                        elif property_type == VT_BOOL:
                            # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
                            # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
                            value = bool(i16(s, offset + 4))
                        else:
                            value = None  # everything else yields "None"
                            log.debug(
                                'property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))

                        # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
                        # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
                        # see https://msdn.microsoft.com/en-us/library/dd942033.aspx

                        # FIXME: add support for VT_VECTOR
                        # VT_VECTOR is a 32 uint giving the number of items, followed by
                        # the items in sequence. The VT_VECTOR value is combined with the
                        # type of items, e.g. VT_VECTOR|VT_BSTR
                        # see https://msdn.microsoft.com/en-us/library/dd942011.aspx

                        data[i-2]['value'] = value
                    except BaseException as exc:
                        # catch exception while parsing each property, and only raise
                        # a DEFECT_INCORRECT, because parsing can go on
                        msg = 'Error while parsing property id %d in stream %s: %s' % (
                            property_id, repr(streampath), exc)
                        self._raise_defect(DEFECT_INCORRECT, msg, type(exc))

    except BaseException as exc:
        # catch exception while parsing property header, and only raise
        # a DEFECT_INCORRECT then return the partial data, because this is
        # not a fatal error when parsing the whole file
        msg = 'Error while parsing properties header in stream %s: %s' % (
            repr(streampath), exc)
        self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
        return data

    return data
2543# --------------------------------------------------------------------
2544# This script can be used to dump the directory of any OLE2 structured
2545# storage file.
def main():
    """
    Main function when olefile runs as a script from the command line.
    This will open an OLE2 file and display its structure and properties
    :return: nothing
    """
    import sys, optparse  # NOTE: optparse is deprecated, kept for compatibility

    DEFAULT_LOG_LEVEL = "warning"  # Default log level
    LOG_LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)

    parser.add_option("-c", action="store_true", dest="check_streams",
                      help='check all streams (for debugging purposes)')
    # fix: typo "propertires" in the help text
    parser.add_option("-p", action="store_true", dest="extract_customprop",
                      help='extract all user-defined properties')
    parser.add_option("-d", action="store_true", dest="debug_mode",
                      help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    print('olefile version {} {} - https://www.decalage.info/en/olefile\n'.format(__version__, __date__))

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    if options.debug_mode:
        options.loglevel = 'debug'

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')
    # also enable the module's logger:
    enable_logging()

    for filename in args:
        try:
            ole = OleFileIO(filename)  # , raise_defects=DEFECT_INCORRECT)
            print("-" * 68)
            print(filename)
            print("-" * 68)
            ole.dumpdirectory()
            for streamname in ole.listdir():
                # property streams conventionally start with a \x05 character:
                if streamname[-1][0] == "\005":
                    print("%r: properties" % streamname)
                    try:
                        props = ole.getproperties(streamname, convert_time=True)
                        props = sorted(props.items())
                        for k, v in props:
                            # [PL]: avoid to display too large or binary values:
                            if isinstance(v, (basestring, bytes)):
                                if len(v) > 50:
                                    v = v[:50]
                            if isinstance(v, bytes):
                                # quick and dirty binary check:
                                for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
                                          21,22,23,24,25,26,27,28,29,30,31):
                                    if c in bytearray(v):
                                        v = '(binary data)'
                                        break
                            print(" ", k, v)
                    except Exception:
                        log.exception('Error while parsing property stream %r' % streamname)

                    try:
                        if options.extract_customprop:
                            variables = ole.get_userdefined_properties(streamname, convert_time=True)
                            if len(variables):
                                print("%r: user-defined properties" % streamname)
                                for index, variable in enumerate(variables):
                                    print('\t{} {}: {}'.format(index, variable['property_name'], variable['value']))
                    except Exception:
                        # fix: was a bare "except:", which also swallowed
                        # KeyboardInterrupt/SystemExit; narrowed to Exception
                        # to match the other handlers in this function
                        log.exception('Error while parsing user-defined property stream %r' % streamname)

            if options.check_streams:
                # Read all streams to check if there are errors:
                print('\nChecking streams...')
                for streamname in ole.listdir():
                    # print name using repr() to convert binary chars to \xNN:
                    print('-', repr('/'.join(streamname)), '-', end=' ')
                    st_type = ole.get_type(streamname)
                    if st_type == STGTY_STREAM:
                        print('size %d' % ole.get_size(streamname))
                        # just try to read stream in memory:
                        ole.openstream(streamname)
                    else:
                        print('NOT a stream : type=%d' % st_type)
                print()

            print('Modification/Creation times of all directory entries:')
            for entry in ole.direntries:
                if entry is not None:
                    print('- {}: mtime={} ctime={}'.format(entry.name,
                                                           entry.getmtime(), entry.getctime()))
            print()

            # parse and display metadata:
            try:
                meta = ole.get_metadata()
                meta.dump()
            except Exception:
                log.exception('Error while parsing metadata')
            print()
            # [PL] Test a few new methods:
            root = ole.get_rootentry_name()
            print('Root entry name: "%s"' % root)
            if ole.exists('worddocument'):
                print("This is a Word document.")
                print("type of stream 'WordDocument':", ole.get_type('worddocument'))
                print("size :", ole.get_size('worddocument'))
                if ole.exists('macros/vba'):
                    print("This document may contain VBA macros.")

            # print parsing issues:
            print('\nNon-fatal issues raised during parsing:')
            if ole.parsing_issues:
                for exctype, msg in ole.parsing_issues:
                    print('- {}: {}'.format(exctype.__name__, msg))
            else:
                print('None')
            ole.close()
        except Exception:
            log.exception('Error while parsing file %r' % filename)
# Standard entry-point guard: run the command-line tool only when this
# module is executed directly, not when it is imported as a library.
if __name__ == "__main__":
    main()
2696# this code was developed while listening to The Wedding Present "Sea Monsters"