Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/olefile/olefile.py: 42%
1186 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:37 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:37 +0000
1"""
2olefile (formerly OleFileIO_PL)
4Module to read/write Microsoft OLE2 files (also called Structured Storage or
5Microsoft Compound Document File Format), such as Microsoft Office 97-2003
6documents, Image Composer and FlashPix files, Outlook messages, ...
7This version is compatible with Python 2.7 and 3.5+
9Project website: https://www.decalage.info/olefile
11olefile is copyright (c) 2005-2020 Philippe Lagadec
12(https://www.decalage.info)
14olefile is based on the OleFileIO module from the PIL library v1.1.7
15See: http://www.pythonware.com/products/pil/index.htm
16and http://svn.effbot.org/public/tags/pil-1.1.7/PIL/OleFileIO.py
18The Python Imaging Library (PIL) is
19Copyright (c) 1997-2009 by Secret Labs AB
20Copyright (c) 1995-2009 by Fredrik Lundh
22See source code and LICENSE.txt for information on usage and redistribution.
23"""
25# Since olefile v0.47, only Python 2.7 and 3.5+ are supported
26# This import enables print() as a function rather than a keyword
27# (main requirement to be compatible with Python 3.x)
28# The comment on the line below should be printed on Python 2.5 or older:
29from __future__ import print_function # This version of olefile requires Python 2.7 or 3.5+.
32#--- LICENSE ------------------------------------------------------------------
34# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2020 Philippe Lagadec
35# (https://www.decalage.info)
36#
37# All rights reserved.
38#
39# Redistribution and use in source and binary forms, with or without modification,
40# are permitted provided that the following conditions are met:
41#
42# * Redistributions of source code must retain the above copyright notice, this
43# list of conditions and the following disclaimer.
44# * Redistributions in binary form must reproduce the above copyright notice,
45# this list of conditions and the following disclaimer in the documentation
46# and/or other materials provided with the distribution.
47#
48# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
49# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
50# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
51# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
52# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
54# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
55# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
56# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
57# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59# ----------
60# PIL License:
61#
62# olefile is based on source code from the OleFileIO module of the Python
63# Imaging Library (PIL) published by Fredrik Lundh under the following license:
65# The Python Imaging Library (PIL) is
66# Copyright (c) 1997-2009 by Secret Labs AB
67# Copyright (c) 1995-2009 by Fredrik Lundh
68#
69# By obtaining, using, and/or copying this software and/or its associated
70# documentation, you agree that you have read, understood, and will comply with
71# the following terms and conditions:
72#
73# Permission to use, copy, modify, and distribute this software and its
74# associated documentation for any purpose and without fee is hereby granted,
75# provided that the above copyright notice appears in all copies, and that both
76# that copyright notice and this permission notice appear in supporting
77# documentation, and that the name of Secret Labs AB or the author(s) not be used
78# in advertising or publicity pertaining to distribution of the software
79# without specific, written prior permission.
80#
81# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
82# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
83# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL,
84# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
85# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
86# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
87# PERFORMANCE OF THIS SOFTWARE.
89__date__ = "2020-10-07"
90__version__ = '0.47.dev4'
91__author__ = "Philippe Lagadec"
93__all__ = ['isOleFile', 'OleFileIO', 'OleMetadata', 'enable_logging',
94 'MAGIC', 'STGTY_EMPTY',
95 'STGTY_STREAM', 'STGTY_STORAGE', 'STGTY_ROOT', 'STGTY_PROPERTY',
96 'STGTY_LOCKBYTES', 'MINIMAL_OLEFILE_SIZE',
97 'DEFECT_UNSURE', 'DEFECT_POTENTIAL', 'DEFECT_INCORRECT',
98 'DEFECT_FATAL', 'DEFAULT_PATH_ENCODING',
99 'MAXREGSECT', 'DIFSECT', 'FATSECT', 'ENDOFCHAIN', 'FREESECT',
100 'MAXREGSID', 'NOSTREAM', 'UNKNOWN_SIZE', 'WORD_CLSID',
101 'OleFileIONotClosed'
102]
104import io
105import sys
106import struct, array, os.path, datetime, logging, warnings, traceback
#=== COMPATIBILITY WORKAROUNDS ================================================

# For Python 3.x, need to redefine long as int:
# (on Python 3, str is not bytes, which distinguishes it from Python 2)
if str is not bytes:
    long = int

# Need to make sure we use xrange both on Python 2 and 3.x:
try:
    # on Python 2 we need xrange:
    iterrange = xrange
except Exception:
    # no xrange, for Python 3 it was renamed as range:
    iterrange = range

# [PL] workaround to fix an issue with array item size on 64 bits systems:
# pick whichever array typecode actually stores 32-bit items on this platform,
# so that FAT sector indexes are always read as 32-bit values:
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
elif array.array('i').itemsize == 4:
    # On 64 bit Jython, signed integers ('i') are the only way to store our 32
    # bit values in an array in a *somewhat* reasonable way, as the otherwise
    # perfectly suited 'H' (unsigned int, 32 bits) results in a completely
    # unusable behaviour. This is most likely caused by the fact that Java
    # doesn't have unsigned values, and thus Jython's "array" implementation,
    # which is based on "jarray", doesn't have them either.
    # NOTE: to trick Jython into converting the values it would normally
    # interpret as "signed" into "unsigned", a binary-and operation with
    # 0xFFFFFFFF can be used. This way it is possible to use the same comparing
    # operations on all platforms / implementations. The corresponding code
    # lines are flagged with a 'JYTHON-WORKAROUND' tag below.
    UINT32 = 'i'
else:
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')


# [PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
# TODO: remove the use of basestring, as it was removed in Python 3
# On Python 3, basestring does not exist, so alias it to str:
try:
    basestring
except NameError:
    basestring = str

if sys.version_info[0] < 3:
    # On Python 2.x, the default encoding for path names is UTF-8:
    DEFAULT_PATH_ENCODING = 'utf-8'
else:
    # On Python 3.x, the default encoding for path names is Unicode (None):
    DEFAULT_PATH_ENCODING = None
162# === LOGGING =================================================================
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger object suitable for this module, without touching the
    settings of the root logger (so other modules' logs do not show up on
    the screen).

    If a logger with the same name already exists it is reused, because
    creating it again would attach a second handler and every message
    would then appear twice.

    The default level is CRITICAL+1, which effectively disables all logging
    until the application (or enable_logging) lowers it.

    :param name: name of the logger (usually the module name)
    :param level: logging level for the returned logger
    :returns: logging.Logger instance
    """
    # NOTE: another less intrusive but more "hackish" way to detect an
    # existing logger would be getLogger + testing its effective level.
    already_registered = name in logging.Logger.manager.loggerDict
    new_logger = logging.getLogger(name)
    if not already_registered:
        # a brand new logger only gets a NullHandler: it is up to the
        # application to configure its own logging handlers:
        new_logger.addHandler(logging.NullHandler())
    # make sure the requested level is applied in both cases:
    new_logger.setLevel(level)
    return new_logger
# a global logger object used for debugging:
# (silent by default because of the CRITICAL+1 level; see enable_logging)
log = get_logger('olefile')
def enable_logging():
    """
    Enable logging for this module (disabled by default).

    This will set the module-specific logger level to NOTSET, which
    means the main application controls the actual logging level
    through its own logging configuration.
    """
    log.setLevel(logging.NOTSET)
#=== CONSTANTS ===============================================================

#: magic bytes that should be at the beginning of every OLE file:
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'

# [PL]: added constants for Sector IDs (from AAF specifications)
MAXREGSECT = 0xFFFFFFFA #: (-6) maximum SECT
DIFSECT = 0xFFFFFFFC #: (-4) denotes a DIFAT sector in a FAT
FATSECT = 0xFFFFFFFD #: (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE #: (-2) end of a virtual stream chain
FREESECT = 0xFFFFFFFF #: (-1) unallocated sector

# [PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID = 0xFFFFFFFA #: (-6) maximum directory entry ID
NOSTREAM = 0xFFFFFFFF #: (-1) unallocated directory entry

# [PL] object types in storage (from AAF specifications)
STGTY_EMPTY = 0 #: empty directory entry
STGTY_STORAGE = 1 #: element is a storage object
STGTY_STREAM = 2 #: element is a stream object
STGTY_LOCKBYTES = 3 #: element is an ILockBytes object
STGTY_PROPERTY = 4 #: element is an IPropertyStorage object
STGTY_ROOT = 5 #: element is a root storage

# Unknown size for a stream (used by OleStream):
UNKNOWN_SIZE = 0x7FFFFFFF

#
# --------------------------------------------------------------------
# property types (VT_* codes used in SummaryInformation property sets)

VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6;
VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11;
VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17;
VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23;
VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28;
VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64;
VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68;
VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72;
VT_VECTOR=0x1000;

# map property id to name (for debugging purposes)
# Note: vars() here is the module namespace; list() makes a snapshot so the
# dict is not modified while iterating (VT itself is being added to it).
VT = {}
for keyword, var in list(vars().items()):
    if keyword[:3] == "VT_":
        VT[var] = keyword

#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
# TODO: check Excel, PPT, ...

# [PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20 # a potential defect
DEFECT_INCORRECT = 30 # an error according to specifications, but parsing
                      # can go on
DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is
                  # impossible

# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFileIO.open)
MINIMAL_OLEFILE_SIZE = 1536
270#=== FUNCTIONS ===============================================================
def isOleFile(filename):
    """
    Test if a file is an OLE container (according to the magic bytes in its header).

    .. note::
        This function only checks the first 8 bytes of the file, not the
        rest of the OLE structure.

    .. versionadded:: 0.16

    :param filename: filename, contents or file-like object of the OLE file (string-like or file-like object)

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read and seek methods),
          it is parsed as-is.

    :type filename: bytes or str or unicode or file
    :returns: True if OLE, False otherwise.
    :rtype: bool
    """
    # check if filename is a string-like or file-like object:
    if hasattr(filename, 'read'):
        # file-like object: use it directly
        header = filename.read(len(MAGIC))
        # just in case, seek back to start of file:
        filename.seek(0)
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # filename is a bytes string containing the OLE file to be parsed:
        header = filename[:len(MAGIC)]
    else:
        # string-like object: filename of file on disk
        with open(filename, 'rb') as fp:
            header = fp.read(len(MAGIC))
    # idiomatic: return the comparison result directly instead of an
    # if/else that returns True/False explicitly:
    return header == MAGIC
# i8 extracts a single byte value (0-255) from a bytes string.
# The implementation differs between Python 2 and 3 because indexing a
# Python 2 str yields a 1-char string, while indexing Python 3 bytes
# yields an int directly.
if bytes is str:
    # version for Python 2.x
    def i8(c):
        # c is a 1-character string: convert it to its byte value
        return ord(c)
else:
    # version for Python 3.x
    def i8(c):
        # c may already be an int (from indexing bytes), else take its
        # first byte:
        return c if c.__class__ is int else c[0]
def i16(c, o = 0):
    """
    Convert 2 bytes (16 bits) at offset *o* in *c* to an unsigned integer,
    using little-endian byte order.

    :param c: string/bytes containing the bytes to convert
    :param o: offset of the bytes to convert in c
    :returns: int
    """
    raw = c[o:o+2]
    (value,) = struct.unpack("<H", raw)
    return value
def i32(c, o = 0):
    """
    Convert 4 bytes (32 bits) at offset *o* in *c* to an unsigned integer,
    using little-endian byte order.

    :param c: string/bytes containing the bytes to convert
    :param o: offset of the bytes to convert in c
    :returns: int
    """
    chunk = c[o:o+4]
    (value,) = struct.unpack("<I", chunk)
    return value
def _clsid(clsid):
    """
    Convert a 16-byte CLSID to its human-readable GUID string form
    (e.g. "00020900-0000-0000-C000-000000000046").

    :param clsid: string of length 16.
    :returns: str, empty string if the CLSID is only null bytes.
    """
    assert len(clsid) == 16
    # a CLSID made only of null bytes is rendered as an empty string:
    # (PL: why not simply return the string with zeroes?)
    if not clsid.strip(b"\0"):
        return ""
    # first three fields are little-endian integers, the last 8 bytes are
    # printed as-is:
    fmt = "%08X-%04X-%04X-%02X%02X-" + "%02X" * 6
    fields = (i32(clsid, 0), i16(clsid, 4), i16(clsid, 6))
    fields += tuple(map(i8, clsid[8:16]))
    return fmt % fields
def filetime2datetime(filetime):
    """
    Convert a FILETIME value (64 bits int, number of 100ns units since
    1601-01-01) to a Python datetime.datetime object.
    """
    # TODO: manage exception when microseconds is too large
    # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
    # the FILETIME epoch is January 1st, 1601:
    epoch = datetime.datetime(1601, 1, 1, 0, 0, 0)
    # one FILETIME unit is 100ns, i.e. 1/10 of a microsecond:
    return epoch + datetime.timedelta(microseconds=filetime // 10)
373#=== CLASSES ==================================================================
class OleFileError(IOError):
    """
    Base class for all errors raised by this module.
    Inherits from IOError for backward compatibility with older callers.
    """
    pass
class NotOleFileError(OleFileError):
    """
    Raised when the opened file is not an OLE container.
    """
    pass
class OleMetadata:
    """
    Parse and store metadata from the standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd942545.aspx
    - https://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - https://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/articles/summary-information-stream/
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:

    - https://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - https://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - https://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    New in version 0.25
    """

    # attribute names for SummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
                       'template', 'last_saved_by', 'revision_number', 'total_edit_time',
                       'last_printed', 'create_time', 'last_saved_time', 'num_pages',
                       'num_words', 'num_chars', 'thumbnail', 'creating_application',
                       'security']

    # attribute names for DocumentSummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
                      'slides', 'notes', 'hidden_slides', 'mm_clips',
                      'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
                      'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
                      'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
                      'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Constructor for OleMetadata.
        All attributes are set to None by default.
        """
        # every known property attribute starts out as None until
        # parse_properties fills it in:
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)

    def parse_properties(self, ole_file):
        """
        Parse standard properties of an OLE file, from the streams
        ``\\x05SummaryInformation`` and ``\\x05DocumentSummaryInformation``,
        if present.
        Properties are converted to strings, integers or python datetime objects.
        If a property is not present, its value is set to None.

        :param ole_file: OleFileIO object from which to parse properties
        """
        # first reset all attributes to None:
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)
        if ole_file.exists("\x05SummaryInformation"):
            # get properties from the stream:
            # (converting timestamps to python datetime, except total_edit_time,
            # which is property #10)
            props = ole_file.getproperties("\x05SummaryInformation",
                                           convert_time=True, no_conversion=[10])
            # ids for standard properties start at 0x01:
            for prop_id, attrib in enumerate(self.SUMMARY_ATTRIBS, start=1):
                setattr(self, attrib, props.get(prop_id, None))
        if ole_file.exists("\x05DocumentSummaryInformation"):
            # get properties from the stream:
            props = ole_file.getproperties("\x05DocumentSummaryInformation",
                                           convert_time=True)
            # ids for standard properties start at 0x01:
            for prop_id, attrib in enumerate(self.DOCSUM_ATTRIBS, start=1):
                setattr(self, attrib, props.get(prop_id, None))

    def dump(self):
        """
        Dump all metadata, for debugging purposes.
        """
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            print('- {}: {}'.format(prop, repr(getattr(self, prop))))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            print('- {}: {}'.format(prop, repr(getattr(self, prop))))
class OleFileIONotClosed(RuntimeWarning):
    """
    Warning emitted when an OleFileIO instance is destructed while its
    file handle is still open.
    """

    def __init__(self, stack_of_open=None):
        """
        :param stack_of_open: optional stack summary (with a format() method)
            captured when open() was called; included in the message if set.
        """
        super(OleFileIONotClosed, self).__init__()
        self.stack_of_open = stack_of_open

    def __str__(self):
        msg = ('Deleting OleFileIO instance with open file handle. '
               'You should ensure that OleFileIO is never deleted '
               'without calling close() first. Consider using '
               '"with OleFileIO(...) as ole: ...".')
        if not self.stack_of_open:
            return msg
        # append the stacktrace of the original open() call for debugging:
        parts = [msg, '\n', 'Stacktrace of open() call:\n']
        parts.extend(self.stack_of_open.format())
        return ''.join(parts)
561# --- OleStream ---------------------------------------------------------------
class OleStream(io.BytesIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the BytesIO class).
    To open a stream, use the openstream method in the OleFileIO class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:

        - size: actual size of data stream, after it was opened.
    """
    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize, olefileio):
        """
        Constructor for OleStream class: reads the whole stream into memory
        by following the FAT (or MiniFAT) sector chain starting at sect.

        :param fp: file object, the OLE container or the MiniFAT stream
        :param sect: sector index of first sector in the stream
        :param size: total size of the stream (may be UNKNOWN_SIZE)
        :param offset: offset in bytes for the first FAT or MiniFAT sector
        :param sectorsize: size of one sector
        :param fat: array/list of sector indexes (FAT or MiniFAT)
        :param filesize: size of OLE file (for debugging)
        :param olefileio: OleFileIO object containing this stream
        :returns: a BytesIO instance containing the OLE stream
        """
        log.debug('OleStream.__init__:')
        log.debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        self.ole = olefileio
        # this check is necessary, otherwise when attempting to open a stream
        # from a closed OleFileIO, a stream of size zero is returned without
        # raising an exception. (see issue #81)
        if self.ole.fp.closed:
            raise OSError('Attempting to open a stream from a closed OLE File')
        # [PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size == UNKNOWN_SIZE:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            log.debug(' stream with UNKNOWN SIZE')
        # number of sectors = size rounded up to a whole number of sectors:
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            self.ole._raise_defect(DEFECT_INCORRECT, 'malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            log.debug('size == 0 and sect != ENDOFCHAIN:')
            self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE sector index for empty stream')
        # [PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks: a FAT loop can never read more than
        # nb_sectors sectors.
        for i in range(nb_sectors):
            log.debug('Reading stream sector[%d] = %Xh' % (i, sect))
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    log.debug('Reached ENDOFCHAIN sector for stream with unknown size')
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE stream')
                    log.debug('sect=ENDOFCHAIN before expected size')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
##                tmp_data = b"".join(data)
##                f = open('test_debug.bin', 'wb')
##                f.write(tmp_data)
##                f.close()
##                log.debug('data read so far: %d bytes' % len(tmp_data))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
            # TODO: merge this code with OleFileIO.getsect() ?
            # TODO: check if this works with 4K sectors:
            try:
                # seek to the start of the sector within the container:
                fp.seek(offset + sectorsize * sect)
            except Exception:
                log.debug('sect=%d, seek=%d, filesize=%d' %
                          (sect, offset+sectorsize*sect, filesize))
                self.ole._raise_defect(DEFECT_INCORRECT, 'OLE sector index out of range')
                # stop reading here if the exception is ignored:
                break
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                          (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                sect = fat[sect] & 0xFFFFFFFF  # JYTHON-WORKAROUND
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
        # [PL] Last sector should be a "end of chain" marker:
        # if sect != ENDOFCHAIN:
        #     raise IOError('incorrect last sector index in OLE stream')
        data = b"".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            log.debug('Read data of length %d, truncated to stream size %d' % (len(data), size))
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            log.debug('Read data of length %d, the stream size was unknown' % len(data))
            self.size = len(data)
        else:
            # read data is less than expected:
            log.debug('Read data of length %d, less than expected stream size %d' % (len(data), size))
            # TODO: provide details in exception message
            self.size = len(data)
            self.ole._raise_defect(DEFECT_INCORRECT, 'OLE stream size is less than declared')
        # when all data is read in memory, BytesIO constructor is called
        io.BytesIO.__init__(self, data)
        # Then the OleStream object can be used as a read-only file object.
711# --- OleDirectoryEntry -------------------------------------------------------
713class OleDirectoryEntry:
714 """
715 OLE2 Directory Entry pointing to a stream or a storage
716 """
717 # struct to parse directory entries:
718 # <: little-endian byte order, standard sizes
719 # (note: this should guarantee that Q returns a 64 bits int)
720 # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
721 # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
722 # B: uint8, dir entry type (between 0 and 5)
723 # B: uint8, color: 0=black, 1=red
724 # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
725 # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
726 # I: uint32, index of child root node if it is a storage, else NOSTREAM
727 # 16s: CLSID, unique identifier (only used if it is a storage)
728 # I: uint32, user flags
729 # Q (was 8s): uint64, creation timestamp or zero
730 # Q (was 8s): uint64, modification timestamp or zero
731 # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
732 # of stream containing ministreams if root entry, 0 otherwise
733 # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
734 # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
735 STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII'
736 # size of a directory entry: 128 bytes
737 DIRENTRY_SIZE = 128
738 assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE
def __init__(self, entry, sid, ole_file):
    """
    Constructor for an OleDirectoryEntry object.
    Parses a 128-bytes entry from the OLE Directory stream.

    :param bytes entry: bytes string (must be 128 bytes long)
    :param int sid: index of this directory entry in the OLE file directory
    :param OleFileIO ole_file: OleFileIO object containing this directory entry
    """
    self.sid = sid
    # ref to ole_file is stored for future use
    self.olefile = ole_file
    # kids is a list of children entries, if this entry is a storage:
    # (list of OleDirectoryEntry objects)
    self.kids = []
    # kids_dict is a dictionary of children entries, indexed by their
    # name in lowercase: used to quickly find an entry, and to detect
    # duplicates
    self.kids_dict = {}
    # flag used to detect if the entry is referenced more than once in
    # directory:
    self.used = False
    # decode DirEntry (see STRUCT_DIRENTRY for the field layout)
    (
        self.name_raw,   # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
        self.namelength, # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
        self.entry_type,
        self.color,
        self.sid_left,
        self.sid_right,
        self.sid_child,
        clsid,
        self.dwUserFlags,
        self.createTime,
        self.modifyTime,
        self.isectStart,
        self.sizeLow,
        self.sizeHigh
    ) = struct.unpack(OleDirectoryEntry.STRUCT_DIRENTRY, entry)
    if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
        ole_file._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
    # only first directory entry can (and should) be root:
    if self.entry_type == STGTY_ROOT and sid != 0:
        ole_file._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
    if sid == 0 and self.entry_type != STGTY_ROOT:
        ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
    # log.debug(struct.unpack(fmt_entry, entry[:len_entry]))
    # name should be at most 31 unicode characters + null character,
    # so 64 bytes in total (31*2 + 2):
    if self.namelength > 64:
        ole_file._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes')
        # if exception not raised, namelength is set to the maximum value:
        self.namelength = 64
    # only characters without ending null char are kept:
    # (namelength includes the terminating null, hence the -2)
    self.name_utf16 = self.name_raw[:(self.namelength-2)]
    # TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
    # TODO: check if the name does not contain forbidden characters:
    # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
    # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
    self.name = ole_file._decode_utf16_str(self.name_utf16)

    log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
    log.debug(' - type: %d' % self.entry_type)
    log.debug(' - sect: %Xh' % self.isectStart)
    log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
        self.sid_right, self.sid_child))

    # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
    # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
    # or some other value so it cannot be raised as a defect in general:
    if ole_file.sectorsize == 512:
        if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF:
            log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                (ole_file.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh))
            ole_file._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
        self.size = self.sizeLow
    else:
        # NOTE(review): long() appears to be a py2/py3 compatibility alias
        # defined earlier in this module -- TODO confirm
        self.size = self.sizeLow + (long(self.sizeHigh)<<32)
    log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh))

    self.clsid = _clsid(clsid)
    # a storage should have a null size, BUT some implementations such as
    # Word 8 for Mac seem to allow non-null values => Potential defect:
    if self.entry_type == STGTY_STORAGE and self.size != 0:
        ole_file._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
    # check if stream is not already referenced elsewhere:
    self.is_minifat = False
    if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
        if self.size < ole_file.minisectorcutoff \
        and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
            # ministream object
            self.is_minifat = True
        else:
            self.is_minifat = False
        ole_file._check_duplicate_stream(self.isectStart, self.is_minifat)
    # sector chain is built lazily by build_sect_chain():
    self.sect_chain = None
def build_sect_chain(self, ole_file):
    """
    Build the sector chain for a stream (from the FAT or the MiniFAT)

    :param OleFileIO ole_file: OleFileIO object containing this directory entry
    :return: nothing
    """
    # TODO: seems to be used only from _write_mini_stream, is it useful?
    # TODO: use self.olefile instead of ole_file
    # nothing to do if the chain was already built:
    if self.sect_chain:
        return
    # only root and stream entries with actual data own a sector chain:
    if self.size == 0 or self.entry_type not in (STGTY_ROOT, STGTY_STREAM):
        return

    self.sect_chain = []

    # make sure the MiniFAT is loaded before walking it:
    if self.is_minifat and not ole_file.minifat:
        ole_file.loadminifat()

    # pick the relevant allocation table once, then follow the chain
    # of sector indexes until the end-of-chain marker:
    alloc_table = ole_file.minifat if self.is_minifat else ole_file.fat
    sect = self.isectStart
    while sect != ENDOFCHAIN:
        self.sect_chain.append(sect)
        sect = alloc_table[sect]
def build_storage_tree(self):
    """
    Read and build the red-black tree attached to this OleDirectoryEntry
    object, if it is a storage.
    Note that this method builds a tree of all subentries, so it should
    only be called for the root object once.
    """
    log.debug('build_storage_tree: SID=%d - %s - sid_child=%d'
        % (self.sid, repr(self.name), self.sid_child))
    # NOSTREAM as child SID means this entry has no children at all
    # (i.e. it is not a storage): nothing to build.
    if self.sid_child == NOSTREAM:
        return
    # walk the red-black tree of children and collect them into self.kids:
    self.append_kids(self.sid_child)

    # Note from OpenOffice documentation: the safest way is to
    # recreate the tree because some implementations may store broken
    # red-black trees...

    # in the OLE file, entries are sorted on (length, name).
    # for convenience, we re-sort them by name only, using the rich
    # comparison methods defined on this class:
    self.kids.sort()
def append_kids(self, child_sid):
    """
    Walk through red-black tree of children of this directory entry to add
    all of them to the kids list. (recursive method)

    :param child_sid: index of child directory entry to use, or None when called
        first time for the root. (only used during recursion)
    """
    log.debug('append_kids: child_sid=%d' % child_sid)
    # [PL] this method was added to use simple recursion instead of a complex
    # algorithm.
    # NOTE(review): recursion depth is bounded by the tree depth of the
    # directory; a deeply degenerate tree could hit Python's recursion limit.
    # if this is not a storage or a leaf of the tree, nothing to do:
    if child_sid == NOSTREAM:
        return
    # check if child SID is in the proper range:
    if child_sid<0 or child_sid>=len(self.olefile.direntries):
        self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE DirEntry index out of range')
    else:
        # get child direntry:
        child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
        log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
            % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
        # Check if kid was not already referenced in a storage:
        if child.used:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                'OLE Entry referenced more than once')
            return
        child.used = True
        # the directory entries are organized as a red-black tree.
        # (cf. Wikipedia for details)
        # First walk through left side of the tree:
        self.append_kids(child.sid_left)
        # Check if its name is not already used (case-insensitive):
        name_lower = child.name.lower()
        if name_lower in self.kids_dict:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                "Duplicate filename in OLE storage")
        # Then the child_sid OleDirectoryEntry object is appended to the
        # kids list and dictionary:
        self.kids.append(child)
        self.kids_dict[name_lower] = child
        # Finally walk through right side of the tree:
        self.append_kids(child.sid_right)
        # Afterwards build kid's own tree if it's also a storage:
        child.build_storage_tree()
def __eq__(self, other):
    "Compare entries by name"
    # used by build_storage_tree() when re-sorting kids by name
    return self.name == other.name
def __lt__(self, other):
    "Compare entries by name"
    # used by list.sort() when ordering kids by name
    return self.name < other.name
def __ne__(self, other):
    # negation of __eq__ (needed for Python 2 compatibility)
    return not self.__eq__(other)
def __le__(self, other):
    # defined in terms of __eq__ and __lt__ above
    return self.__eq__(other) or self.__lt__(other)

# Reflected __lt__() and __le__() will be used for __gt__() and __ge__()

# TODO: replace by the same function as MS implementation ?
# (order by name length first, then case-insensitive order)
def dump(self, tab = 0):
    "Dump this entry, and all its subentries (for debug purposes only)"
    # display names indexed by entry_type value:
    TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
             "(property)", "(root)"]
    try:
        type_name = TYPES[self.entry_type]
    except IndexError:
        type_name = '(UNKNOWN)'
    print(" "*tab + repr(self.name), type_name, end=' ')
    # only streams and the root entry have a meaningful size:
    if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
        print(self.size, "bytes", end=' ')
    print()
    if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
        print(" "*tab + "{%s}" % self.clsid)

    # recurse into children with increased indentation:
    for kid in self.kids:
        kid.dump(tab + 2)
def getmtime(self):
    """
    Return modification time of a directory entry.

    :returns: None if modification time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    # a null FILETIME value means the timestamp was never set:
    return filetime2datetime(self.modifyTime) if self.modifyTime else None
def getctime(self):
    """
    Return creation time of a directory entry.

    :returns: None if creation time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    # a null FILETIME value means the timestamp was never set:
    return filetime2datetime(self.createTime) if self.createTime else None
998#--- OleFileIO ----------------------------------------------------------------
class OleFileIO:
    """
    OLE container object

    This class encapsulates the interface to an OLE 2 structured
    storage file. Use the listdir and openstream methods to
    access the contents of this file.

    Object names are given as a list of strings, one for each subentry
    level. The root entry should be omitted. For example, the following
    code extracts all image streams from a Microsoft Image Composer file::

        with OleFileIO("fan.mic") as ole:

            for entry in ole.listdir():
                if entry[1:2] == "Image":
                    fin = ole.openstream(entry)
                    fout = open(entry[0:1], "wb")
                    while True:
                        s = fin.read(8192)
                        if not s:
                            break
                        fout.write(s)

    You can use the viewer application provided with the Python Imaging
    Library to view the resulting files (which happens to be standard
    TIFF files).
    """
def __init__(self, filename=None, raise_defects=DEFECT_FATAL,
             write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING):
    """
    Constructor for the OleFileIO class.

    :param filename: file to open.

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read, seek and tell methods),
          it is parsed as-is. The caller is responsible for closing it when done.

    :param raise_defects: minimal level for defects to be raised as exceptions.
        (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
        security-oriented application, see source code for details)

    :param write_mode: bool, if True the file is opened in read/write mode instead
        of read-only by default.

    :param debug: bool, set debug mode (deprecated, not used anymore)

    :param path_encoding: None or str, name of the codec to use for path
        names (streams and storages), or None for Unicode.
        Unicode by default on Python 3+, UTF-8 on Python 2.x.
        (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
    """
    # minimal level for defects to be raised as exceptions:
    self._raise_defects_level = raise_defects
    #: list of defects/issues not raised as exceptions:
    #: tuples of (exception type, message)
    self.parsing_issues = []
    self.write_mode = write_mode
    self.path_encoding = path_encoding
    # initialize all attributes to default values, so that they exist
    # even if open() fails or is never called:
    self._filesize = None
    self.ministream = None
    self._used_streams_fat = []
    self._used_streams_minifat = []
    self.byte_order = None
    self.directory_fp = None
    self.direntries = None
    self.dll_version = None
    self.fat = None
    self.first_difat_sector = None
    self.first_dir_sector = None
    self.first_mini_fat_sector = None
    self.fp = None
    self.header_clsid = None
    self.header_signature = None
    self.metadata = None
    self.mini_sector_shift = None
    self.mini_sector_size = None
    self.mini_stream_cutoff_size = None
    self.minifat = None
    self.minifatsect = None
    # TODO: duplicates?
    self.minisectorcutoff = None
    self.minisectorsize = None
    self.ministream = None
    self.minor_version = None
    self.nb_sect = None
    self.num_difat_sectors = None
    self.num_dir_sectors = None
    self.num_fat_sectors = None
    self.num_mini_fat_sectors = None
    self.reserved1 = None
    self.reserved2 = None
    self.root = None
    self.sector_shift = None
    self.sector_size = None
    self.transaction_signature_number = None
    # whether this object opened self.fp itself (and must close it):
    self._we_opened_fp = False
    # traceback of the open() call, kept for the not-closed warning:
    self._open_stack = None
    if filename:
        # try opening, ensure fp is closed if that fails
        try:
            self.open(filename, write_mode=write_mode)
        except Exception:
            # caller has no chance of calling close() now
            self._close(warn=False)
            raise
def __del__(self):
    """Destructor, ensures all file handles are closed that we opened."""
    # warn=True: emit a warning pointing at where open() was called,
    # since relying on the destructor means close() was never called.
    self._close(warn=True)
    # super(OleFileIO, self).__del__()  # there's no super-class destructor
def __enter__(self):
    # context manager entry: the file is already opened by __init__/open()
    return self
def __exit__(self, *args):
    # context manager exit: close our file handle (exceptions propagate)
    self._close(warn=False)
def _raise_defect(self, defect_level, message, exception_type=OleFileError):
    """
    This method should be called for any defect found during file parsing.
    It may raise an OleFileError exception according to the minimal level chosen
    for the OleFileIO object.

    :param defect_level: defect level, possible values are:

        - DEFECT_UNSURE    : a case which looks weird, but not sure it's a defect
        - DEFECT_POTENTIAL : a potential defect
        - DEFECT_INCORRECT : an error according to specifications, but parsing can go on
        - DEFECT_FATAL     : an error which cannot be ignored, parsing is impossible

    :param message: string describing the defect, used with raised exception.
    :param exception_type: exception class to be raised, OleFileError by default
    """
    # below the configured threshold: just record the issue and keep parsing
    if defect_level < self._raise_defects_level:
        self.parsing_issues.append((exception_type, message))
        log.warning(message)
        return
    # at or above the threshold: log and raise
    log.error(message)
    raise exception_type(message)
def _decode_utf16_str(self, utf16_str, errors='replace'):
    """
    Decode a string encoded in UTF-16 LE format, as found in the OLE
    directory or in property streams. Return a string encoded
    according to the path_encoding specified for the OleFileIO object.

    :param bytes utf16_str: bytes string encoded in UTF-16 LE format
    :param str errors: str, see python documentation for str.decode()
    :return: str, encoded according to path_encoding
    :rtype: str
    """
    decoded = utf16_str.decode('UTF-16LE', errors)
    # path_encoding=None: callers get the Unicode string as-is
    if not self.path_encoding:
        return decoded
    # otherwise re-encode to the codec configured for path names:
    return decoded.encode(self.path_encoding, errors)
def open(self, filename, write_mode=False):
    """
    Open an OLE2 file in read-only or read/write mode.
    Read and parse the header, FAT and directory.

    :param filename: string-like or file-like object, OLE file to parse

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read, seek and tell methods),
          it is parsed as-is. The caller is responsible for closing it when done

    :param write_mode: bool, if True the file is opened in read/write mode instead
        of read-only by default. (ignored if filename is not a path)
    """
    self.write_mode = write_mode
    # [PL] check if filename is a string-like or file-like object:
    # (it is better to check for a read() method)
    if hasattr(filename, 'read'):
        # TODO: also check seek and tell methods?
        # file-like object: use it directly
        self.fp = filename
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # filename is a bytes string containing the OLE file to be parsed:
        # convert it to BytesIO
        self.fp = io.BytesIO(filename)
    else:
        # string-like object: filename of file on disk
        if self.write_mode:
            # open file in mode 'read with update, binary'
            # According to https://docs.python.org/library/functions.html#open
            # 'w' would truncate the file, 'a' may only append on some Unixes
            mode = 'r+b'
        else:
            # read-only mode by default
            mode = 'rb'
        self.fp = open(filename, mode)
        # remember that we opened the handle (so close() must release it),
        # and where, for the warning emitted if it is never closed:
        self._we_opened_fp = True
        self._open_stack = traceback.extract_stack() # remember for warning
    # obtain the filesize by using seek and tell, which should work on most
    # file-like objects:
    # TODO: do it above, using getsize with filename when possible?
    # TODO: fix code to fail with clear exception when filesize cannot be obtained
    filesize = 0
    self.fp.seek(0, os.SEEK_END)
    try:
        filesize = self.fp.tell()
    finally:
        self.fp.seek(0)
    self._filesize = filesize
    log.debug('File size: %d bytes (%Xh)' % (self._filesize, self._filesize))

    # lists of streams in FAT and MiniFAT, to detect duplicate references
    # (list of indexes of first sectors of each stream)
    self._used_streams_fat = []
    self._used_streams_minifat = []

    header = self.fp.read(512)

    if len(header) != 512 or header[:8] != MAGIC:
        log.debug('Magic = {!r} instead of {!r}'.format(header[:8], MAGIC))
        self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file", NotOleFileError)

    # [PL] header structure according to AAF specifications:
    ##Header
    ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
    ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
    ## // 0x1a, 0xe1} for current version
    ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/
    ## // GetClassFile uses root directory class id)
    ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
    ## // written by reference implementation
    ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for
    ## // 512-byte sectors, 4 for 4 KB sectors
    ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
    ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two;
    ## // typically 9 indicating 512-byte sectors
    ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
    ## // typically 6 indicating 64-byte mini-sectors
    ##USHORT _usReserved; // [22H,02] reserved, must be zero
    ##ULONG _ulReserved1; // [24H,04] reserved, must be zero
    ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
    ## // number of SECTs in directory chain for 4 KB
    ## // sectors
    ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
    ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
    ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
    ## // be zero. The reference implementation
    ## // does not support transactions
    ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
    ## // typically 4096 bytes
    ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
    ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
    ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
    ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
    ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
    ##};

    # [PL] header decoding:
    # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
    fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
    header_size = struct.calcsize(fmt_header)
    log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
    header1 = header[:header_size]
    (
        self.header_signature,
        self.header_clsid,
        self.minor_version,
        self.dll_version,
        self.byte_order,
        self.sector_shift,
        self.mini_sector_shift,
        self.reserved1,
        self.reserved2,
        self.num_dir_sectors,
        self.num_fat_sectors,
        self.first_dir_sector,
        self.transaction_signature_number,
        self.mini_stream_cutoff_size,
        self.first_mini_fat_sector,
        self.num_mini_fat_sectors,
        self.first_difat_sector,
        self.num_difat_sectors
    ) = struct.unpack(fmt_header, header1)
    log.debug( struct.unpack(fmt_header, header1))

    if self.header_signature != MAGIC:
        # OLE signature should always be present
        self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
    if self.header_clsid != bytearray(16):
        # according to AAF specs, CLSID should always be zero
        self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
    log.debug( "Minor Version = %d" % self.minor_version )
    # TODO: according to MS-CFB, minor version should be 0x003E
    log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version )
    if self.dll_version not in [3, 4]:
        # version 3: usual format, 512 bytes per sector
        # version 4: large format, 4K per sector
        self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
    log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order )
    if self.byte_order != 0xFFFE:
        # For now only common little-endian documents are handled correctly
        self._raise_defect(DEFECT_INCORRECT, "incorrect ByteOrder in OLE header")
        # TODO: add big-endian support for documents created on Mac ?
        # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE.
    self.sector_size = 2**self.sector_shift
    log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size )
    if self.sector_size not in [512, 4096]:
        self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header")
    if (self.dll_version==3 and self.sector_size!=512) \
    or (self.dll_version==4 and self.sector_size!=4096):
        self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header")
    self.mini_sector_size = 2**self.mini_sector_shift
    log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size )
    if self.mini_sector_size not in [64]:
        self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header")
    if self.reserved1 != 0 or self.reserved2 != 0:
        self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
    log.debug( "Number of Directory sectors = %d" % self.num_dir_sectors )
    # Number of directory sectors (only allowed if DllVersion != 3)
    if self.sector_size==512 and self.num_dir_sectors!=0:
        self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header")
    log.debug( "Number of FAT sectors = %d" % self.num_fat_sectors )
    # num_fat_sectors = number of FAT sectors in the file
    log.debug( "First Directory sector = %Xh" % self.first_dir_sector )
    # first_dir_sector = 1st sector containing the directory
    log.debug( "Transaction Signature Number = %d" % self.transaction_signature_number )
    # Signature should be zero, BUT some implementations do not follow this
    # rule => only a potential defect:
    # (according to MS-CFB, may be != 0 for applications supporting file
    # transactions)
    if self.transaction_signature_number != 0:
        self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)")
    log.debug( "Mini Stream cutoff size = %Xh (expected: 1000h)" % self.mini_stream_cutoff_size )
    # MS-CFB: This integer field MUST be set to 0x00001000. This field
    # specifies the maximum size of a user-defined data stream allocated
    # from the mini FAT and mini stream, and that cutoff is 4096 bytes.
    # Any user-defined data stream larger than or equal to this cutoff size
    # must be allocated as normal sectors from the FAT.
    if self.mini_stream_cutoff_size != 0x1000:
        self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header")
        # if no exception is raised, the cutoff size is fixed to 0x1000
        log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' %
                    self.mini_stream_cutoff_size)
        self.mini_stream_cutoff_size = 0x1000
    # TODO: check if these values are OK
    log.debug( "First MiniFAT sector = %Xh" % self.first_mini_fat_sector )
    log.debug( "Number of MiniFAT sectors = %d" % self.num_mini_fat_sectors )
    log.debug( "First DIFAT sector = %Xh" % self.first_difat_sector )
    log.debug( "Number of DIFAT sectors = %d" % self.num_difat_sectors )

    # calculate the number of sectors in the file
    # (-1 because header doesn't count)
    self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1
    log.debug( "Maximum number of sectors in the file: %d (%Xh)" % (self.nb_sect, self.nb_sect))
    # TODO: change this test, because an OLE file MAY contain other data
    # after the last sector.

    # file clsid
    self.header_clsid = _clsid(header[8:24])

    # TODO: remove redundant attributes, and fix the code which uses them?
    self.sectorsize = self.sector_size #1 << i16(header, 30)
    self.minisectorsize = self.mini_sector_size #1 << i16(header, 32)
    self.minisectorcutoff = self.mini_stream_cutoff_size # i32(header, 56)

    # check known streams for duplicate references (these are always in FAT,
    # never in MiniFAT):
    self._check_duplicate_stream(self.first_dir_sector)
    # check MiniFAT only if it is not empty:
    if self.num_mini_fat_sectors:
        self._check_duplicate_stream(self.first_mini_fat_sector)
    # check DIFAT only if it is not empty:
    if self.num_difat_sectors:
        self._check_duplicate_stream(self.first_difat_sector)

    # Load file allocation tables
    self.loadfat(header)
    # Load directory. This sets both the direntries list (ordered by sid)
    # and the root (ordered by hierarchy) members.
    self.loaddirectory(self.first_dir_sector)
    self.minifatsect = self.first_mini_fat_sector
def close(self):
    """
    close the OLE file, release the file object if we created it ourselves.

    Leaves the file handle open if it was provided by the caller.
    """
    # warn=False: an explicit close() is the expected usage, no warning needed
    self._close(warn=False)
def _close(self, warn=False):
    """Implementation of close() with internal arg `warn`."""
    # only release file handles that this object opened itself;
    # handles supplied by the caller are left untouched.
    if not self._we_opened_fp:
        return
    if warn:
        # point the user at where open() was originally called
        warnings.warn(OleFileIONotClosed(self._open_stack))
    self.fp.close()
    # avoid double-close on repeated calls:
    self._we_opened_fp = False
def _check_duplicate_stream(self, first_sect, minifat=False):
    """
    Checks if a stream has not been already referenced elsewhere.
    This method should only be called once for each known stream, and only
    if stream size is not null.

    :param first_sect: int, index of first sector of the stream in FAT
    :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT
    """
    if minifat:
        log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect)
        used_streams = self._used_streams_minifat
    else:
        log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect)
        # some values can be safely ignored (not a real stream):
        if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT):
            return
        used_streams = self._used_streams_fat
    # TODO: would it be more efficient using a dict or hash values, instead
    # of a list of long ?
    if first_sect in used_streams:
        # same first sector seen twice => two directory entries (or header
        # fields) claim the same sector chain, which is a malformed file:
        self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice')
    else:
        used_streams.append(first_sect)
def dumpfat(self, fat, firstindex=0):
    """
    Display a part of FAT in human-readable form for debugging purposes
    """
    # dictionary to convert special FAT values in human-readable strings
    VPL = 8 # values per line (8+1 * 8+1 = 81)
    fatnames = {
        FREESECT: "..free..",
        ENDOFCHAIN: "[ END. ]",
        FATSECT: "FATSECT ",
        DIFSECT: "DIFSECT "
    }
    nbsect = len(fat)
    nlines = (nbsect+VPL-1)//VPL
    # header row: column offsets in hex
    print("index", end=" ")
    for i in range(VPL):
        print("%8X" % i, end=" ")
    print()
    for l in range(nlines):
        index = l*VPL
        print("%6X:" % (firstindex+index), end=" ")
        for i in range(index, index+VPL):
            if i>=nbsect:
                break
            sect = fat[i]
            aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND
            if aux in fatnames:
                name = fatnames[aux]
            else:
                # an entry pointing at the next sector is a simple chain:
                if sect == i+1:
                    name = "    --->"
                else:
                    name = "%8X" % sect
            print(name, end=" ")
        print()
def dumpsect(self, sector, firstindex=0):
    """
    Display a sector in a human-readable form, for debugging purposes
    """
    VPL=8 # number of values per line (8+1 * 8+1 = 81)
    tab = array.array(UINT32, sector)
    # stored values are little-endian; swap on big-endian hosts:
    if sys.byteorder == 'big':
        tab.byteswap()
    nbsect = len(tab)
    nlines = (nbsect+VPL-1)//VPL
    # header row: column offsets in hex
    print("index", end=" ")
    for i in range(VPL):
        print("%8X" % i, end=" ")
    print()
    for l in range(nlines):
        index = l*VPL
        print("%6X:" % (firstindex+index), end=" ")
        for i in range(index, index+VPL):
            if i>=nbsect:
                break
            sect = tab[i]
            name = "%8X" % sect
            print(name, end=" ")
        print()
def sect2array(self, sect):
    """
    convert a sector to an array of 32 bits unsigned integers,
    swapping bytes on big endian CPUs such as PowerPC (old Macs)
    """
    # TODO: make this a static function
    arr = array.array(UINT32, sect)
    # on-disk format is little-endian; swap on big-endian hosts
    # (sys.byteorder is either 'little' or 'big'):
    if sys.byteorder != 'little':
        arr.byteswap()
    return arr
def loadfat_sect(self, sect):
    """
    Adds the indexes of the given sector to the FAT

    :param sect: string containing the first FAT sector, or array of long integers
    :returns: index of last FAT sector.
    """
    # a FAT sector is an array of ulong integers.
    if isinstance(sect, array.array):
        # if sect is already an array it is directly used
        fat1 = sect
    else:
        # if it's a raw sector, it is parsed in an array
        fat1 = self.sect2array(sect)
        # Display the sector contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            self.dumpsect(sect)
    # The FAT is a sector chain starting at the first index of itself.
    # initialize isect, just in case the loop body never runs:
    isect = None
    for isect in fat1:
        isect = isect & 0xFFFFFFFF # JYTHON-WORKAROUND
        log.debug("isect = %X" % isect)
        if isect == ENDOFCHAIN or isect == FREESECT:
            # the end of the sector chain has been reached
            log.debug("found end of sector chain")
            break
        # read the FAT sector
        s = self.getsect(isect)
        # parse it as an array of 32 bits integers, and add it to the
        # global FAT array
        nextfat = self.sect2array(s)
        self.fat = self.fat + nextfat
    return isect
def loadfat(self, header):
    """
    Load the FAT table.

    :param header: bytes, the 512-byte OLE header sector. Bytes 76:512 hold
        the first 109 FAT sector indexes; any further FAT sectors are reached
        through the DIFAT chain (see below).
    """
    # The 1st sector of the file contains sector numbers for the first 109
    # FAT sectors, right after the header which is 76 bytes long.
    # (always 109, whatever the sector size: 512 bytes = 76+4*109)
    # Additional sectors are described by DIF blocks
    log.debug('Loading the FAT table, starting with the 1st sector after the header')
    sect = header[76:512]
    log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )
    # fat = []
    # FAT is an array of 32 bits unsigned ints, it's more effective
    # to use an array than a list in Python.
    # It's initialized as empty first:
    self.fat = array.array(UINT32)
    self.loadfat_sect(sect)
    # self.dumpfat(self.fat)
    # for i in range(0, len(sect), 4):
    #     ix = i32(sect, i)
    #     # [PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
    #     if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
    #         break
    #     s = self.getsect(ix)
    #     # fat = fat + [i32(s, i) for i in range(0, len(s), 4)]
    #     fat = fat + array.array(UINT32, s)
    if self.num_difat_sectors != 0:
        log.debug('DIFAT is used, because file size > 6.8MB.')
        # [PL] There's a DIFAT because file is larger than 6.8MB
        # some checks just in case:
        if self.num_fat_sectors <= 109:
            # there must be at least 109 blocks in header and the rest in
            # DIFAT, so number of sectors must be >109.
            self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
        if self.first_difat_sector >= self.nb_sect:
            # initial DIFAT block index must be valid
            self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
        log.debug( "DIFAT analysis..." )
        # We compute the necessary number of DIFAT sectors :
        # Number of pointers per DIFAT sector = (sectorsize/4)-1
        # (-1 because the last pointer is the next DIFAT sector number)
        nb_difat_sectors = (self.sectorsize//4)-1
        # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
        # ceiling division: number of DIFAT sectors needed for the FAT sectors
        # beyond the 109 listed in the header:
        nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors
        log.debug( "nb_difat = %d" % nb_difat )
        if self.num_difat_sectors != nb_difat:
            raise IOError('incorrect DIFAT')
        isect_difat = self.first_difat_sector
        for i in iterrange(nb_difat):
            log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
            # TODO: check if corresponding FAT SID = DIFSECT
            sector_difat = self.getsect(isect_difat)
            difat = self.sect2array(sector_difat)
            # Display the sector contents only if the logging level is debug:
            if log.isEnabledFor(logging.DEBUG):
                self.dumpsect(sector_difat)
            # all but the last index of a DIFAT sector point to FAT sectors:
            self.loadfat_sect(difat[:nb_difat_sectors])
            # last DIFAT pointer is next DIFAT sector:
            isect_difat = difat[nb_difat_sectors]
            log.debug( "next DIFAT sector: %X" % isect_difat )
        # checks:
        if isect_difat not in [ENDOFCHAIN, FREESECT]:
            # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
            raise IOError('incorrect end of DIFAT')
        # if len(self.fat) != self.num_fat_sectors:
        #     # FAT should contain num_fat_sectors blocks
        #     print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors))
        #     raise IOError('incorrect DIFAT')
    else:
        log.debug('No DIFAT, because file size < 6.8MB.')
    # since FAT is read from fixed-size sectors, it may contain more values
    # than the actual number of sectors in the file.
    # Keep only the relevant sector indexes:
    if len(self.fat) > self.nb_sect:
        log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
        self.fat = self.fat[:self.nb_sect]
    log.debug('FAT references %d sectors / Maximum %d sectors in file' % (len(self.fat), self.nb_sect))
    # Display the FAT contents only if the logging level is debug:
    if log.isEnabledFor(logging.DEBUG):
        log.debug('\nFAT:')
        self.dumpfat(self.fat)
def loadminifat(self):
    """
    Load the MiniFAT table.
    """
    # The MiniFAT lives in a standard sub-stream, located via a header field.
    # Two distinct sizes matter here:
    # 1) allocated size: number of MiniFAT sectors declared in the OLE
    #    header times the sector size (may exceed what is actually used);
    # 2) used size: one 32-bit index per mini sector of the MiniStream,
    #    whose total size comes from the root directory entry.
    stream_size = self.num_mini_fat_sectors * self.sector_size
    nb_minisectors = (self.root.size + self.mini_sector_size - 1) // self.mini_sector_size
    used_size = nb_minisectors * 4
    log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
        (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors))
    if used_size > stream_size:
        # Not necessarily fatal, but hints at a non-conformant writer:
        self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
    # Read the whole allocated stream first, then parse it as an array of
    # 32-bit unsigned integers:
    raw_minifat = self._open(self.minifatsect, stream_size, force_FAT=True).read()
    self.minifat = self.sect2array(raw_minifat)
    # Drop trailing indexes not backed by the MiniStream, so no index can
    # point past its end:
    log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
    self.minifat = self.minifat[:nb_minisectors]
    log.debug('loadminifat(): len=%d' % len(self.minifat))
    # Dump the MiniFAT contents only when debug logging is enabled:
    if log.isEnabledFor(logging.DEBUG):
        log.debug('\nMiniFAT:')
        self.dumpfat(self.minifat)
def getsect(self, sect):
    """
    Read given sector from file on disk.

    :param sect: int, sector index
    :returns: a string containing the sector data.
    """
    # Per [MS-CFB], sector n starts at byte offset (n+1) * sector size,
    # because the 512-byte header occupies the first "sector" of the file.
    # (the original PIL code hardcoded 512 and broke with 4KB sectors)
    offset = self.sectorsize * (sect + 1)
    try:
        self.fp.seek(offset)
    except Exception:
        log.debug('getsect(): sect=%X, seek=%d, filesize=%d' %
            (sect, self.sectorsize*(sect+1), self._filesize))
        self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
    sector = self.fp.read(self.sectorsize)
    # a short read means the sector index points past the end of file:
    if len(sector) != self.sectorsize:
        log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
            (sect, len(sector), self.sectorsize))
        self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
    return sector
def write_sect(self, sect, data, padding=b'\x00'):
    """
    Write given sector to file on disk.

    :param sect: int, sector index
    :param data: bytes, sector data
    :param padding: single byte, padding character if data < sector size
    :raises TypeError: if data or padding has the wrong type
    :raises ValueError: if data is larger than one sector
    """
    if not isinstance(data, bytes):
        raise TypeError("write_sect: data must be a bytes string")
    if not isinstance(padding, bytes) or len(padding)!=1:
        raise TypeError("write_sect: padding must be a bytes string of 1 char")
    # TODO: we could allow padding=None for no padding at all
    # sector n starts at byte offset (n+1) * sectorsize (header = sector -1):
    try:
        self.fp.seek(self.sectorsize * (sect+1))
    except Exception:
        log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' %
            (sect, self.sectorsize*(sect+1), self._filesize))
        self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
    if len(data) < self.sectorsize:
        # add padding
        data += padding * (self.sectorsize - len(data))
    elif len(data) > self.sectorsize:
        # BUGFIX: this condition was "< self.sectorsize" (unreachable after
        # the first branch), so oversized data was silently written and
        # overflowed into the following sector.
        raise ValueError("Data is larger than sector size")
    self.fp.write(data)
1723 def _write_mini_sect(self, fp_pos, data, padding = b'\x00'):
1724 """
1725 Write given sector to file on disk.
1727 :param fp_pos: int, file position
1728 :param data: bytes, sector data
1729 :param padding: single byte, padding character if data < sector size
1730 """
1731 if not isinstance(data, bytes):
1732 raise TypeError("write_mini_sect: data must be a bytes string")
1733 if not isinstance(padding, bytes) or len(padding) != 1:
1734 raise TypeError("write_mini_sect: padding must be a bytes string of 1 char")
1736 try:
1737 self.fp.seek(fp_pos)
1738 except Exception:
1739 log.debug('write_mini_sect(): fp_pos=%d, filesize=%d' %
1740 (fp_pos, self._filesize))
1741 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1742 len_data = len(data)
1743 if len_data < self.mini_sector_size:
1744 data += padding * (self.mini_sector_size - len_data)
1745 if self.mini_sector_size < len_data:
1746 raise ValueError("Data is larger than sector size")
1747 self.fp.write(data)
def loaddirectory(self, sect):
    """
    Load the directory.

    :param sect: sector index of directory stream.
    """
    log.debug('Loading the Directory:')
    # The directory is a standard stream read through the FAT; its actual
    # size is not known in advance, so it is opened with force_FAT:
    self.directory_fp = self._open(sect, force_FAT=True)

    # Bound the number of directory entries by the stream size (one entry is
    # 128 bytes), to resist malformed documents and DoS attacks:
    max_entries = self.directory_fp.size // 128
    log.debug('loaddirectory: size=%d, max_entries=%d' %
        (self.directory_fp.size, max_entries))

    # Entries are loaded lazily; None marks a slot not read yet:
    self.direntries = [None] * max_entries
    # The root entry always has SID 0; load it first:
    root_entry = self._load_direntry(0)
    self.root = self.direntries[0]
    # Then build the whole storage tree by walking from the root:
    self.root.build_storage_tree()
1789 def _load_direntry (self, sid):
1790 """
1791 Load a directory entry from the directory.
1792 This method should only be called once for each storage/stream when
1793 loading the directory.
1795 :param sid: index of storage/stream in the directory.
1796 :returns: a OleDirectoryEntry object
1798 :exception OleFileError: if the entry has always been referenced.
1799 """
1800 # check if SID is OK:
1801 if sid<0 or sid>=len(self.direntries):
1802 self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
1803 # check if entry was already referenced:
1804 if self.direntries[sid] is not None:
1805 self._raise_defect(DEFECT_INCORRECT,
1806 "double reference for OLE stream/storage")
1807 # if exception not raised, return the object
1808 return self.direntries[sid]
1809 self.directory_fp.seek(sid * 128)
1810 entry = self.directory_fp.read(128)
1811 self.direntries[sid] = OleDirectoryEntry(entry, sid, self)
1812 return self.direntries[sid]
def dumpdirectory(self):
    """
    Dump the directory tree (for debugging only).
    """
    # the root entry recursively dumps all of its kids:
    self.root.dump()
def _open(self, start, size = UNKNOWN_SIZE, force_FAT=False):
    """
    Open a stream, either in FAT or MiniFAT according to its size.
    (openstream helper)

    :param start: index of first sector
    :param size: size of stream (or nothing if size is unknown)
    :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT
        according to size. If True, it will always be opened in FAT.
    :returns: an OleStream file-like object (read-only)
    """
    log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' %
        (start, size, str(force_FAT)))
    # stream size is compared to the mini_stream_cutoff_size threshold:
    if size < self.minisectorcutoff and not force_FAT:
        # ministream object
        if not self.ministream:
            # load MiniFAT if it wasn't already done:
            self.loadminifat()
            # The first sector index of the miniFAT stream is stored in the
            # root directory entry:
            size_ministream = self.root.size
            log.debug('Opening MiniStream: sect=%Xh, size=%d' %
                (self.root.isectStart, size_ministream))
            # NOTE: the MiniStream itself lives in regular FAT sectors, so it
            # is opened recursively with force_FAT=True, and cached on self
            # for all subsequent small-stream opens:
            self.ministream = self._open(self.root.isectStart,
                size_ministream, force_FAT=True)
        # small stream: sectors are mini sectors inside the MiniStream
        # (offset=0 because mini sector 0 starts at the MiniStream start):
        return OleStream(fp=self.ministream, sect=start, size=size,
                         offset=0, sectorsize=self.minisectorsize,
                         fat=self.minifat, filesize=self.ministream.size,
                         olefileio=self)
    else:
        # standard stream: regular sectors read straight from the file,
        # offset by one sector size to skip the OLE header:
        return OleStream(fp=self.fp, sect=start, size=size,
                         offset=self.sectorsize,
                         sectorsize=self.sectorsize, fat=self.fat,
                         filesize=self._filesize,
                         olefileio=self)
def _list(self, files, prefix, node, streams=True, storages=False):
    """
    listdir helper: recursively collect stream/storage paths under *node*.

    :param files: list of files to fill in
    :param prefix: current location in storage tree (list of names)
    :param node: current node (OleDirectoryEntry object)
    :param streams: bool, include streams if True (True by default) - new in v0.26
    :param storages: bool, include storages if True (False by default) - new in v0.26
        (note: the root storage is never included)
    """
    path = prefix + [node.name]
    for kid in node.kids:
        kind = kid.entry_type
        if kind == STGTY_STORAGE:
            if storages:
                # record the storage itself; path[1:] strips the root name
                files.append(path[1:] + [kid.name])
            # always descend into the storage's children:
            self._list(files, path, kid, streams, storages)
        elif kind == STGTY_STREAM:
            if streams:
                files.append(path[1:] + [kid.name])
        else:
            # anything else in the tree is a structural defect:
            self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an entry which is not a stream nor a storage.')
def listdir(self, streams=True, storages=False):
    """
    Return a list of streams and/or storages stored in this file

    :param streams: bool, include streams if True (True by default) - new in v0.26
    :param storages: bool, include storages if True (False by default) - new in v0.26
        (note: the root storage is never included)
    :returns: list of stream and/or storage paths
    """
    # delegate the recursive walk to _list, starting at the root:
    paths = []
    self._list(paths, [], self.root, streams, storages)
    return paths
def _find(self, filename):
    """
    Returns directory entry of given filename. (openstream helper)
    Note: this method is case-insensitive.

    :param filename: path of stream in storage tree (except root entry), either:

        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - or a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']

    :returns: sid of requested filename
    :exception IOError: if file not found
    """
    # normalize a slash-separated string into a list of path components
    # (basestring is an alias covering both str types on Python 2 and 3):
    if isinstance(filename, basestring):
        filename = filename.split('/')
    # descend the storage tree one component at a time, matching names
    # case-insensitively:
    node = self.root
    for name in filename:
        for kid in node.kids:
            if kid.name.lower() == name.lower():
                node = kid
                break
        else:
            # no kid matched this component:
            raise IOError("file not found")
    return node.sid
def openstream(self, filename):
    """
    Open a stream as a read-only file object (BytesIO).
    Note: filename is case-insensitive.

    :param filename: path of stream in storage tree (except root entry), either:

        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - or a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']

    :returns: file object (read-only)
    :exception IOError: if filename not found, or if this is not a stream.
    """
    entry = self.direntries[self._find(filename)]
    # only leaf streams can be opened, not storages:
    if entry.entry_type != STGTY_STREAM:
        raise IOError("this file is not a stream")
    return self._open(entry.isectStart, entry.size)
def _write_mini_stream(self, entry, data_to_write):
    """
    Overwrite an existing small stream stored in the MiniStream.
    (write_stream helper)

    :param entry: OleDirectoryEntry of the stream to overwrite
    :param data_to_write: bytes, same size as the existing stream
    """
    # make sure the mini-sector chain of the stream is known:
    if not entry.sect_chain:
        entry.build_sect_chain(self)
    nb_sectors = len(entry.sect_chain)

    # the root entry's chain lists the regular sectors holding the MiniStream:
    if not self.root.sect_chain:
        self.root.build_sect_chain(self)
    # number of mini sectors that fit in one regular sector:
    block_size = self.sector_size // self.mini_sector_size
    for idx, sect in enumerate(entry.sect_chain):
        # map the mini sector index to (regular sector, offset within it),
        # then to an absolute file position (+1 sector skips the OLE header):
        sect_base = sect // block_size
        sect_offset = sect % block_size
        fp_pos = (self.root.sect_chain[sect_base] + 1)*self.sector_size + sect_offset*self.mini_sector_size
        # slice one mini sector of data; the last slice may be shorter and
        # is padded by _write_mini_sect:
        if idx < (nb_sectors - 1):
            data_per_sector = data_to_write[idx * self.mini_sector_size: (idx + 1) * self.mini_sector_size]
        else:
            data_per_sector = data_to_write[idx * self.mini_sector_size:]
        self._write_mini_sect(fp_pos, data_per_sector)
def write_stream(self, stream_name, data):
    """
    Write a stream to disk. For now, it is only possible to replace an
    existing stream by data of the same size.

    :param stream_name: path of stream in storage tree (except root entry), either:

        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - or a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']

    :param data: bytes, data to be written, must be the same size as the original
        stream.
    :raises TypeError: if data is not bytes
    :raises IOError: if the path is not a stream, or the FAT chain is corrupt
    :raises ValueError: if data size differs from the existing stream
    """
    if not isinstance(data, bytes):
        raise TypeError("write_stream: data must be a bytes string")
    sid = self._find(stream_name)
    entry = self.direntries[sid]
    if entry.entry_type != STGTY_STREAM:
        raise IOError("this is not a stream")
    size = entry.size
    if size != len(data):
        raise ValueError("write_stream: data must be the same size as the existing stream")
    # small streams are stored in the MiniStream and written through MiniFAT:
    if size < self.minisectorcutoff and entry.entry_type != STGTY_ROOT:
        return self._write_mini_stream(entry = entry, data_to_write = data)

    # large stream: follow the FAT chain, overwriting one sector at a time
    sect = entry.isectStart
    # number of sectors to write
    nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize
    log.debug('nb_sectors = %d' % nb_sectors)
    for i in range(nb_sectors):
        # try:
        #     self.fp.seek(offset + self.sectorsize * sect)
        # except Exception:
        #     log.debug('sect=%d, seek=%d' %
        #         (sect, offset+self.sectorsize*sect))
        #     raise IOError('OLE sector index out of range')
        # extract one sector from data, the last one being smaller:
        if i<(nb_sectors-1):
            data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize]
            # TODO: comment this if it works
            assert(len(data_sector)==self.sectorsize)
        else:
            data_sector = data [i*self.sectorsize:]
            # TODO: comment this if it works
            log.debug('write_stream: size=%d sectorsize=%d data_sector=%Xh size%%sectorsize=%d'
                % (size, self.sectorsize, len(data_sector), size % self.sectorsize))
            assert(len(data_sector) % self.sectorsize==size % self.sectorsize)
        self.write_sect(sect, data_sector)
        # self.fp.write(data_sector)
        # jump to next sector in the FAT:
        try:
            sect = self.fat[sect]
        except IndexError:
            # [PL] if pointer is out of the FAT an exception is raised
            raise IOError('incorrect OLE FAT, sector index out of range')
    # [PL] Last sector should be a "end of chain" marker:
    if sect != ENDOFCHAIN:
        raise IOError('incorrect last sector index in OLE stream')
def get_type(self, filename):
    """
    Test if given filename exists as a stream or a storage in the OLE
    container, and return its type.

    :param filename: path of stream in storage tree. (see openstream for syntax)
    :returns: False if object does not exist, its entry type (>0) otherwise:

        - STGTY_STREAM: a stream
        - STGTY_STORAGE: a storage
        - STGTY_ROOT: the root entry
    """
    # any lookup failure (not found, malformed entry) means "does not exist":
    try:
        sid = self._find(filename)
        return self.direntries[sid].entry_type
    except Exception:
        return False
def getclsid(self, filename):
    """
    Return clsid of a stream/storage.

    :param filename: path of stream/storage in storage tree. (see openstream for
        syntax)
    :returns: Empty string if clsid is null, a printable representation of the clsid otherwise

    new in version 0.44
    """
    # resolve the path to a SID, then read the clsid off the entry:
    return self.direntries[self._find(filename)].clsid
def getmtime(self, filename):
    """
    Return modification time of a stream/storage.

    :param filename: path of stream/storage in storage tree. (see openstream for
        syntax)
    :returns: None if modification time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    # the directory entry itself knows how to decode its timestamp:
    return self.direntries[self._find(filename)].getmtime()
def getctime(self, filename):
    """
    Return creation time of a stream/storage.

    :param filename: path of stream/storage in storage tree. (see openstream for
        syntax)
    :returns: None if creation time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    # the directory entry itself knows how to decode its timestamp:
    return self.direntries[self._find(filename)].getctime()
def exists(self, filename):
    """
    Test if given filename exists as a stream or a storage in the OLE
    container.
    Note: filename is case-insensitive.

    :param filename: path of stream in storage tree. (see openstream for syntax)
    :returns: True if object exist, else False.
    """
    # existence is defined by whether the path lookup succeeds:
    try:
        self._find(filename)
    except Exception:
        return False
    return True
def get_size(self, filename):
    """
    Return size of a stream in the OLE container, in bytes.

    :param filename: path of stream in storage tree (see openstream for syntax)
    :returns: size in bytes (long integer)
    :exception IOError: if file not found
    :exception TypeError: if this is not a stream.
    """
    entry = self.direntries[self._find(filename)]
    # storages and the root have no meaningful size here:
    if entry.entry_type != STGTY_STREAM:
        # TODO: Should it return zero instead of raising an exception ?
        raise TypeError('object is not an OLE stream')
    return entry.size
def get_rootentry_name(self):
    """
    Return root entry name. Should usually be 'Root Entry' or 'R' in most
    implementations.
    """
    root = self.root
    return root.name
def getproperties(self, filename, convert_time=False, no_conversion=None):
    """
    Return properties described in substream.

    :param filename: path of stream in storage tree (see openstream for syntax)
    :param convert_time: bool, if True timestamps will be converted to Python datetime
    :param no_conversion: None or list of int, timestamps not to be converted
        (for example total editing time is not a real timestamp)

    :returns: a dictionary of values indexed by id (integer)
    """
    #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
    # make sure no_conversion is a list, just to simplify code below:
    # ([fix] identity test "is None" instead of "== None", which invokes
    # __eq__ and is not the idiomatic None check)
    if no_conversion is None:
        no_conversion = []
    # stream path as a string to report exceptions:
    streampath = filename
    if not isinstance(streampath, str):
        streampath = '/'.join(streampath)
    fp = self.openstream(filename)
    data = {}
    try:
        # header
        s = fp.read(28)
        clsid = _clsid(s[8:24])
        # format id
        s = fp.read(20)
        fmtid = _clsid(s[:16])
        fp.seek(i32(s, 16))
        # get section
        s = b"****" + fp.read(i32(fp.read(4))-4)
        # number of properties:
        num_props = i32(s, 4)
    except BaseException as exc:
        # catch exception while parsing property header, and only raise
        # a DEFECT_INCORRECT then return an empty dict, because this is not
        # a fatal error when parsing the whole file
        msg = 'Error while parsing properties header in stream {}: {}'.format(
            repr(streampath), exc)
        self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
        return data
    # clamp num_props based on the data length, to avoid reading past the
    # section buffer ([fix] integer floor division instead of int(float)):
    num_props = min(num_props, len(s) // 8)
    for i in iterrange(num_props):
        property_id = 0  # just in case of an exception
        try:
            property_id = i32(s, 8+i*8)
            offset = i32(s, 12+i*8)
            property_type = i32(s, offset)

            vt_name = VT.get(property_type, 'UNKNOWN')
            log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))

            # the value itself starts right after its 32-bit type field:
            value = self._parse_property(s, offset+4, property_id, property_type, convert_time, no_conversion)
            data[property_id] = value
        except BaseException as exc:
            # catch exception while parsing each property, and only raise
            # a DEFECT_INCORRECT, because parsing can go on
            msg = 'Error while parsing property id %d in stream %s: %s' % (
                property_id, repr(streampath), exc)
            self._raise_defect(DEFECT_INCORRECT, msg, type(exc))

    return data
def _parse_property(self, s, offset, property_id, property_type, convert_time, no_conversion):
    """
    Parse one property value from a property set section.
    (getproperties helper)

    :param s: bytes, the whole property section buffer
    :param offset: int, offset of the value (its 32-bit type field already skipped)
    :param property_id: int, property identifier (used for logging/conversion)
    :param property_type: int, VT_* type code of the property
    :param convert_time: bool, convert FILETIME values to datetime if True
    :param no_conversion: list of int, property ids not to be converted
    :returns: the parsed value, a list of values for vectors, or None if the
        type is not supported
    """
    v = None
    if property_type <= VT_BLOB or property_type in (VT_CLSID, VT_CF):
        # simple scalar value:
        v, _ = self._parse_property_basic(s, offset, property_id, property_type, convert_time, no_conversion)
    elif property_type == VT_VECTOR | VT_VARIANT:
        # vector of variants: each element carries its own 32-bit type code
        # before its value
        log.debug('property_type == VT_VECTOR | VT_VARIANT')
        off = 4
        count = i32(s, offset)
        values = []
        for _ in range(count):
            property_type = i32(s, offset + off)
            v, sz = self._parse_property_basic(s, offset + off + 4, property_id, property_type, convert_time, no_conversion)
            values.append(v)
            # advance past the value and its type field:
            off += sz + 4
        v = values

    elif property_type & VT_VECTOR:
        # vector of a single scalar type (type code given once, elements packed):
        property_type_base = property_type & ~VT_VECTOR
        log.debug('property_type == VT_VECTOR | %s' % VT.get(property_type_base, 'UNKNOWN'))
        off = 4
        count = i32(s, offset)
        values = []
        for _ in range(count):
            v, sz = self._parse_property_basic(s, offset + off, property_id, property_type & ~VT_VECTOR, convert_time, no_conversion)
            values.append(v)
            off += sz
        v = values
    else:
        log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
    return v
def _parse_property_basic(self, s, offset, property_id, property_type, convert_time, no_conversion):
    """
    Parse one scalar property value. (_parse_property helper)

    :param s: bytes, the whole property section buffer
    :param offset: int, offset of the value itself (type field already skipped)
    :param property_id: int, property identifier (for logging/time conversion)
    :param property_type: int, VT_* type code of the value
    :param convert_time: bool, convert FILETIME values to datetime if True
    :param no_conversion: list of int, property ids not to be converted
    :returns: tuple (value, size) where size is the number of bytes consumed,
        so vector parsers can advance to the next element.
    """
    value = None
    size = 0
    # test for common types first (should perhaps use
    # a dictionary instead?)

    if property_type == VT_I2: # 16-bit signed integer
        value = i16(s, offset)
        if value >= 32768:
            # manual sign extension, since i16 decodes unsigned:
            value = value - 65536
        size = 2
    elif property_type == VT_UI2: # 2-byte unsigned integer
        value = i16(s, offset)
        size = 2
    elif property_type in (VT_I4, VT_INT, VT_ERROR):
        # VT_I4: 32-bit signed integer
        # VT_ERROR: HRESULT, similar to 32-bit signed integer,
        # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
        value = i32(s, offset)
        size = 4
    elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
        value = i32(s, offset) # FIXME
        size = 4
    elif property_type in (VT_BSTR, VT_LPSTR):
        # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
        # size is a 32 bits integer, including the null terminator, and
        # possibly trailing or embedded null chars
        #TODO: if codepage is unicode, the string should be converted as such
        count = i32(s, offset)
        # count-1 drops the null terminator:
        value = s[offset+4:offset+4+count-1]
        # remove all null chars:
        value = value.replace(b'\x00', b'')
        size = 4 + count
    elif property_type == VT_BLOB:
        # binary large object (BLOB)
        # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
        count = i32(s, offset)
        value = s[offset+4:offset+4+count]
        size = 4 + count
    elif property_type == VT_LPWSTR:
        # UnicodeString
        # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
        # "the string should NOT contain embedded or additional trailing
        # null characters."
        count = i32(s, offset+4)
        value = self._decode_utf16_str(s[offset+4:offset+4+count*2])
        size = 4 + count * 2
    elif property_type == VT_FILETIME:
        value = long(i32(s, offset)) + (long(i32(s, offset+4))<<32)
        # FILETIME is a 64-bit int: "number of 100ns periods
        # since Jan 1,1601".
        if convert_time and property_id not in no_conversion:
            log.debug('Converting property #%d to python datetime, value=%d=%fs'
                    %(property_id, value, float(value)/10000000))
            # convert FILETIME to Python datetime.datetime
            # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
            _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
            log.debug('timedelta days=%d' % (value//(10*1000000*3600*24)))
            value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10)
        else:
            # legacy code kept for backward compatibility: returns a
            # number of seconds since Jan 1,1601
            value = value // 10000000 # seconds
        size = 8
    elif property_type == VT_UI1: # 1-byte unsigned integer
        value = i8(s[offset])
        size = 1
    elif property_type == VT_CLSID:
        value = _clsid(s[offset:offset+16])
        size = 16
    elif property_type == VT_CF:
        # PropertyIdentifier or ClipboardData??
        # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
        count = i32(s, offset)
        value = s[offset+4:offset+4+count]
        size = 4 + count
    elif property_type == VT_BOOL:
        # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
        # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
        value = bool(i16(s, offset))
        size = 2
    else:
        value = None # everything else yields "None"
        log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))

    # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
    # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
    # see https://msdn.microsoft.com/en-us/library/dd942033.aspx

    #print("%08x" % property_id, repr(value), end=" ")
    #print("(%s)" % VT[i32(s, offset) & 0xFFF])
    return value, size
def get_metadata(self):
    """
    Parse standard properties streams, return an OleMetadata object
    containing all the available metadata.
    (also stored in the metadata attribute of the OleFileIO object)

    new in version 0.25
    """
    # cache the object on self before parsing, then fill it in:
    meta = OleMetadata()
    self.metadata = meta
    meta.parse_properties(self)
    return meta
2331 def get_userdefined_properties(self, filename, convert_time=False, no_conversion=None):
2332 """
2333 Return properties described in substream.
2335 :param filename: path of stream in storage tree (see openstream for syntax)
2336 :param convert_time: bool, if True timestamps will be converted to Python datetime
2337 :param no_conversion: None or list of int, timestamps not to be converted
2338 (for example total editing time is not a real timestamp)
2340 :returns: a dictionary of values indexed by id (integer)
2341 """
2342 # REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
2343 # REFERENCE: https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-oshared/2ea8be67-a4a0-4e2e-b42f-49a182645562
2344 #'D5CDD502-2E9C-101B-9397-08002B2CF9AE'
2345 # TODO: testing the code more rigorously
2346 # TODO: adding exception handeling
2347 FMTID_USERDEFINED_PROPERTIES = _clsid(b'\x05\xD5\xCD\xD5\x9C\x2E\x1B\x10\x93\x97\x08\x00\x2B\x2C\xF9\xAE')
2349 # make sure no_conversion is a list, just to simplify code below:
2350 if no_conversion == None:
2351 no_conversion = []
2352 # stream path as a string to report exceptions:
2353 streampath = filename
2354 if not isinstance(streampath, str):
2355 streampath = '/'.join(streampath)
2357 fp = self.openstream(filename)
2359 data = []
2361 # header
2362 s = fp.read(28)
2363 clsid = _clsid(s[8:24])
2365 # PropertySetStream.cSections (4 bytes starts at 1c): number of property sets in this stream
2366 sections_count = i32(s, 24)
2368 section_file_pointers = []
2370 try:
2371 for i in range(sections_count):
2372 # format id
2373 s = fp.read(20)
2374 fmtid = _clsid(s[:16])
2376 if fmtid == FMTID_USERDEFINED_PROPERTIES:
2377 file_pointer = i32(s, 16)
2378 fp.seek(file_pointer)
2379 # read saved sections
2380 s = b"****" + fp.read(i32(fp.read(4)) - 4)
2381 # number of properties:
2382 num_props = i32(s, 4)
2384 PropertyIdentifierAndOffset = s[8: 8+8*num_props]
2386 # property names (dictionary)
2387 # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/99127b7f-c440-4697-91a4-c853086d6b33
2388 index = 8+8*num_props
2389 entry_count = i32(s[index: index+4])
2390 index += 4
2391 for i in range(entry_count):
2392 identifier = s[index: index +4]
2393 str_size = i32(s[index+4: index + 8])
2394 string = s[index+8: index+8+str_size].decode('utf_8').strip('\0')
2395 data.append({'property_name':string, 'value':None})
2396 index = index+8+str_size
2397 # clamp num_props based on the data length
2398 num_props = min(num_props, int(len(s) / 8))
2400 # property values
2401 # ref: https://docs.microsoft.com/en-us/openspecs/windows_protocols/MS-OLEPS/f122b9d7-e5cf-4484-8466-83f6fd94b3cc
2402 for i in iterrange(2, num_props):
2403 property_id = 0 # just in case of an exception
2404 try:
2405 property_id = i32(s, 8 + i * 8)
2406 offset = i32(s, 12 + i * 8)
2407 property_type = i32(s, offset)
2409 vt_name = VT.get(property_type, 'UNKNOWN')
2410 log.debug('property id=%d: type=%d/%s offset=%X' % (property_id, property_type, vt_name, offset))
2412 # test for common types first (should perhaps use
2413 # a dictionary instead?)
2415 if property_type == VT_I2: # 16-bit signed integer
2416 value = i16(s, offset + 4)
2417 if value >= 32768:
2418 value = value - 65536
2419 elif property_type == 1:
2420 # supposed to be VT_NULL but seems it is not NULL
2421 str_size = i32(s, offset + 8)
2422 value = s[offset + 12:offset + 12 + str_size - 1]
2424 elif property_type == VT_UI2: # 2-byte unsigned integer
2425 value = i16(s, offset + 4)
2426 elif property_type in (VT_I4, VT_INT, VT_ERROR):
2427 # VT_I4: 32-bit signed integer
2428 # VT_ERROR: HRESULT, similar to 32-bit signed integer,
2429 # see https://msdn.microsoft.com/en-us/library/cc230330.aspx
2430 value = i32(s, offset + 4)
2431 elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
2432 value = i32(s, offset + 4) # FIXME
2433 elif property_type in (VT_BSTR, VT_LPSTR):
2434 # CodePageString, see https://msdn.microsoft.com/en-us/library/dd942354.aspx
2435 # size is a 32 bits integer, including the null terminator, and
2436 # possibly trailing or embedded null chars
2437 # TODO: if codepage is unicode, the string should be converted as such
2438 count = i32(s, offset + 4)
2439 value = s[offset + 8:offset + 8 + count - 1]
2440 # remove all null chars:
2441 value = value.replace(b'\x00', b'')
2442 elif property_type == VT_BLOB:
2443 # binary large object (BLOB)
2444 # see https://msdn.microsoft.com/en-us/library/dd942282.aspx
2445 count = i32(s, offset + 4)
2446 value = s[offset + 8:offset + 8 + count]
2447 elif property_type == VT_LPWSTR:
2448 # UnicodeString
2449 # see https://msdn.microsoft.com/en-us/library/dd942313.aspx
2450 # "the string should NOT contain embedded or additional trailing
2451 # null characters."
2452 count = i32(s, offset + 4)
2453 value = self._decode_utf16_str(s[offset + 8:offset + 8 + count * 2])
2454 elif property_type == VT_FILETIME:
2455 value = long(i32(s, offset + 4)) + (long(i32(s, offset + 8)) << 32)
2456 # FILETIME is a 64-bit int: "number of 100ns periods
2457 # since Jan 1,1601".
2458 if convert_time and property_id not in no_conversion:
2459 log.debug('Converting property #%d to python datetime, value=%d=%fs'
2460 % (property_id, value, float(value) / 10000000))
2461 # convert FILETIME to Python datetime.datetime
2462 # inspired from https://code.activestate.com/recipes/511425-filetime-to-datetime/
2463 _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
2464 log.debug('timedelta days=%d' % (value // (10 * 1000000 * 3600 * 24)))
2465 value = _FILETIME_null_date + datetime.timedelta(microseconds=value // 10)
2466 else:
2467 # legacy code kept for backward compatibility: returns a
2468 # number of seconds since Jan 1,1601
2469 value = value // 10000000 # seconds
2470 elif property_type == VT_UI1: # 1-byte unsigned integer
2471 value = i8(s[offset + 4])
2472 elif property_type == VT_CLSID:
2473 value = _clsid(s[offset + 4:offset + 20])
2474 elif property_type == VT_CF:
2475 # PropertyIdentifier or ClipboardData??
2476 # see https://msdn.microsoft.com/en-us/library/dd941945.aspx
2477 count = i32(s, offset + 4)
2478 value = s[offset + 8:offset + 8 + count]
2479 elif property_type == VT_BOOL:
2480 # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True
2481 # see https://msdn.microsoft.com/en-us/library/cc237864.aspx
2482 value = bool(i16(s, offset + 4))
2483 else:
2484 value = None # everything else yields "None"
2485 log.debug(
2486 'property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
2488 # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
2489 # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
2490 # see https://msdn.microsoft.com/en-us/library/dd942033.aspx
2492 # FIXME: add support for VT_VECTOR
2493 # VT_VECTOR is a 32 uint giving the number of items, followed by
2494 # the items in sequence. The VT_VECTOR value is combined with the
2495 # type of items, e.g. VT_VECTOR|VT_BSTR
2496 # see https://msdn.microsoft.com/en-us/library/dd942011.aspx
2498 # print("%08x" % property_id, repr(value), end=" ")
2499 # print("(%s)" % VT[i32(s, offset) & 0xFFF])
2501 data[i-2]['value']=value
2502 except BaseException as exc:
2503 # catch exception while parsing each property, and only raise
2504 # a DEFECT_INCORRECT, because parsing can go on
2505 msg = 'Error while parsing property id %d in stream %s: %s' % (
2506 property_id, repr(streampath), exc)
2507 self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
2509 except BaseException as exc:
2510 # catch exception while parsing property header, and only raise
2511 # a DEFECT_INCORRECT then return an empty dict, because this is not
2512 # a fatal error when parsing the whole file
2513 msg = 'Error while parsing properties header in stream %s: %s' % (
2514 repr(streampath), exc)
2515 self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
2516 return data
2518 return data
2521 def get_document_variables(self):
2522 """
2523 Extract the document variables from Microsft Word docs
2524 :return: it returns a list of dictionaries, each of them contains var_name and value keys
2525 """
2526 # TODO: testing the code more rigorously
2527 # TODO: adding exception handeling
2528 data = []
2529 word_fp = self.openstream(['WordDocument'])
2531 # Read fcStwUser from the WordDocument stream
2532 # fcStwUser (4 bytes): An unsigned integer which is an offset in 1Table Stream that StwUser locates.
2533 # fcStwUser is the 121th field in fibRgFcLcb97 (index 120)
2534 fib_base = word_fp.read(32)
2535 nfib = i16(fib_base[2:4])
2536 if nfib == 0x00C1: # fibRgFcLcb97
2537 csw = i16(word_fp.read(2))
2538 fibRgW = word_fp.read(csw * 2)
2539 cslw = i16(word_fp.read(2))
2540 fibRgLw = word_fp.read(cslw * 4)
2541 cbRgFcLcb = i16(word_fp.read(2))
2542 fibRgFcLcbBlob = word_fp.read(cbRgFcLcb * 4)
2543 fcStwUser = i32(fibRgFcLcbBlob[120*4:121*4])
2544 lcbStwUser = i32(fibRgFcLcbBlob[121 * 4:122 * 4])
2546 if lcbStwUser > 0:
2547 # Read StwUser from 1Table stream (WordDocument.fcStwUser points to this structure)
2548 # this structure contains variable names and assigned values
2549 table_fp = self.openstream(['1Table'])
2550 table_fp.seek(fcStwUser)
2552 # SttbNames (array, contain variable names)
2553 ss = table_fp.read(6)
2555 char_size = 1
2556 if ss[:2] == b'\xff\xff':
2557 char_size = 2
2559 cdata = i16(ss[2:])
2561 cbExtra = i16(ss[4:])
2563 # SttbNames (array, contains variable names)
2564 for i in range(cdata):
2565 cchData = i16(table_fp.read(2))
2566 data_str = table_fp.read(cchData *char_size )
2567 if char_size == 2:
2568 data_str = self._decode_utf16_str(data_str)
2569 data.append({'var_name':data_str, 'value':''})
2570 extra = table_fp.read(cbExtra)
2572 # rgxchNames (array, contains values corresponding to variable names in SttbNames)
2573 for i in range(cdata):
2574 cchData = i16(table_fp.read(2))
2575 data_str = table_fp.read(cchData *char_size)
2576 if char_size == 2:
2577 data_str = self._decode_utf16_str(data_str)
2578 data[i]['value'] = data_str
2580 return data
2582# --------------------------------------------------------------------
2583# This script can be used to dump the directory of any OLE2 structured
2584# storage file.
def main():
    """
    Main function when olefile is run as a script from the command line.
    This will open an OLE2 file and display its structure and properties
    :return: nothing
    """
    import sys, optparse

    DEFAULT_LOG_LEVEL = "warning"  # Default log level
    # map of the -l/--loglevel option values to logging constants:
    LOG_LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)

    parser.add_option("-c", action="store_true", dest="check_streams",
                      help='check all streams (for debugging purposes)')
    parser.add_option("-v", action="store_true", dest="extract_customvar",
                      help='extract all document variables')
    parser.add_option("-p", action="store_true", dest="extract_customprop",
                      help='extract all user-defined properties')
    parser.add_option("-d", action="store_true", dest="debug_mode",
                      help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    print('olefile version {} {} - https://www.decalage.info/en/olefile\n'.format(__version__, __date__))

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    if options.debug_mode:
        options.loglevel = 'debug'
    # report a bad -l value with a clean usage error instead of a KeyError:
    if options.loglevel not in LOG_LEVELS:
        parser.error('Invalid log level: %r' % options.loglevel)

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')

    # also enable the module's logger:
    enable_logging()

    for filename in args:
        try:
            ole = OleFileIO(filename)  # , raise_defects=DEFECT_INCORRECT)
            print("-" * 68)
            print(filename)
            print("-" * 68)
            ole.dumpdirectory()
            for streamname in ole.listdir():
                # stream names starting with \x05 contain property sets:
                if streamname[-1][0] == "\005":
                    print("%r: properties" % streamname)
                    try:
                        props = ole.getproperties(streamname, convert_time=True)
                        props = sorted(props.items())
                        for k, v in props:
                            # [PL]: avoid to display too large or binary values:
                            if isinstance(v, (basestring, bytes)):
                                if len(v) > 50:
                                    v = v[:50]
                            if isinstance(v, bytes):
                                # quick and dirty binary check:
                                for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
                                          21,22,23,24,25,26,27,28,29,30,31):
                                    if c in bytearray(v):
                                        v = '(binary data)'
                                        break
                            print(" ", k, v)
                    except Exception:
                        log.exception('Error while parsing property stream %r' % streamname)

                    try:
                        if options.extract_customprop:
                            variables = ole.get_userdefined_properties(streamname, convert_time=True)
                            if len(variables):
                                print("%r: user-defined properties" % streamname)
                                for index, variable in enumerate(variables):
                                    print('\t{} {}: {}'.format(index, variable['property_name'],variable['value']))
                    # catch only real errors: a bare except here would also
                    # swallow KeyboardInterrupt/SystemExit
                    except Exception:
                        log.exception('Error while parsing user-defined property stream %r' % streamname)
                elif options.extract_customvar and streamname[-1]=="WordDocument":
                    print("%r: document variables" % streamname)
                    variables = ole.get_document_variables()

                    for index, var in enumerate(variables):
                        print('\t{} {}: {}'.format(index, var['var_name'], var['value'][:50]))
                    print("")

            if options.check_streams:
                # Read all streams to check if there are errors:
                print('\nChecking streams...')
                for streamname in ole.listdir():
                    # print name using repr() to convert binary chars to \xNN:
                    print('-', repr('/'.join(streamname)),'-', end=' ')
                    st_type = ole.get_type(streamname)
                    if st_type == STGTY_STREAM:
                        print('size %d' % ole.get_size(streamname))
                        # just try to read stream in memory:
                        ole.openstream(streamname)
                    else:
                        print('NOT a stream : type=%d' % st_type)
                print()

            print('Modification/Creation times of all directory entries:')
            for entry in ole.direntries:
                if entry is not None:
                    print('- {}: mtime={} ctime={}'.format(entry.name,
                                                           entry.getmtime(), entry.getctime()))
            print()

            # parse and display metadata:
            try:
                meta = ole.get_metadata()
                meta.dump()
            except Exception:
                log.exception('Error while parsing metadata')
            print()
            # [PL] Test a few new methods:
            root = ole.get_rootentry_name()
            print('Root entry name: "%s"' % root)
            if ole.exists('worddocument'):
                print("This is a Word document.")
                print("type of stream 'WordDocument':", ole.get_type('worddocument'))
                print("size :", ole.get_size('worddocument'))
                if ole.exists('macros/vba'):
                    print("This document may contain VBA macros.")

            # print parsing issues:
            print('\nNon-fatal issues raised during parsing:')
            if ole.parsing_issues:
                for exctype, msg in ole.parsing_issues:
                    print('- {}: {}'.format(exctype.__name__, msg))
            else:
                print('None')
            ole.close()
        except Exception:
            log.exception('Error while parsing file %r' % filename)
# Run the command-line interface only when this file is executed as a
# script (e.g. "python olefile.py <file>"), not when imported as a module.
if __name__ == "__main__":
    main()
2744# this code was developed while listening to The Wedding Present "Sea Monsters"