1# Copyright (c) 2024, pypdf contributors 
    2# All rights reserved. 
    3# 
    4# Redistribution and use in source and binary forms, with or without 
    5# modification, are permitted provided that the following conditions are 
    6# met: 
    7# 
    8# * Redistributions of source code must retain the above copyright notice, 
    9# this list of conditions and the following disclaimer. 
    10# * Redistributions in binary form must reproduce the above copyright notice, 
    11# this list of conditions and the following disclaimer in the documentation 
    12# and/or other materials provided with the distribution. 
    13# * The name of the author may not be used to endorse or promote products 
    14# derived from this software without specific prior written permission. 
    15# 
    16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
    17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
    18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
    19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
    20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
    21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
    22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
    23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
    24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
    25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
    26# POSSIBILITY OF SUCH DAMAGE. 
    27 
    28import logging 
    29from io import BytesIO 
    30from typing import IO 
    31 
    32from .._utils import ( 
    33    WHITESPACES, 
    34    WHITESPACES_AS_BYTES, 
    35    StreamType, 
    36    read_non_whitespace, 
    37) 
    38from ..errors import PdfReadError 
    39 
    40logger = logging.getLogger(__name__) 
    41 
# Chunk size used when scanning a stream for the end of an inline image.
# An inline image should be used only for small images (4096 bytes or less),
# but allow twice this for files where that limit has been exceeded.
    44BUFFER_SIZE = 8192 
    45 
    46 
    47def extract_inline_AHx(stream: StreamType) -> bytes: 
    48    """ 
    49    Extract HexEncoded stream from inline image. 
    50    The stream will be moved onto the EI. 
    51    """ 
    52    data_out: bytes = b"" 
    53    # Read data until delimiter > and EI as backup. 
    54    while True: 
    55        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) 
    56        if not data_buffered: 
    57            raise PdfReadError("Unexpected end of stream") 
    58        pos_tok = data_buffered.find(b">") 
    59        if pos_tok >= 0:  # found > 
    60            data_out += data_buffered[: pos_tok + 1] 
    61            stream.seek(-len(data_buffered) + pos_tok + 1, 1) 
    62            break 
    63        pos_ei = data_buffered.find(b"EI") 
    64        if pos_ei >= 0:  # found EI 
    65            stream.seek(-len(data_buffered) + pos_ei - 1, 1) 
    66            c = stream.read(1) 
    67            while c in WHITESPACES: 
    68                stream.seek(-2, 1) 
    69                c = stream.read(1) 
    70                pos_ei -= 1 
    71            data_out += data_buffered[:pos_ei] 
    72            break 
    73        if len(data_buffered) == 2: 
    74            data_out += data_buffered 
    75            raise PdfReadError("Unexpected end of stream") 
    76        # Neither > nor EI found 
    77        data_out += data_buffered[:-2] 
    78        stream.seek(-2, 1) 
    79 
    80    ei_tok = read_non_whitespace(stream) 
    81    ei_tok += stream.read(2) 
    82    stream.seek(-3, 1) 
    83    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): 
    84        raise PdfReadError("EI stream not found") 
    85    return data_out 
    86 
    87 
    88def extract_inline_A85(stream: StreamType) -> bytes: 
    89    """ 
    90    Extract A85 stream from inline image. 
    91    The stream will be moved onto the EI. 
    92    """ 
    93    data_out: bytes = b"" 
    94    # Read data until delimiter ~> 
    95    while True: 
    96        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) 
    97        if not data_buffered: 
    98            raise PdfReadError("Unexpected end of stream") 
    99        pos_tok = data_buffered.find(b"~>") 
    100        if pos_tok >= 0:  # found! 
    101            data_out += data_buffered[: pos_tok + 2] 
    102            stream.seek(-len(data_buffered) + pos_tok + 2, 1) 
    103            break 
    104        if len(data_buffered) == 2:  # end of buffer 
    105            data_out += data_buffered 
    106            raise PdfReadError("Unexpected end of stream") 
    107        data_out += data_buffered[ 
    108            :-2 
    109        ]  # back by one char in case of in the middle of ~> 
    110        stream.seek(-2, 1) 
    111 
    112    ei_tok = read_non_whitespace(stream) 
    113    ei_tok += stream.read(2) 
    114    stream.seek(-3, 1) 
    115    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): 
    116        raise PdfReadError("EI stream not found") 
    117    return data_out 
    118 
    119 
    120def extract_inline_RL(stream: StreamType) -> bytes: 
    121    """ 
    122    Extract RL (RunLengthDecode) stream from inline image. 
    123    The stream will be moved onto the EI. 
    124    """ 
    125    data_out: bytes = b"" 
    126    # Read data until delimiter 128 
    127    while True: 
    128        data_buffered = stream.read(BUFFER_SIZE) 
    129        if not data_buffered: 
    130            raise PdfReadError("Unexpected end of stream") 
    131        pos_tok = data_buffered.find(b"\x80") 
    132        if pos_tok >= 0:  # found 
    133            data_out += data_buffered[: pos_tok + 1] 
    134            stream.seek(-len(data_buffered) + pos_tok + 1, 1) 
    135            break 
    136        data_out += data_buffered 
    137 
    138    ei_tok = read_non_whitespace(stream) 
    139    ei_tok += stream.read(2) 
    140    stream.seek(-3, 1) 
    141    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): 
    142        raise PdfReadError("EI stream not found") 
    143    return data_out 
    144 
    145 
    146def extract_inline_DCT(stream: StreamType) -> bytes: 
    147    """ 
    148    Extract DCT (JPEG) stream from inline image. 
    149    The stream will be moved onto the EI. 
    150    """ 
    151    def read(length: int) -> bytes: 
    152        # If 0 bytes are returned, and *size* was not 0, this indicates end of file. 
    153        # If the object is in non-blocking mode and no bytes are available, `None` is returned. 
    154        _result = stream.read(length) 
    155        if _result is None or len(_result) != length: 
    156            raise PdfReadError("Unexpected end of stream") 
    157        return _result 
    158 
    159    data_out: bytes = b"" 
    160    # Read Blocks of data (ID/Size/data) up to ID=FF/D9 
    161    # https://www.digicamsoft.com/itu/itu-t81-36.html 
    162    not_first = False 
    163    while True: 
    164        c = read(1) 
    165        if not_first or (c == b"\xff"): 
    166            data_out += c 
    167        if c != b"\xff": 
    168            continue 
    169        not_first = True 
    170        c = read(1) 
    171        data_out += c 
    172        if c == b"\xff": 
    173            stream.seek(-1, 1)  # pragma: no cover 
    174        elif c == b"\x00":  # stuffing 
    175            pass 
    176        elif c == b"\xd9":  # end 
    177            break 
    178        elif c in ( 
    179            b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf" 
    180            b"\xda\xdb\xdc\xdd\xde\xdf" 
    181            b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe" 
    182        ): 
    183            c = read(2) 
    184            data_out += c 
    185            sz = c[0] * 256 + c[1] 
    186            data_out += read(sz - 2) 
    187 
    188    ei_tok = read_non_whitespace(stream) 
    189    ei_tok += stream.read(2) 
    190    stream.seek(-3, 1) 
    191    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): 
    192        raise PdfReadError("EI stream not found") 
    193    return data_out 
    194 
    195 
    196def extract_inline_default(stream: StreamType) -> bytes: 
    197    """Legacy method, used by default""" 
    198    stream_out = BytesIO() 
    199    # Read the inline image, while checking for EI (End Image) operator. 
    200    while True: 
    201        data_buffered = stream.read(BUFFER_SIZE) 
    202        if not data_buffered: 
    203            raise PdfReadError("Unexpected end of stream") 
    204        pos_ei = data_buffered.find( 
    205            b"E" 
    206        )  # We can not look straight for "EI" because it may not have been loaded in the buffer 
    207 
    208        if pos_ei == -1: 
    209            stream_out.write(data_buffered) 
    210        else: 
    211            # Write out everything including E (the one from EI to be removed) 
    212            stream_out.write(data_buffered[0 : pos_ei + 1]) 
    213            sav_pos_ei = stream_out.tell() - 1 
    214            # Seek back in the stream to read the E next 
    215            stream.seek(pos_ei + 1 - len(data_buffered), 1) 
    216            saved_pos = stream.tell() 
    217            # Check for End Image 
    218            tok2 = stream.read(1)  # I of "EI" 
    219            if tok2 != b"I": 
    220                stream.seek(saved_pos, 0) 
    221                continue 
    222            tok3 = stream.read(1)  # possible space after "EI" 
    223            if tok3 not in WHITESPACES: 
    224                stream.seek(saved_pos, 0) 
    225                continue 
    226            while tok3 in WHITESPACES: 
    227                tok3 = stream.read(1) 
    228            if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in { 
    229                b"Q", 
    230                b"E", 
    231            }:  # for Q or EMC 
    232                stream.seek(saved_pos, 0) 
    233                continue 
    234            if is_followed_by_binary_data(stream): 
    235                # Inline image contains `EI ` sequence usually marking the end of it, but 
    236                # is followed by binary data which does not make sense for the actual end. 
    237                stream.seek(saved_pos, 0) 
    238                continue 
    239            # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient 
    240            # remove E(I) wrongly inserted earlier 
    241            stream.seek(saved_pos - 1, 0) 
    242            stream_out.truncate(sav_pos_ei) 
    243            break 
    244 
    245    return stream_out.getvalue() 
    246 
    247 
    248def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool: 
    249    """ 
    250    Check if the next bytes of the stream look like binary image data or regular page content. 
    251 
    252    This is just some heuristics due to the PDF specification being too imprecise about 
    253    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0, 
    254    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited 
    255    everywhere, we should not expect to be able to remove such hacks in the near future - especially 
    256    considering legacy documents as well. 
    257 
    258    The actual implementation draws some inspiration from 
    259    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java 
    260    """ 
    261    position = stream.tell() 
    262    data = stream.read(length) 
    263    stream.seek(position) 
    264    if not data: 
    265        return False 
    266    operator_start = None 
    267    operator_end = None 
    268 
    269    for index, byte in enumerate(data): 
    270        if byte < 32 and byte not in WHITESPACES_AS_BYTES: 
    271            # This covers all characters not being displayable directly, although omitting whitespace 
    272            # to allow for operator detection. 
    273            return True 
    274        is_whitespace = byte in WHITESPACES_AS_BYTES 
    275        if operator_start is None and not is_whitespace: 
    276            # Interpret all other non-whitespace characters as the start of an operation. 
    277            operator_start = index 
    278        if operator_start is not None and is_whitespace: 
    279            # A whitespace stops an operation. 
    280            # Assume that having an inline image with tons of whitespace is rather unlikely. 
    281            operator_end = index 
    282            break 
    283 
    284    if operator_start is None: 
    285        # Inline images should not have tons of whitespaces, which would lead to no operator start. 
    286        return False 
    287    if operator_end is None: 
    288        # We probably are inside an operation. 
    289        operator_end = length 
    290    operator_length = operator_end - operator_start 
    291    operator = data[operator_start:operator_end] 
    292    if operator.startswith(b"/") and operator_length > 1: 
    293        # Name object. 
    294        return False 
    295    if operator.replace(b".", b"").isdigit(): 
    296        # Graphics operator, for example a move. A number (integer or float). 
    297        return False 
    298    if operator_length > 3:  # noqa: SIM103 
    299        # Usually, the operators inside a content stream should not have more than three characters, 
    300        # especially after an inline image. 
    301        return True 
    302    return False