1# Copyright (C) 2001-2006 Python Software Foundation 
    2# Author: Ben Gertzfield 
    3# Contact: email-sig@python.org 
    4 
    5"""Quoted-printable content transfer encoding per RFCs 2045-2047. 
    6 
    7This module handles the content transfer encoding method defined in RFC 2045 
    8to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to 
    9safely encode text that is in a character set similar to the 7-bit US ASCII 
    10character set, but that includes some 8-bit characters that are normally not 
    11allowed in email bodies or headers. 
    12 
    13Quoted-printable is very space-inefficient for encoding binary files; use the 
    14email.base64mime module for that instead. 
    15 
    16This module provides an interface to encode and decode both headers and bodies 
    17with quoted-printable encoding. 
    18 
RFC 2047 defines a method for including character set information in an
    20`encoded-word' in a header.  This method is commonly used for 8-bit real names 
    21in To:/From:/Cc: etc. fields, as well as Subject: lines. 
    22 
    23This module does not do the line wrapping or end-of-line character 
    24conversion necessary for proper internationalized headers; it only 
    25does dumb encoding and decoding.  To deal with the various line 
    26wrapping issues, use the email.header module. 
    27""" 
    28from __future__ import unicode_literals 
    29from __future__ import division 
    30from __future__ import absolute_import 
    31from future.builtins import bytes, chr, dict, int, range, super 
    32 
    33__all__ = [ 
    34    'body_decode', 
    35    'body_encode', 
    36    'body_length', 
    37    'decode', 
    38    'decodestring', 
    39    'header_decode', 
    40    'header_encode', 
    41    'header_length', 
    42    'quote', 
    43    'unquote', 
    44    ] 
    45 
    46import re 
    47import io 
    48 
    49from string import ascii_letters, digits, hexdigits 
    50 
    51CRLF = '\r\n' 
    52NL = '\n' 
    53EMPTYSTRING = '' 
    54 
    55# Build a mapping of octets to the expansion of that octet.  Since we're only 
    56# going to have 256 of these things, this isn't terribly inefficient 
    57# space-wise.  Remember that headers and bodies have different sets of safe 
    58# characters.  Initialize both maps with the full expansion, and then override 
    59# the safe bytes with the more compact form. 
    60_QUOPRI_HEADER_MAP = dict((c, '=%02X' % c) for c in range(256)) 
    61_QUOPRI_BODY_MAP = _QUOPRI_HEADER_MAP.copy() 
    62 
    63# Safe header bytes which need no encoding. 
    64for c in bytes(b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii')): 
    65    _QUOPRI_HEADER_MAP[c] = chr(c) 
    66# Headers have one other special encoding; spaces become underscores. 
    67_QUOPRI_HEADER_MAP[ord(' ')] = '_' 
    68 
    69# Safe body bytes which need no encoding. 
    70for c in bytes(b' !"#$%&\'()*+,-./0123456789:;<>' 
    71               b'?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`' 
    72               b'abcdefghijklmnopqrstuvwxyz{|}~\t'): 
    73    _QUOPRI_BODY_MAP[c] = chr(c) 
    74 
    75 
    76 
    77# Helpers 
    78def header_check(octet): 
    79    """Return True if the octet should be escaped with header quopri.""" 
    80    return chr(octet) != _QUOPRI_HEADER_MAP[octet] 
    81 
    82 
    83def body_check(octet): 
    84    """Return True if the octet should be escaped with body quopri.""" 
    85    return chr(octet) != _QUOPRI_BODY_MAP[octet] 
    86 
    87 
    88def header_length(bytearray): 
    89    """Return a header quoted-printable encoding length. 
    90 
    91    Note that this does not include any RFC 2047 chrome added by 
    92    `header_encode()`. 
    93 
    94    :param bytearray: An array of bytes (a.k.a. octets). 
    95    :return: The length in bytes of the byte array when it is encoded with 
    96        quoted-printable for headers. 
    97    """ 
    98    return sum(len(_QUOPRI_HEADER_MAP[octet]) for octet in bytearray) 
    99 
    100 
    101def body_length(bytearray): 
    102    """Return a body quoted-printable encoding length. 
    103 
    104    :param bytearray: An array of bytes (a.k.a. octets). 
    105    :return: The length in bytes of the byte array when it is encoded with 
    106        quoted-printable for bodies. 
    107    """ 
    108    return sum(len(_QUOPRI_BODY_MAP[octet]) for octet in bytearray) 
    109 
    110 
    111def _max_append(L, s, maxlen, extra=''): 
    112    if not isinstance(s, str): 
    113        s = chr(s) 
    114    if not L: 
    115        L.append(s.lstrip()) 
    116    elif len(L[-1]) + len(s) <= maxlen: 
    117        L[-1] += extra + s 
    118    else: 
    119        L.append(s.lstrip()) 
    120 
    121 
    122def unquote(s): 
    123    """Turn a string in the form =AB to the ASCII character with value 0xab""" 
    124    return chr(int(s[1:3], 16)) 
    125 
    126 
    127def quote(c): 
    128    return '=%02X' % ord(c) 
    129 
    130 
    131 
    132def header_encode(header_bytes, charset='iso-8859-1'): 
    133    """Encode a single header line with quoted-printable (like) encoding. 
    134 
    135    Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but 
    136    used specifically for email header fields to allow charsets with mostly 7 
    137    bit characters (and some 8 bit) to remain more or less readable in non-RFC 
    138    2045 aware mail clients. 
    139 
    140    charset names the character set to use in the RFC 2046 header.  It 
    141    defaults to iso-8859-1. 
    142    """ 
    143    # Return empty headers as an empty string. 
    144    if not header_bytes: 
    145        return '' 
    146    # Iterate over every byte, encoding if necessary. 
    147    encoded = [] 
    148    for octet in header_bytes: 
    149        encoded.append(_QUOPRI_HEADER_MAP[octet]) 
    150    # Now add the RFC chrome to each encoded chunk and glue the chunks 
    151    # together. 
    152    return '=?%s?q?%s?=' % (charset, EMPTYSTRING.join(encoded)) 
    153 
    154 
    155class _body_accumulator(io.StringIO): 
    156 
    157    def __init__(self, maxlinelen, eol, *args, **kw): 
    158        super().__init__(*args, **kw) 
    159        self.eol = eol 
    160        self.maxlinelen = self.room = maxlinelen 
    161 
    162    def write_str(self, s): 
    163        """Add string s to the accumulated body.""" 
    164        self.write(s) 
    165        self.room -= len(s) 
    166 
    167    def newline(self): 
    168        """Write eol, then start new line.""" 
    169        self.write_str(self.eol) 
    170        self.room = self.maxlinelen 
    171 
    172    def write_soft_break(self): 
    173        """Write a soft break, then start a new line.""" 
    174        self.write_str('=') 
    175        self.newline() 
    176 
    177    def write_wrapped(self, s, extra_room=0): 
    178        """Add a soft line break if needed, then write s.""" 
    179        if self.room < len(s) + extra_room: 
    180            self.write_soft_break() 
    181        self.write_str(s) 
    182 
    183    def write_char(self, c, is_last_char): 
    184        if not is_last_char: 
    185            # Another character follows on this line, so we must leave 
    186            # extra room, either for it or a soft break, and whitespace 
    187            # need not be quoted. 
    188            self.write_wrapped(c, extra_room=1) 
    189        elif c not in ' \t': 
    190            # For this and remaining cases, no more characters follow, 
    191            # so there is no need to reserve extra room (since a hard 
    192            # break will immediately follow). 
    193            self.write_wrapped(c) 
    194        elif self.room >= 3: 
    195            # It's a whitespace character at end-of-line, and we have room 
    196            # for the three-character quoted encoding. 
    197            self.write(quote(c)) 
    198        elif self.room == 2: 
    199            # There's room for the whitespace character and a soft break. 
    200            self.write(c) 
    201            self.write_soft_break() 
    202        else: 
    203            # There's room only for a soft break.  The quoted whitespace 
    204            # will be the only content on the subsequent line. 
    205            self.write_soft_break() 
    206            self.write(quote(c)) 
    207 
    208 
    209def body_encode(body, maxlinelen=76, eol=NL): 
    210    """Encode with quoted-printable, wrapping at maxlinelen characters. 
    211 
    212    Each line of encoded text will end with eol, which defaults to "\\n".  Set 
    213    this to "\\r\\n" if you will be using the result of this function directly 
    214    in an email. 
    215 
    216    Each line will be wrapped at, at most, maxlinelen characters before the 
    217    eol string (maxlinelen defaults to 76 characters, the maximum value 
    218    permitted by RFC 2045).  Long lines will have the 'soft line break' 
    219    quoted-printable character "=" appended to them, so the decoded text will 
    220    be identical to the original text. 
    221 
    222    The minimum maxlinelen is 4 to have room for a quoted character ("=XX") 
    223    followed by a soft line break.  Smaller values will generate a 
    224    ValueError. 
    225 
    226    """ 
    227 
    228    if maxlinelen < 4: 
    229        raise ValueError("maxlinelen must be at least 4") 
    230    if not body: 
    231        return body 
    232 
    233    # The last line may or may not end in eol, but all other lines do. 
    234    last_has_eol = (body[-1] in '\r\n') 
    235 
    236    # This accumulator will make it easier to build the encoded body. 
    237    encoded_body = _body_accumulator(maxlinelen, eol) 
    238 
    239    lines = body.splitlines() 
    240    last_line_no = len(lines) - 1 
    241    for line_no, line in enumerate(lines): 
    242        last_char_index = len(line) - 1 
    243        for i, c in enumerate(line): 
    244            if body_check(ord(c)): 
    245                c = quote(c) 
    246            encoded_body.write_char(c, i==last_char_index) 
    247        # Add an eol if input line had eol.  All input lines have eol except 
    248        # possibly the last one. 
    249        if line_no < last_line_no or last_has_eol: 
    250            encoded_body.newline() 
    251 
    252    return encoded_body.getvalue() 
    253 
    254 
    255 
    256# BAW: I'm not sure if the intent was for the signature of this function to be 
    257# the same as base64MIME.decode() or not... 
    258def decode(encoded, eol=NL): 
    259    """Decode a quoted-printable string. 
    260 
    261    Lines are separated with eol, which defaults to \\n. 
    262    """ 
    263    if not encoded: 
    264        return encoded 
    265    # BAW: see comment in encode() above.  Again, we're building up the 
    266    # decoded string with string concatenation, which could be done much more 
    267    # efficiently. 
    268    decoded = '' 
    269 
    270    for line in encoded.splitlines(): 
    271        line = line.rstrip() 
    272        if not line: 
    273            decoded += eol 
    274            continue 
    275 
    276        i = 0 
    277        n = len(line) 
    278        while i < n: 
    279            c = line[i] 
    280            if c != '=': 
    281                decoded += c 
    282                i += 1 
    283            # Otherwise, c == "=".  Are we at the end of the line?  If so, add 
    284            # a soft line break. 
    285            elif i+1 == n: 
    286                i += 1 
    287                continue 
    288            # Decode if in form =AB 
    289            elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits: 
    290                decoded += unquote(line[i:i+3]) 
    291                i += 3 
    292            # Otherwise, not in form =AB, pass literally 
    293            else: 
    294                decoded += c 
    295                i += 1 
    296 
    297            if i == n: 
    298                decoded += eol 
    299    # Special case if original string did not end with eol 
    300    if encoded[-1] not in '\r\n' and decoded.endswith(eol): 
    301        decoded = decoded[:-1] 
    302    return decoded 
    303 
    304 
    305# For convenience and backwards compatibility w/ standard base64 module 
    306body_decode = decode 
    307decodestring = decode 
    308 
    309 
    310 
    311def _unquote_match(match): 
    312    """Turn a match in the form =AB to the ASCII character with value 0xab""" 
    313    s = match.group(0) 
    314    return unquote(s) 
    315 
    316 
    317# Header decoding is done a bit differently 
    318def header_decode(s): 
    319    """Decode a string encoded with RFC 2045 MIME header `Q' encoding. 
    320 
    321    This function does not parse a full MIME header value encoded with 
    322    quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use 
    323    the high level email.header class for that functionality. 
    324    """ 
    325    s = s.replace('_', ' ') 
    326    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, re.ASCII)