1# Protocol Buffers - Google's data interchange format 
    2# Copyright 2008 Google Inc.  All rights reserved. 
    3# 
    4# Use of this source code is governed by a BSD-style 
    5# license that can be found in the LICENSE file or at 
    6# https://developers.google.com/open-source/licenses/bsd 
    7 
    8"""Encoding related utilities.""" 
    9import re 
    10 
    11def _AsciiIsPrint(i): 
    12  return i >= 32 and i < 127 
    13 
    14def _MakeStrEscapes(): 
    15  ret = {} 
    16  for i in range(0, 128): 
    17    if not _AsciiIsPrint(i): 
    18      ret[i] = r'\%03o' % i 
    19  ret[ord('\t')] = r'\t'  # optional escape 
    20  ret[ord('\n')] = r'\n'  # optional escape 
    21  ret[ord('\r')] = r'\r'  # optional escape 
    22  ret[ord('"')] = r'\"'  # necessary escape 
    23  ret[ord('\'')] = r"\'"  # optional escape 
    24  ret[ord('\\')] = r'\\'  # necessary escape 
    25  return ret 
    26 
    27# Maps int -> char, performing string escapes. 
    28_str_escapes = _MakeStrEscapes() 
    29 
    30# Maps int -> char, performing byte escaping and string escapes 
    31_byte_escapes = {i: chr(i) for i in range(0, 256)} 
    32_byte_escapes.update(_str_escapes) 
    33_byte_escapes.update({i: r'\%03o' % i for i in range(128, 256)}) 
    34 
    35 
    36def _DecodeUtf8EscapeErrors(text_bytes): 
    37  ret = '' 
    38  while text_bytes: 
    39    try: 
    40      ret += text_bytes.decode('utf-8').translate(_str_escapes) 
    41      text_bytes = '' 
    42    except UnicodeDecodeError as e: 
    43      ret += text_bytes[:e.start].decode('utf-8').translate(_str_escapes) 
    44      ret += _byte_escapes[text_bytes[e.start]] 
    45      text_bytes = text_bytes[e.start+1:] 
    46  return ret 
    47 
    48 
    49def CEscape(text, as_utf8) -> str: 
    50  """Escape a bytes string for use in an text protocol buffer. 
    51 
    52  Args: 
    53    text: A byte string to be escaped. 
    54    as_utf8: Specifies if result may contain non-ASCII characters. 
    55        In Python 3 this allows unescaped non-ASCII Unicode characters. 
    56        In Python 2 the return value will be valid UTF-8 rather than only ASCII. 
    57  Returns: 
    58    Escaped string (str). 
    59  """ 
    60  # Python's text.encode() 'string_escape' or 'unicode_escape' codecs do not 
    61  # satisfy our needs; they encodes unprintable characters using two-digit hex 
    62  # escapes whereas our C++ unescaping function allows hex escapes to be any 
    63  # length.  So, "\0011".encode('string_escape') ends up being "\\x011", which 
    64  # will be decoded in C++ as a single-character string with char code 0x11. 
    65  text_is_unicode = isinstance(text, str) 
    66  if as_utf8: 
    67    if text_is_unicode: 
    68      return text.translate(_str_escapes) 
    69    else: 
    70      return _DecodeUtf8EscapeErrors(text) 
    71  else: 
    72    if text_is_unicode: 
    73      text = text.encode('utf-8') 
    74    return ''.join([_byte_escapes[c] for c in text]) 
    75 
    76 
    77_CUNESCAPE_HEX = re.compile(r'(\\+)x([0-9a-fA-F])(?![0-9a-fA-F])') 
    78 
    79 
    80def CUnescape(text: str) -> bytes: 
    81  """Unescape a text string with C-style escape sequences to UTF-8 bytes. 
    82 
    83  Args: 
    84    text: The data to parse in a str. 
    85  Returns: 
    86    A byte string. 
    87  """ 
    88 
    89  def ReplaceHex(m): 
    90    # Only replace the match if the number of leading back slashes is odd. i.e. 
    91    # the slash itself is not escaped. 
    92    if len(m.group(1)) & 1: 
    93      return m.group(1) + 'x0' + m.group(2) 
    94    return m.group(0) 
    95 
    96  # This is required because the 'string_escape' encoding doesn't 
    97  # allow single-digit hex escapes (like '\xf'). 
    98  result = _CUNESCAPE_HEX.sub(ReplaceHex, text) 
    99 
    100  # Replaces Unicode escape sequences with their character equivalents. 
    101  result = result.encode('raw_unicode_escape').decode('raw_unicode_escape') 
    102  # Encode Unicode characters as UTF-8, then decode to Latin-1 escaping 
    103  # unprintable characters. 
    104  result = result.encode('utf-8').decode('unicode_escape') 
    105  # Convert Latin-1 text back to a byte string (latin-1 codec also works here). 
    106  return result.encode('latin-1')