Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/future/utils/surrogateescape.py: 29%

1"""

2This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error

3handler of Python 3.

5Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc

6"""

8# This code is released under the Python license and the BSD 2-clause license

10import codecs

11import sys

13from future import utils

# Name of the codec error handler used by encodefilename(), decodefilename()
# and register_surrogateescape() in this module.
FS_ERRORS = 'surrogateescape'

# Alternative handler name, kept from the original source (disabled):
# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'

def u(text):
    """
    Return *text* as a text (unicode) string.

    On Python 3 the argument is returned unchanged; otherwise it is decoded
    with the 'unicode_escape' codec so that escapes such as ``\\udcff``
    written in a native string literal become real code points.
    """
    return text if utils.PY3 else text.decode('unicode_escape')

def b(data):
    """
    Return *data* as a byte string.

    On Python 3 the argument is encoded with 'latin1' (each code point
    U+0000..U+00FF becomes the byte of the same value); otherwise it is
    returned unchanged.
    """
    return data.encode('latin1') if utils.PY3 else data

# Version-specific helpers:
#   _unichr(code)   -> one-character text string for the given code point
#   bytes_chr(code) -> one-byte byte string for the given byte value
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        # bytes() of a one-element tuple yields exactly one byte.
        return bytes((code,))

def surrogateescape_handler(exc):
    """
    Codec error handler implementing PEP 383 ("surrogateescape") in pure
    Python.

    When decoding, undecodable bytes are replaced by U+DCxx characters;
    when encoding, those characters are translated back into the original
    byte values.

    :param exc: the UnicodeDecodeError or UnicodeEncodeError passed in by
        the codec machinery.
    :returns: a ``(replacement, position)`` tuple, where ``position`` is
        ``exc.end``.
    :raises: ``exc`` itself when it is neither a UnicodeDecodeError nor a
        UnicodeEncodeError, or when the offending slice cannot be escaped
        (the helper raised NotASurrogateError).
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # Note from the original author: for something like
        # u'\udcc3'.encode('ascii', <this handler>), both Python 2.x and 3.x
        # raise an exception anyway after this function is called; it seems
        # the strict encoder is then used to encode the unicode string that
        # this function returns.
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(offending)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)

class NotASurrogateError(Exception):
    """
    Raised by replace_surrogate_encode() / replace_surrogate_decode() when a
    character or byte cannot be handled; surrogateescape_handler() catches
    it and re-raises the original codec exception instead.
    """

def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogate characters back to their original values.

    Each character in U+DC00..U+DCFF is mapped to the character whose code
    point is ``code - 0xDC00`` (so U+DCFF becomes U+00FF).

    Returns a (unicode) string, not the more logical bytes, because the
    codecs register_error functionality expects this.

    :param mystring: the slice of text the encoder could not handle.
    :raises NotASurrogateError: if any character lies outside
        U+DC00..U+DCFF. This includes the high surrogates U+D800..U+DBFF,
        which previously reached ``_unichr()`` with a negative argument and
        surfaced as a ValueError instead.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Only U+DC00..U+DCFF can be mapped back; anything else (including
        # high surrogates) makes the handler fail with the original
        # exception. PEP 383 itself only produces U+DC80..U+DCFF, but the
        # U+DC00..U+DC7F range is still accepted here, as it always was.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)

def replace_surrogate_decode(mybytes):
    """
    Escape undecodable bytes as surrogate characters.

    Byte values 0x80..0xFF become U+DC80..U+DCFF; values up to 0x7F are
    kept as the character with the same code point.

    :param mybytes: the slice of bytes the decoder could not handle.
    :returns: a (unicode) string.
    :raises NotASurrogateError: if an item's value is above 0xFF.
    """
    pieces = []
    for item in mybytes:
        # Iterating may yield ints (e.g. newbytes) or, on Py2, one-character
        # native strings.
        value = item if isinstance(item, int) else ord(item)
        if value > 0xFF:
            raise NotASurrogateError
        pieces.append(_unichr(0xDC00 + value if value >= 0x80 else value))
    return str().join(pieces)

123

124

def encodefilename(fn):
    """
    Encode the text filename *fn* to bytes using FS_ENCODING, turning
    escaped surrogates U+DC80..U+DCFF back into the bytes 0x80..0xFF.

    The 'ascii' and 'utf-8' encodings are handled by hand; every other
    encoding is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    :raises UnicodeEncodeError: for a character that cannot be represented
        ('ascii': any non-ASCII character that is not an escaped surrogate;
        'utf-8': any surrogate outside U+DC80..U+DCFF).
    """
    if FS_ENCODING == 'ascii':
        # The ASCII encoder of Python 2 expects the error handler to return
        # a Unicode string encodable to ASCII, whereas the surrogateescape
        # handler has to return bytes in the 0x80-0xFF range, so the
        # encoding is done manually here.
        pieces = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if codepoint >= 128 and not 0xDC80 <= codepoint <= 0xDCFF:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
            if codepoint >= 128:
                # Escaped surrogate: recover the original byte value.
                codepoint -= 0xDC00
            pieces.append(bytes_chr(codepoint))
        return bytes().join(pieces)

    if FS_ENCODING == 'utf-8':
        # The UTF-8 encoder of Python 2 encodes surrogates, so
        # U+DC80-U+DCFF would never reach the error handler.
        pieces = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if 0xDC80 <= codepoint <= 0xDCFF:
                pieces.append(bytes_chr(codepoint - 0xDC00))
            elif 0xD800 <= codepoint <= 0xDFFF:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, pos, pos + 1, 'surrogates not allowed')
            else:
                pieces.append(char.encode('utf-8'))
        return bytes().join(pieces)

    return fn.encode(FS_ENCODING, FS_ERRORS)

163

def decodefilename(fn):
    """
    Decode the byte filename *fn* using FS_ENCODING with the FS_ERRORS
    error handler and return the result.
    """
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name

166

# Sample configuration: the encoding in effect, a raw byte filename (fn) and
# the text it is expected to decode to (encoded). fn and encoded are the
# fixtures used by the disabled self-test at the bottom of this file.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative configurations (disabled):
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name to the codec's canonical name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name

175

176

def register_surrogateescape():
    """
    Register surrogateescape_handler under the name FS_ERRORS.

    Does nothing when utils.PY3 is true, or when an error handler with that
    name is already registered.
    """
    if not utils.PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler registered under that name yet: install ours.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)

187

188

if __name__ == "__main__":
    # Running the module as a script currently does nothing. The round-trip
    # self-test below is kept for reference but is disabled.
    #
    # # Tests:
    # register_surrogateescape()
    #
    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")
    pass