Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/future/utils/surrogateescape.py: 29%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

93 statements  

1""" 

2This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error 

3handler of Python 3. 

4 

5Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc 

6""" 

7 

8# This code is released under the Python license and the BSD 2-clause license 

9 

10import codecs 

11import sys 

12 

13from future import utils 

14 

15 

16FS_ERRORS = 'surrogateescape' 

17 

18# # -- Python 2/3 compatibility ------------------------------------- 

19# FS_ERRORS = 'my_surrogateescape' 

20 

def u(text):
    """
    Return ``text`` as a text (unicode) string.

    On Python 3 the argument is handed back untouched; on Python 2 it is
    decoded with the 'unicode_escape' codec so that escapes such as
    ``\\udcff`` written in a native string become real code points.
    """
    return text if utils.PY3 else text.decode('unicode_escape')

26 

def b(data):
    """
    Return ``data`` as a byte string.

    On Python 2 the argument is handed back untouched; on Python 3 it is
    encoded with the 'latin1' codec.
    """
    if not utils.PY3:
        return data
    return data.encode('latin1')

32 

# Version-specific helpers:
#   _unichr(code)   -> one-character text string for the code point
#   bytes_chr(code) -> one-byte byte string for the integer value
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        return bytes((code,))

39 

def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383 "surrogateescape" error
    handler of Python 3.

    ``exc`` is the UnicodeDecodeError or UnicodeEncodeError handed to the
    handler by the codec machinery.  The offending slice
    ``exc.object[exc.start:exc.end]`` is converted and returned together
    with the position at which processing should resume, as the tuple
    ``(replacement, exc.end)``.

    ``exc`` itself is re-raised when it is neither of those two exception
    types, or when the slice cannot be converted (NotASurrogateError).
    """
    fragment = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The fragment is a byte-string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called: it seems that the
        # strict encoder is called to encode the unicode string that this
        # function returns.
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(fragment)
    except NotASurrogateError:
        # Not something this handler can translate: surface the original error.
        raise exc
    return (replacement, exc.end)

65 

66 

class NotASurrogateError(Exception):
    """Signals that a character or byte cannot be handled by surrogateescape."""

69 

70 

def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogate characters back to the values they stand for.

    Every character of ``mystring`` must lie in the range U+DC00..U+DCFF; it
    is replaced by the character whose code point is ``code - 0xDC00``.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Raises NotASurrogateError if any character lies outside U+DC00..U+DCFF.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only U+DC00..U+DCFF can be mapped back.  The lower bound used to be
        # 0xD800, which let high surrogates (U+D800..U+DBFF) through; they then
        # produced a negative argument for _unichr() and a ValueError instead
        # of falling back to the original exception.
        if not 0xDC00 <= code <= 0xDCFF:
            # Not an escaped surrogate. Fail with the original exception.
            raise NotASurrogateError

        # NOTE(review): CPython's own handler (Python/codecs.c) accepts only
        # U+DC80..U+DCFF.  U+DC00..U+DC7F is still accepted here to keep the
        # existing behaviour -- confirm whether that is intended.
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)

98 

99 

def replace_surrogate_decode(mybytes):
    """
    Convert undecodable bytes to a (unicode) string.

    Values 0x80..0xFF become the surrogate characters U+DC80..U+DCFF, while
    values up to 0x7F become the character with that same code point.

    Raises NotASurrogateError for any value above 0xFF.
    """
    pieces = []
    for item in mybytes:
        # Iterating newbytes yields ints, whereas a native Py2 str yields
        # one-character strings.
        code = item if isinstance(item, int) else ord(item)
        if code > 0xFF:
            raise NotASurrogateError
        if code >= 0x80:
            pieces.append(_unichr(0xDC00 + code))
        else:
            pieces.append(_unichr(code))
    return str().join(pieces)

123 

124 

def encodefilename(fn):
    """
    Encode the text filename ``fn`` to bytes according to FS_ENCODING,
    turning escaped surrogates U+DC80..U+DCFF back into bytes 0x80..0xFF.

    The 'ascii' and 'utf-8' encodings are handled character by character;
    any other encoding is delegated to ``fn.encode`` with the FS_ERRORS
    error handler.

    Raises UnicodeEncodeError for a character that cannot be represented
    (non-ASCII under 'ascii', any other surrogate under 'utf-8').
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        chunks = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if codepoint < 128:
                chunks.append(bytes_chr(codepoint))
                continue
            if 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
                continue
            raise UnicodeEncodeError(FS_ENCODING,
                                     fn, pos, pos + 1,
                                     'ordinal not in range(128)')
        return bytes().join(chunks)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        chunks = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if not 0xD800 <= codepoint <= 0xDFFF:
                chunks.append(char.encode('utf-8'))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, pos, pos + 1, 'surrogates not allowed')
        return bytes().join(chunks)

    return fn.encode(FS_ENCODING, FS_ERRORS)

163 

def decodefilename(fn):
    """
    Decode the byte-string filename ``fn`` to text using FS_ENCODING, with
    the FS_ERRORS error handler.
    """
    text = fn.decode(FS_ENCODING, FS_ERRORS)
    return text

166 

# Encoding used by encodefilename() / decodefilename(), plus a sample pair:
# ``fn`` is a raw byte filename and ``encoded`` is its escaped text form.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative settings:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name

175 

176 

def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Does nothing on Python 3, or when an error handler is already
    registered under the FS_ERRORS name.
    """
    if not utils.PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler registered under that name yet: install ours.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)

187 

188 

if __name__ == '__main__':
    # Nothing is executed when the module is run as a script.  The round-trip
    # self-test below is disabled and kept for reference only:
    #
    # register_surrogateescape()
    #
    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")
    pass

198 # # print("ok")