Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/ftfy/bad_codecs/utf8_variants.py: 69%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

62 statements  

1r""" 

2This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can 

3decode text that's been encoded with a popular non-standard version of UTF-8. 

4This includes CESU-8, the accidental encoding made by layering UTF-8 on top of 

5UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for 

6codepoint 0. 

7 

8This is particularly relevant in Python 3, which provides no other way of 

9decoding CESU-8 [1]_. 

10 

11The easiest way to use the codec is to simply import `ftfy.bad_codecs`: 

12 

13 >>> import ftfy.bad_codecs 

14 >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var') 

15 >>> print(repr(result).lstrip('u')) 

16 'here comes a null! \x00' 

17 

18The codec does not at all enforce "correct" CESU-8. For example, the Unicode 

19Consortium's not-quite-standard describing CESU-8 requires that there is only 

20one possible encoding of any character, so it does not allow mixing of valid 

21UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8 

22decoder does. 

23 

24Characters in the Basic Multilingual Plane still have only one encoding. This 

25codec still enforces the rule, within the BMP, that characters must appear in 

26their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`, 

27instead of just `0x00`, may be used to encode the null character `U+0000`, like 

28in Java. 

29 

30If you encode with this codec, you get legitimate UTF-8. Decoding with this 

31codec and then re-encoding is not idempotent, although encoding and then 

32decoding is. So this module won't produce CESU-8 for you. Look for that 

33functionality in the sister module, "Breaks Text For You", coming approximately 

34never. 

35 

36.. [1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: 

37 first decode the bytes (incorrectly), then encode them, then decode them 

38 again, using UTF-8 as the codec every time. But Python 2 is dead, so use 

39 ftfy instead. 

40""" 

41 

42import codecs 

43import re 

44from encodings.utf_8 import ( 

45 IncrementalDecoder as UTF8IncrementalDecoder, 

46) 

47from encodings.utf_8 import ( 

48 IncrementalEncoder as UTF8IncrementalEncoder, 

49) 

50from typing import Callable, Optional 

51 

# The name under which this codec is registered.
NAME = "utf-8-variants"

# This regular expression matches all possible six-byte CESU-8 sequences,
# plus truncations of them at the end of the input. (If any of the
# subgroups matches the end of the input, then all the subgroups after it
# also have to match the end, as there are no more bytes to match.)
#
# The end of the input is spelled `\Z`, not `$`: `$` also matches just
# before a trailing newline, which would make `b"\xed\xa0\x80\xed\xb0\n"`
# look like a complete (truncated-then-finished) sequence and cause the
# newline to be consumed as if it were the sixth byte. The backslash is
# doubled because these literals are deliberately not raw (the `\x..`
# escapes must be interpreted by Python, while `\Z` must reach `re`).
CESU8_EXPR = (
    b"("
    b"\xed"
    b"([\xa0-\xaf]|\\Z)"
    b"([\x80-\xbf]|\\Z)"
    b"(\xed|\\Z)"
    b"([\xb0-\xbf]|\\Z)"
    b"([\x80-\xbf]|\\Z)"
    b")"
)

CESU8_RE = re.compile(CESU8_EXPR)

# This expression matches an isolated UTF-8-encoded surrogate (or a
# truncation of one at the end of the input) that isn't part of a CESU-8
# pair. These were originally singled out because they had to be handled
# carefully on Python 2.
SURROGATE_EXPR = b"(\xed([\xa0-\xbf]|\\Z)([\x80-\xbf]|\\Z))"

# This expression matches the Java encoding of U+0, including if it's
# truncated at the very end of the input and we need more bytes.
NULL_EXPR = b"(\xc0(\x80|\\Z))"

# This regex matches cases that we need to decode differently from
# standard UTF-8.
SPECIAL_BYTES_RE = re.compile(b"|".join([NULL_EXPR, CESU8_EXPR, SURROGATE_EXPR]))

82 

83 

class IncrementalDecoder(UTF8IncrementalDecoder):
    """
    An incremental decoder that extends Python's built-in UTF-8 decoder.

    This decoder needs to take in bytes, possibly arriving in a stream, and
    output the correctly decoded text. The general strategy for doing this
    is to fall back on the real UTF-8 decoder whenever possible, because
    the real UTF-8 decoder is way optimized, but to call specialized methods
    we define here for the cases the real decoder isn't expecting.
    """

    @staticmethod
    def _buffer_decode(  # type: ignore[override]
        input: bytes, errors: Optional[str], final: bool
    ) -> tuple[str, int]:
        """
        Decode bytes that may be arriving in a stream, following the Codecs
        API.

        `input` is the incoming sequence of bytes. `errors` tells us how to
        handle errors, though we delegate all error-handling cases to the real
        UTF-8 decoder to ensure correct behavior. `final` indicates whether
        this is the end of the sequence, in which case we should raise an
        error given incomplete input.

        Returns as much decoded text as possible, and the number of bytes
        consumed.
        """
        # decoded_segments are the pieces of text we have decoded so far,
        # and position is our current position in the byte string. (Bytes
        # before this position have been consumed, and bytes after it have
        # yet to be decoded.)
        decoded_segments: list[str] = []
        position = 0
        while True:
            # Use _buffer_decode_step to decode a segment of text.
            decoded, consumed = IncrementalDecoder._buffer_decode_step(
                input[position:], errors, final
            )
            if consumed == 0:
                # Either there's nothing left to decode, or we need to wait
                # for more input. Either way, we're done for now.
                break

            # Append the decoded text to the list, and update our position.
            decoded_segments.append(decoded)
            position += consumed

        if final:
            # _buffer_decode_step must consume all the bytes when `final` is
            # true. This is an internal invariant, not input validation.
            assert position == len(input)

        return "".join(decoded_segments), position

    @staticmethod
    def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> tuple[str, int]:
        """
        There are three possibilities for each decoding step:

        - Decode as much real UTF-8 as possible.
        - Decode a six-byte CESU-8 sequence at the current position.
        - Decode a Java-style null at the current position.

        This method figures out which step is appropriate, and does it.

        Returns the text decoded by this step and the number of bytes it
        consumed; `("", 0)` means there is nothing to do until more input
        arrives.
        """
        # Get a reference to the superclass method that we'll be using for
        # most of the real work.
        sup = UTF8IncrementalDecoder._buffer_decode

        # Find the next byte position that indicates a variant of UTF-8.
        match = SPECIAL_BYTES_RE.search(input)
        if match is None:
            return sup(input, errors, final)

        cutoff = match.start()
        if cutoff > 0:
            # Everything before the special sequence is ordinary UTF-8 and is
            # complete as far as this step is concerned, hence final=True.
            return sup(input[:cutoff], errors, True)

        # Some byte sequence that we intend to handle specially matches
        # at the beginning of the input.
        if input.startswith(b"\xc0"):
            if input.startswith(b"\xc0\x80"):
                # Decode the two-byte sequence 0xc0 0x80.
                return "\u0000", 2
            if final or len(input) > 1:
                # Either we hit the end of the stream, or 0xc0 is followed by
                # a byte other than 0x80 (possible if the pattern's end
                # anchor accepted a position just before a trailing
                # newline). Let the superclass method handle the lone 0xc0;
                # the following byte, if any, is decoded on the next step
                # instead of being swallowed.
                return sup(input[:1], errors, True)
            # Wait to see another byte.
            return "", 0
        # Decode a possible six-byte sequence starting with 0xed.
        return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final)

    @staticmethod
    def _buffer_decode_surrogates(
        sup: Callable[[bytes, Optional[str], bool], tuple[str, int]],
        input: bytes,
        errors: Optional[str],
        final: bool,
    ) -> tuple[str, int]:
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.

        `sup` is the standard UTF-8 buffer-decoding function to delegate to,
        `input` starts with the 0xed byte in question, and `errors` / `final`
        have the same meaning as in `_buffer_decode`. Returns the decoded
        text and the number of bytes consumed.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method to
                # handle it as normal UTF-8. It might be a Hangul character
                # or an error. Only the (at most) three bytes belonging to
                # the 0xed sequence are handed over, so that any special
                # sequence following it still gets our own treatment on the
                # next step.
                return sup(input[:3], errors, True)
            # We found a surrogate, the stream isn't over yet, and we don't
            # know enough of the following bytes to decode anything, so
            # consume zero bytes and wait.
            return "", 0
        cesu8_match = CESU8_RE.match(input)
        # Require that all six bytes really matched: the pattern also accepts
        # truncated sequences, and a truncated match must not be decoded as
        # if the bytes after it belonged to the sequence.
        if cesu8_match is not None and cesu8_match.end() == 6:
            # Given this is a CESU-8 sequence, do some math to pull out
            # the intended 20-bit value, and consume six bytes.
            codepoint = (
                ((input[1] & 0x0F) << 16)
                + ((input[2] & 0x3F) << 10)
                + ((input[4] & 0x0F) << 6)
                + (input[5] & 0x3F)
                + 0x10000
            )
            return chr(codepoint), 6
        # This looked like a CESU-8 sequence, but it wasn't one.
        # 0xed indicates the start of a three-byte sequence, so give
        # three bytes to the superclass to decode as usual.
        return sup(input[:3], errors, False)

224 

225 

# Encoding needs no special treatment: this codec always emits standard
# UTF-8, so the stock incremental encoder is reused as-is.
IncrementalEncoder = UTF8IncrementalEncoder


class StreamWriter(codecs.StreamWriter):
    """Stream writer whose `encode` produces plain UTF-8."""

    @staticmethod
    def encode(input: str, errors: str = "strict") -> tuple[bytes, int]:
        """
        Encode all of `input` in one shot.

        Returns the encoded bytes together with the number of characters of
        `input` that were consumed, which is always `len(input)`.
        """
        encoder = IncrementalEncoder(errors)
        payload = encoder.encode(input, final=True)
        return payload, len(input)

234 

235 

class StreamReader(codecs.StreamReader):
    """Stream reader whose `decode` goes through `IncrementalDecoder`."""

    @staticmethod
    def decode(input: bytes, errors: str = "strict") -> tuple[str, int]:
        """
        Decode all of `input` in one shot, treating it as complete.

        Returns the decoded text together with the number of bytes of
        `input` that were consumed, which is always `len(input)`.
        """
        decoder = IncrementalDecoder(errors)
        text = decoder.decode(input, final=True)
        return text, len(input)

240 

241 

# The codec registration record: it bundles the stateless, incremental and
# stream-based entry points defined in this module under the codec's name.
CODEC_INFO = codecs.CodecInfo(
    name=NAME,
    # Stateless one-shot functions.
    encode=StreamWriter.encode,
    decode=StreamReader.decode,  # type: ignore[arg-type]
    # Stream wrappers.
    streamwriter=StreamWriter,
    streamreader=StreamReader,
    # Incremental (chunk-at-a-time) classes.
    incrementalencoder=IncrementalEncoder,
    incrementaldecoder=IncrementalDecoder,
)