Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/ftfy/bad_codecs/utf8

1r"""

2This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can

3decode text that's been encoded with a popular non-standard version of UTF-8.

4This includes CESU-8, the accidental encoding made by layering UTF-8 on top of

5UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for

6codepoint 0.

8This is particularly relevant in Python 3, which provides no other way of

9decoding CESU-8 [1]_.

11The easiest way to use the codec is to simply import `ftfy.bad_codecs`:

13 >>> import ftfy.bad_codecs

14 >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')

15 >>> print(repr(result).lstrip('u'))

16 'here comes a null! \x00'

18The codec does not at all enforce "correct" CESU-8. For example, the Unicode

19Consortium's not-quite-standard describing CESU-8 requires that there is only

20one possible encoding of any character, so it does not allow mixing of valid

21UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8

22decoder does.

24Characters in the Basic Multilingual Plane still have only one encoding. This

25codec still enforces the rule, within the BMP, that characters must appear in

26their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`,

27instead of just `0x00`, may be used to encode the null character `U+0000`, like

28in Java.

30If you encode with this codec, you get legitimate UTF-8. Decoding with this

31codec and then re-encoding is not idempotent, although encoding and then

32decoding is. So this module won't produce CESU-8 for you. Look for that

33functionality in the sister module, "Breaks Text For You", coming approximately

34never.

36.. [1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec:

37 first decode the bytes (incorrectly), then encode them, then decode them

38 again, using UTF-8 as the codec every time. But Python 2 is dead, so use

39 ftfy instead.

40"""

42import re

43import codecs

44from typing import Tuple

45from encodings.utf_8 import (

46 IncrementalDecoder as UTF8IncrementalDecoder,

47 IncrementalEncoder as UTF8IncrementalEncoder,

48)

NAME = "utf-8-variants"

# This regular expression matches all possible six-byte CESU-8 sequences,
# plus truncations of them at the end of the string. (If any of the
# subgroups matches the end of the string, then all the subgroups after it
# also have to match the end, as there are no more characters to match.)
#
# The end-of-string alternative is spelled `\Z`, not `$`: in Python's `re`
# module `$` also matches just before a newline that ends the string, which
# would make input such as b"\xc0\n" look like a truncated special sequence.
CESU8_EXPR = (
    b"("
    b"\xed"
    b"([\xa0-\xaf]|\\Z)"
    b"([\x80-\xbf]|\\Z)"
    b"(\xed|\\Z)"
    b"([\xb0-\xbf]|\\Z)"
    b"([\x80-\xbf]|\\Z)"
    b")"
)

CESU8_RE = re.compile(CESU8_EXPR)

# This expression matches isolated UTF-8-encoded surrogates (or truncations
# of them at the end of the string) that are not part of a CESU-8 pair.
SURROGATE_EXPR = b"(\xed([\xa0-\xbf]|\\Z)([\x80-\xbf]|\\Z))"

# This expression matches the Java encoding of U+0, including if it's
# truncated and we need more bytes.
NULL_EXPR = b"(\xc0(\x80|\\Z))"

# This regex matches cases that we need to decode differently from
# standard UTF-8.
SPECIAL_BYTES_RE = re.compile(b"|".join([NULL_EXPR, CESU8_EXPR, SURROGATE_EXPR]))

class IncrementalDecoder(UTF8IncrementalDecoder):
    """
    An incremental decoder that extends Python's built-in UTF-8 decoder.

    This decoder needs to take in bytes, possibly arriving in a stream, and
    output the correctly decoded text. The general strategy for doing this
    is to fall back on the real UTF-8 decoder whenever possible, because
    the real UTF-8 decoder is way optimized, but to call specialized methods
    we define here for the cases the real decoder isn't expecting.
    """

    def _buffer_decode(
        self, input: bytes, errors: str, final: bool
    ) -> Tuple[str, int]:
        """
        Decode bytes that may be arriving in a stream, following the Codecs
        API.

        `input` is the incoming sequence of bytes. `errors` tells us how to
        handle errors, though we delegate all error-handling cases to the real
        UTF-8 decoder to ensure correct behavior. `final` indicates whether
        this is the end of the sequence, in which case we should raise an
        error given incomplete input.

        Returns as much decoded text as possible, and the number of bytes
        consumed.
        """
        # decoded_segments are the pieces of text we have decoded so far,
        # and position is our current position in the byte string. (Bytes
        # before this position have been consumed, and bytes after it have
        # yet to be decoded.)
        decoded_segments = []
        position = 0
        while True:
            # Use _buffer_decode_step to decode a segment of text.
            decoded, consumed = self._buffer_decode_step(
                input[position:], errors, final
            )
            if consumed == 0:
                # Either there's nothing left to decode, or we need to wait
                # for more input. Either way, we're done for now.
                break

            # Append the decoded text to the list, and update our position.
            decoded_segments.append(decoded)
            position += consumed

        if final:
            # _buffer_decode_step must consume all the bytes when `final` is
            # true. This is an internal invariant, not input validation.
            assert position == len(input)

        return "".join(decoded_segments), position

    def _buffer_decode_step(
        self, input: bytes, errors: str, final: bool
    ) -> Tuple[str, int]:
        """
        There are three possibilities for each decoding step:

        - Decode as much real UTF-8 as possible.
        - Decode a six-byte CESU-8 sequence at the current position.
        - Decode a Java-style null at the current position.

        This method figures out which step is appropriate, and does it.
        It returns the decoded text and the number of bytes consumed; zero
        bytes consumed means "nothing to do yet".
        """
        # Get a reference to the superclass method that we'll be using for
        # most of the real work.
        sup = UTF8IncrementalDecoder._buffer_decode

        # Find the next byte position that indicates a variant of UTF-8.
        match = SPECIAL_BYTES_RE.search(input)
        if match is None:
            return sup(input, errors, final)

        cutoff = match.start()
        if cutoff > 0:
            # Everything before the special sequence is self-contained, so
            # it is decoded as a complete (final) chunk of ordinary UTF-8.
            return sup(input[:cutoff], errors, True)

        # Some byte sequence that we intend to handle specially matches
        # at the beginning of the input.
        if input.startswith(b"\xc0"):
            if input.startswith(b"\xc0\x80"):
                # Decode the two-byte sequence 0xc0 0x80.
                return "\u0000", 2
            if len(input) > 1:
                # 0xc0 is followed by something other than 0x80, so this is
                # not a Java-style null. (This can happen if the pattern's
                # end anchor matched just before a trailing newline.) Never
                # swallow the following byte: hand the lone 0xc0 to the real
                # decoder, which applies the requested error handling.
                return sup(input[:1], errors, True)
            if final:
                # We hit the end of the stream. Let the superclass method
                # handle it.
                return sup(input, errors, True)
            # Wait to see another byte.
            return "", 0

        # Decode a possible six-byte sequence starting with 0xed.
        return self._buffer_decode_surrogates(sup, input, errors, final)

    @staticmethod
    def _buffer_decode_surrogates(
        sup, input: bytes, errors: str, final: bool
    ) -> Tuple[str, int]:
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.

        `sup` is the standard UTF-8 decoding function to delegate to when
        the bytes turn out not to be a CESU-8 sequence.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method to
                # handle it as normal UTF-8. It might be a Hangul character
                # or an error.
                return sup(input, errors, final)
            # We found a surrogate, the stream isn't over yet, and we don't
            # know enough of the following bytes to decode anything, so
            # consume zero bytes and wait.
            return "", 0

        # Require the pattern to account for all six bytes. A plain `match`
        # could also succeed on a truncated form of the pattern, in which
        # case bytes that are not part of a CESU-8 sequence would be folded
        # into the codepoint below.
        if CESU8_RE.fullmatch(input[:6]):
            # Given this is a CESU-8 sequence, do some math to pull out
            # the intended 20-bit value, and consume six bytes.
            codepoint = (
                ((input[1] & 0x0F) << 16)
                + ((input[2] & 0x3F) << 10)
                + ((input[4] & 0x0F) << 6)
                + (input[5] & 0x3F)
                + 0x10000
            )
            return chr(codepoint), 6

        # This looked like a CESU-8 sequence, but it wasn't one.
        # 0xed indicates the start of a three-byte sequence, so give
        # three bytes to the superclass to decode as usual.
        return sup(input[:3], errors, False)

219

220

# Encoding needs no special treatment: the standard UTF-8 incremental
# encoder is reused as-is under this module's name.
IncrementalEncoder = UTF8IncrementalEncoder

223

224

class StreamWriter(codecs.StreamWriter):
    """Stream writer whose `encode` delegates to `IncrementalEncoder`."""

    @staticmethod
    def encode(input: str, errors: str = "strict") -> Tuple[bytes, int]:
        """
        Encode all of `input` in a single, final step.

        Returns the encoded bytes together with the number of characters
        of `input` that were consumed, which is always all of them.
        """
        encoder = IncrementalEncoder(errors)
        encoded = encoder.encode(input, final=True)
        return encoded, len(input)

229

230

class StreamReader(codecs.StreamReader):
    """Stream reader whose `decode` delegates to `IncrementalDecoder`."""

    @staticmethod
    def decode(input: bytes, errors: str = "strict") -> Tuple[str, int]:
        """
        Decode all of `input` in a single, final step.

        The decoder is always run with `final=True`, so an incomplete
        sequence at the end of `input` is handled as end-of-stream rather
        than held back. Returns the decoded text together with the number
        of bytes consumed, which is reported as the full length of `input`.
        """
        decoder = IncrementalDecoder(errors)
        text = decoder.decode(input, final=True)
        return text, len(input)

235

236

# Bundle the codec's name with its stateless, incremental, and stream
# entry points in a single CodecInfo record.
CODEC_INFO = codecs.CodecInfo(
    encode=StreamWriter.encode,
    decode=StreamReader.decode,
    streamreader=StreamReader,
    streamwriter=StreamWriter,
    incrementalencoder=IncrementalEncoder,
    incrementaldecoder=IncrementalDecoder,
    name=NAME,
)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/ftfy/bad_codecs/utf8_variants.py: 77%

57 statements