Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/webencodings/__init__.py: 13%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

120 statements  

1# coding: utf-8 

2""" 

3 

4 webencodings 

5 ~~~~~~~~~~~~ 

6 

7 This is a Python implementation of the `WHATWG Encoding standard 

8 <http://encoding.spec.whatwg.org/>`_. See README for details. 

9 

10 :copyright: Copyright 2012 by Simon Sapin 

11 :license: BSD, see LICENSE for details. 

12 

13""" 

14 

15from __future__ import unicode_literals 

16 

17import codecs 

18 

19from .labels import LABELS 

20 

21 

#: Version string of this package.
VERSION = '0.5.1'


# A handful of canonical names from the Encoding standard are not accepted
# by Python's codec registry; map each of them to the Python codec name
# that should be looked up instead.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Memo of already-built encoding objects, filled in lazily by lookup().
CACHE = {}

33 

34 

def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affect non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Lone surrogate code points are passed through unchanged
    instead of raising :exc:`UnicodeEncodeError`.

    """
    # This turns out to be faster than unicode.translate():
    # bytes.lower() only touches the ASCII bytes A-Z, so every multi-byte
    # UTF-8 sequence survives the round trip untouched.
    # 'surrogatepass' lets unpaired surrogates make the same round trip;
    # with the default 'strict' handler they would abort the encode step.
    lowered = string.encode('utf8', 'surrogatepass').lower()
    return lowered.decode('utf8', 'surrogatepass')

59 

60 

def lookup(label):
    """
    Find the encoding that a label refers to.

    Implements the spec's `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
    normalized = ascii_lower(label.strip('\t\n\f\r '))
    canonical = LABELS.get(normalized)
    if canonical is None:
        return None

    cached = CACHE.get(canonical)
    if cached is not None:
        return cached

    # First request for this encoding: resolve its codec and memoize it.
    if canonical == 'x-user-defined':
        from .x_user_defined import codec_info
    else:
        # Any name that gets to here should be valid for codecs.lookup()
        # once the few non-Python names have been remapped.
        codec_info = codecs.lookup(PYTHON_NAMES.get(canonical, canonical))
    created = Encoding(canonical, codec_info)
    CACHE[canonical] = created
    return created

89 

90 

91def _get_encoding(encoding_or_label): 

92 """ 

93 Accept either an encoding object or label. 

94 

95 :param encoding: An :class:`Encoding` object or a label string. 

96 :returns: An :class:`Encoding` object. 

97 :raises: :exc:`~exceptions.LookupError` for an unknown label. 

98 

99 """ 

100 if hasattr(encoding_or_label, 'codec_info'): 

101 return encoding_or_label 

102 

103 encoding = lookup(encoding_or_label) 

104 if encoding is None: 

105 raise LookupError('Unknown encoding label: %r' % encoding_or_label) 

106 return encoding 

107 

108 

class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        # Plain value holder: both arguments are stored unchanged.
        self.name, self.codec_info = name, codec_info

    def __repr__(self):
        template = '<Encoding %s>'
        return template % self.name

130 

131 

#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# The two UTF-16 flavours, looked up once at import time.
_UTF16LE, _UTF16BE = lookup('utf-16le'), lookup('utf-16be')

137 

138 

def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single byte string, honouring a leading BOM if there is one.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback first so that an invalid label fails early,
    # even when the input carries a BOM and the fallback ends up unused.
    fallback = _get_encoding(fallback_encoding)
    detected, payload = _detect_bom(input)
    chosen = detected if detected else fallback
    result = chosen.codec_info.decode(payload, errors)
    # The codec returns (text, length consumed); only the text is wanted.
    return result[0], chosen

159 

160 

def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input.

    ``bom_encoding`` is :obj:`None` when the input does not start with
    a UTF-16LE, UTF-16BE or UTF-8 byte order mark; the input is then
    returned unchanged.

    """
    known_boms = (
        (b'\xFF\xFE', _UTF16LE),
        (b'\xFE\xFF', _UTF16BE),
        (b'\xEF\xBB\xBF', UTF8),
    )
    for bom, bom_encoding in known_boms:
        if input.startswith(bom):
            return bom_encoding, input[len(bom):]
    return None, input

170 

171 

def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single Unicode string to bytes.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec = _get_encoding(encoding).codec_info
    # The codec returns (bytes, length consumed); only the bytes are wanted.
    return codec.encode(input, errors)[0]

184 

185 

def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    chunks = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The generator's first item is the encoding itself, not decoded text;
    # pull it out here so that the caller only ever iterates over text.
    detected = next(chunks)
    return chunks, detected

212 

213 

214def _iter_decode_generator(input, decoder): 

215 """Return a generator that first yields the :obj:`Encoding`, 

216 then yields output chukns as Unicode strings. 

217 

218 """ 

219 decode = decoder.decode 

220 input = iter(input) 

221 for chunck in input: 

222 output = decode(chunck) 

223 if output: 

224 assert decoder.encoding is not None 

225 yield decoder.encoding 

226 yield output 

227 break 

228 else: 

229 # Input exhausted without determining the encoding 

230 output = decode(b'', final=True) 

231 assert decoder.encoding is not None 

232 yield decoder.encoding 

233 if output: 

234 yield output 

235 return 

236 

237 for chunck in input: 

238 output = decode(chunck) 

239 if output: 

240 yield output 

241 output = decode(b'', final=True) 

242 if output: 

243 yield output 

244 

245 

def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Build the encoder here rather than inside the generator so that
    # an invalid label fails right away, not on the first iteration.
    encoder = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, encoder.encode)

260 

261 

262def _iter_encode_generator(input, encode): 

263 for chunck in input: 

264 output = encode(chunck) 

265 if output: 

266 yield output 

267 output = encode('', final=True) 

268 if output: 

269 yield output 

270 

271 

class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    def __init__(self, fallback_encoding, errors='replace'):
        # Fail early if `fallback_encoding` is an invalid label.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still undecided whether a BOM follows.
        self._buffer = b''
        # Bound ``decode`` of the codec's incremental decoder,
        # set once the encoding has been decided.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A Unicode string.

        """
        if self._decoder is not None:
            # Encoding already decided: hand the chunk straight over.
            return self._decoder(input, final)

        data = self._buffer + input
        detected, data = _detect_bom(data)
        if detected is None:
            if not final and len(data) < 3:
                # Not enough data yet to rule out a BOM; keep waiting.
                self._buffer = data
                return ''
            # No BOM
            detected = self._fallback_encoding
        make_decoder = detected.codec_info.incrementaldecoder
        self._decoder = make_decoder(self._errors).decode
        self.encoding = detected
        return self._decoder(data, final)

321 

322 

class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        # ``encode`` is not defined on the class: each instance exposes the
        # bound method of the codec's own incremental encoder directly.
        self.encode = codec_info.incrementalencoder(errors).encode