Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/ftfy/bad

1r"""

2`ftfy.bad_codecs.sloppy` provides character-map encodings that fill their "holes"

3in a messy but common way: by outputting the Unicode codepoints with the same

4numbers.

6This is incredibly ugly, and it's also in the HTML5 standard.

8A single-byte encoding maps each byte to a Unicode character, except that some

9bytes are left unmapped. In the commonly-used Windows-1252 encoding, for

10example, bytes 0x81 and 0x8D, among others, have no meaning.

12Python, wanting to preserve some sense of decorum, will handle these bytes

13as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're

14different from each other. It just hasn't defined what they are in terms of

15Unicode.

17Software that has to interoperate with Windows-1252 and Unicode -- such as all

18the common Web browsers -- will pick some Unicode characters for them to map

19to, and the characters they pick are the Unicode characters with the same

20numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the

21resulting characters tend to fall into a range of Unicode that's set aside for

22obsolete Latin-1 control characters anyway.

24These sloppy codecs let Python do the same thing, thus interoperating with

25other software that works this way. This module defines sloppy versions of many

26single-byte encodings with holes. (There is no need for a sloppy version of

27an encoding without holes: for example, there is no such thing as

28sloppy-iso-8859-2 or sloppy-macroman.)

30The following encodings will become defined:

32- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)

33- sloppy-windows-1251 (Cyrillic)

34- sloppy-windows-1252 (Western European, based on Latin-1)

35- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)

36- sloppy-windows-1254 (Turkish, based on ISO-8859-9)

37- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)

38- sloppy-windows-1256 (Arabic)

39- sloppy-windows-1257 (Baltic, based on ISO-8859-13)

40- sloppy-windows-1258 (Vietnamese)

41- sloppy-cp874 (Thai, based on ISO-8859-11)

42- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)

43- sloppy-iso-8859-6 (different Arabic)

44- sloppy-iso-8859-7 (Greek)

45- sloppy-iso-8859-8 (Hebrew)

46- sloppy-iso-8859-11 (Thai)

48Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be

49defined.

51Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`)

52are used within ftfy.

54Here are some examples, using :func:`ftfy.explain_unicode` to illustrate how

55sloppy-windows-1252 merges Windows-1252 with Latin-1:

57 >>> from ftfy import explain_unicode

58 >>> some_bytes = b'\x80\x81\x82'

59 >>> explain_unicode(some_bytes.decode('latin-1'))

60 U+0080 \x80 [Cc] <unknown>

61 U+0081 \x81 [Cc] <unknown>

62 U+0082 \x82 [Cc] <unknown>

64 >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))

65 U+20AC € [Sc] EURO SIGN

66 U+FFFD � [So] REPLACEMENT CHARACTER

67 U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK

69 >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))

70 U+20AC € [Sc] EURO SIGN

71 U+0081 \x81 [Cc] <unknown>

72 U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK

73"""

75from __future__ import annotations

77import codecs

78from encodings import normalize_encoding

REPLACEMENT_CHAR = "\ufffd"


def make_sloppy_codec(encoding: str) -> codecs.CodecInfo:
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes.

    `encoding` is the name of an existing single-byte codec, such as
    "windows-1252". The return value is a `codecs.CodecInfo` whose name is
    "sloppy-" followed by `encoding`.

    Raises `ValueError` if `encoding` does not decode the 256 possible bytes
    to exactly 256 characters, which means it is not a single-byte encoding
    and no character map can be built for it. An unknown codec name raises
    `LookupError` from the decoding step, as it always did.
    """
    # Make a bytestring of all 256 possible bytes.
    all_bytes = bytes(range(256))

    # Get what they decode to in the given encoding. Use the replacement
    # character for unassigned bytes.
    decoded_chars = all_bytes.decode(encoding, errors="replace")

    # A character map needs exactly one character per byte. A multi-byte or
    # stateful encoding yields a different number of characters, and pairing
    # those up with byte values by position would build a meaningless table.
    if len(decoded_chars) != len(all_bytes):
        raise ValueError(
            f"{encoding!r} is not a single-byte encoding, "
            "so a sloppy version of it cannot be made"
        )

    # Each byte that was successfully decoded keeps its decoded value. The
    # unassigned bytes fall back to their decoding in Latin-1, which is the
    # Unicode codepoint with the same number as the byte.
    sloppy_chars = [
        char if char != REPLACEMENT_CHAR else latin1_char
        for char, latin1_char in zip(decoded_chars, all_bytes.decode("latin-1"))
    ]

    # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute"
    # control code, to encode the Unicode replacement character U+FFFD.
    sloppy_chars[0x1A] = REPLACEMENT_CHAR

    # Create the data structures that tell the charmap methods how to encode
    # and decode in this sloppy encoding.
    decoding_table = "".join(sloppy_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # Now produce all the class boilerplate. Look at the Python source for
    # `encodings.cp1252` for comparison; this is almost exactly the same,
    # except I made it follow pep8.
    class Codec(codecs.Codec):
        def encode(self, input: str, errors: str | None = "strict") -> tuple[bytes, int]:
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input: bytes, errors: str | None = "strict") -> tuple[str, int]:
            return codecs.charmap_decode(input, errors, decoding_table)  # type: ignore[arg-type]

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input: str, final: bool = False) -> bytes:
            return codecs.charmap_encode(input, self.errors, encoding_table)[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input: bytes, final: bool = False) -> str:  # type: ignore[override]
            return codecs.charmap_decode(input, self.errors, decoding_table)[0]  # type: ignore[arg-type]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    # The Codec methods keep no per-instance state, so a single instance can
    # supply both of the stateless entry points.
    stateless_codec = Codec()
    return codecs.CodecInfo(
        name="sloppy-" + encoding,
        encode=stateless_codec.encode,
        decode=stateless_codec.decode,  # type: ignore[arg-type]
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )

152

153

# Every encoding listed here leaves some bytes unassigned, so each one gets a
# sloppy counterpart. The resulting CODECS dictionary, keyed by normalized
# codec name, can be used by the main module of ftfy.bad_codecs.
INCOMPLETE_ENCODINGS = [
    *(f"windows-{num}" for num in range(1250, 1259)),
    *(f"iso-8859-{num}" for num in (3, 6, 7, 8, 11)),
    *(f"cp{num}" for num in range(1250, 1259)),
    "cp874",
]
CODECS = {}

for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding(f"sloppy-{_encoding}")
    CODECS[_new_name] = make_sloppy_codec(_encoding)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/ftfy/bad_codecs/sloppy.py: 92%

36 statements