Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/html/__init__.py

"""
General functions for HTML manipulation.
"""

import re as _re
from html.entities import html5 as _html5


# Public API of this module.
__all__ = ['escape', 'unescape']

def escape(s, quote=True):
    """
    Replace special characters "&", "<" and ">" to HTML-safe sequences.

    If the optional flag quote is true (the default), the quotation mark
    characters, both double quote (") and single quote (') characters are also
    translated.

    Returns the escaped copy of s.
    """
    # The ampersand must be replaced first: every other replacement introduces
    # new "&" characters that must not be escaped a second time.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    if quote:
        s = s.replace('"', "&quot;")
        s = s.replace('\'', "&#x27;")
    return s

# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references

# Numeric character references whose code point is a key of this table are
# replaced by the mapped character instead of chr(code point).
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}

# Code points for which a numeric character reference expands to the empty
# string (checked after the _invalid_charrefs replacements).
_invalid_codepoints = {
    # 0x0001 to 0x0008
    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
    # 0x000E to 0x001F
    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    # 0x007F to 0x009F
    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    # 0xFDD0 to 0xFDEF
    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
    # others
    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
    0x10fffe, 0x10ffff
}

def _replace_charref(s):
    """
    Return the replacement text for one character reference.

    s is a regex match object whose group 1 is the reference text without
    its leading "&" (for example "#62;", "#x3e" or "gt;").  Intended for use
    as the replacement callback of _charref.sub().
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref: hexadecimal when prefixed by "x"/"X", else decimal
        if s[1] in 'xX':
            num = int(s[2:].rstrip(';'), 16)
        else:
            num = int(s[1:].rstrip(';'))
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        # surrogates and values beyond the Unicode range cannot be
        # represented; substitute U+FFFD REPLACEMENT CHARACTER
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)

    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard); whatever
    # follows the matched name is kept verbatim
    for x in range(len(s)-1, 1, -1):
        if s[:x] in _html5:
            return _html5[s[:x]] + s[x:]
    # no known name is a prefix: leave the reference untouched
    return '&' + s

# Matches one character reference; group 1 is the text after the "&":
# a decimal reference, a hexadecimal reference, or a candidate entity name
# of 1 to 32 characters, each with an optional trailing ";".
_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')

def unescape(s):
    """
    Convert all named and numeric character references (e.g. &gt;, &#62;,
    &#x3e;) in the string s to the corresponding unicode characters.
    This function uses the rules defined by the HTML 5 standard
    for both valid and invalid character references, and the list of
    HTML 5 named character references defined in html.entities.html5.
    """
    # Fast path: without an ampersand there is nothing to replace, so the
    # input is returned unchanged and the regex is never run.
    if '&' not in s:
        return s
    return _charref.sub(_replace_charref, s)

Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/html/__init__.py: 24%

37 statements