Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/html/__init__.py: 24%

37 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-20 07:00 +0000

1""" 

2General functions for HTML manipulation. 

3""" 

4 

5import re as _re 

6from html.entities import html5 as _html5 

7 

8 

9__all__ = ['escape', 'unescape'] 

10 

11 

def escape(s, quote=True):
    """
    Return s with HTML-significant characters replaced by safe sequences.

    The characters "&", "<" and ">" are always replaced.  When the optional
    flag quote is true (the default), the double quote (") and the single
    quote (') are replaced as well.
    """
    # Order matters: "&" has to be handled first, otherwise the ampersands
    # introduced by the later substitutions would themselves be escaped.
    substitutions = [("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")]
    if quote:
        substitutions += [('"', "&quot;"), ("'", "&#x27;")]
    for char, entity in substitutions:
        s = s.replace(char, entity)
    return s

26 

27 

# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references

# Numeric character references whose code point is not used as written:
# _replace_charref() looks the number up here first and, on a hit, returns
# the mapped string instead of chr(number).
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN

    # 0x80 - 0x8f
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>

    # 0x90 - 0x9f
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}

66 

# Code points for which _replace_charref() returns the empty string, i.e.
# a numeric reference to any of them is dropped from the output.
_invalid_codepoints = {
    # 0x0001 to 0x0008
    *range(0x1, 0x8 + 1),
    # 0x000E to 0x001F
    *range(0xe, 0x1f + 1),
    # 0x007F to 0x009F
    *range(0x7f, 0x9f + 1),
    # 0xFDD0 to 0xFDEF
    *range(0xfdd0, 0xfdef + 1),
    # others: 0xB, plus the last two code points (xFFFE, xFFFF) of each of
    # the 17 planes 0x00 .. 0x10
    0xb,
    *((plane << 16) | tail
      for plane in range(0x10 + 1)
      for tail in (0xfffe, 0xffff)),
}

89 

90 

def _replace_charref(s):
    """
    Return the replacement text for one character reference match.

    s is a match object whose group 1 holds the reference without its
    leading "&" (for example "#62;", "#x3e" or "gt;"); this function is
    used as the replacement callback of _charref.sub().

    Numeric references:
      * numbers listed in _invalid_charrefs yield the mapped string;
      * surrogates (0xD800-0xDFFF) and numbers above 0x10FFFF yield U+FFFD;
      * numbers listed in _invalid_codepoints yield the empty string;
      * anything else yields chr(number).

    Named references:
      * an exact name found in html.entities.html5 yields its value;
      * otherwise the longest prefix (at least two characters) that is a
        known name is replaced and the remaining characters are kept;
      * if nothing matches, the original text ("&" + name) is returned.
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            digits, base, max_digits = s[2:], 16, 6
        else:
            digits, base, max_digits = s[1:], 10, 7
        digits = digits.rstrip(';').lstrip('0')
        # More significant digits than 0x10FFFF (hex) / 1114111 (decimal)
        # has means the value is out of range.  Deciding that from the
        # length avoids converting arbitrarily long digit runs, which is
        # slow and, on interpreters that enforce the integer string
        # conversion length limit, makes int() raise ValueError for
        # decimal input.
        if len(digits) > max_digits:
            return '\uFFFD'
        # "or '0'" covers references that consisted only of zeros.
        num = int(digits or '0', base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)

    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard)
    for x in range(len(s) - 1, 1, -1):
        prefix = s[:x]
        if prefix in _html5:
            return _html5[prefix] + s[x:]
    # no known name at all: leave the text untouched
    return '&' + s

116 

117 

# Matches a single character reference; group 1 is everything after the
# "&", including the optional trailing ";".
_charref = _re.compile(
    r'&('
    + '|'.join((
        r'#[0-9]+;?',               # decimal numeric reference
        r'#[xX][0-9a-fA-F]+;?',     # hexadecimal numeric reference
        r'[^\t\n\f <&#;]{1,32};?',  # candidate named reference
    ))
    + r')'
)

121 

def unescape(s):
    """
    Return s with every character reference replaced by its character(s).

    Both named and numeric references (e.g. &gt;, &#62;, &#x3e;) are
    converted to the corresponding unicode characters.  Valid and invalid
    references are handled according to the rules of the HTML 5 standard,
    and names are looked up in the list of HTML 5 named character
    references defined in html.entities.html5.
    """
    # Only run the substitution when there is an ampersand to look at;
    # a string without one is returned unchanged.
    if '&' in s:
        return _charref.sub(_replace_charref, s)
    return s