Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/html/__init__.py: 24%
37 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-20 07:00 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-20 07:00 +0000
1"""
2General functions for HTML manipulation.
3"""
5import re as _re
6from html.entities import html5 as _html5
9__all__ = ['escape', 'unescape']
def escape(s, quote=True):
    """
    Replace special characters "&", "<" and ">" to HTML-safe sequences.

    If the optional flag quote is true (the default), the quotation mark
    characters, both double quote (") and single quote (') characters are also
    translated.

    A new string is returned; the argument itself is left untouched.
    """
    # "&" must be handled first: every other replacement inserts an "&" of
    # its own, which would otherwise be escaped a second time.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    if quote:
        s = s.replace('"', "&quot;")
        s = s.replace("'", "&#x27;")
    return s
# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references

# Numeric character references that get a fixed substitute instead of the
# code point they literally name.  Keys are the referenced numbers, values
# are the text emitted in their place.  _replace_charref() checks this table
# before any other rule is applied to a numeric reference.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
# Code points for which a numeric character reference produces no output at
# all: _replace_charref() returns '' for them (after the _invalid_charrefs
# table and the surrogate / out-of-range check have had their turn).
_invalid_codepoints = {
    # 0x0001 to 0x0008
    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
    # 0x000E to 0x001F
    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    # 0x007F to 0x009F
    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    # 0xFDD0 to 0xFDEF
    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
    # others: 0xB plus the last two code points of every plane
    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
    0x10fffe, 0x10ffff,
}
def _replace_charref(s):
    """
    Return the replacement text for a single character reference.

    s is a match object produced by the _charref pattern; its group 1 holds
    the reference without the leading "&" (a trailing ";" may be present).
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            digits, base, max_digits = s[2:], 16, 6   # 0x10FFFF: 6 digits
        else:
            digits, base, max_digits = s[1:], 10, 7   # 1114111: 7 digits
        # Leading zeros carry no value; without them the digit count alone
        # tells whether the number can still be <= 0x10FFFF.
        digits = digits.rstrip(';').lstrip('0')
        if len(digits) > max_digits:
            # Beyond the Unicode range whatever the digits are.  Deciding
            # here avoids int() on an arbitrarily long digit run, which is
            # slow for decimal input and raises ValueError on interpreters
            # that enforce the integer string conversion length limit.
            return '\uFFFD'
        num = int(digits or '0', base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)

    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard); whatever
    # follows the recognised prefix is kept as literal text
    for x in range(len(s) - 1, 1, -1):
        if s[:x] in _html5:
            return _html5[s[:x]] + s[x:]
    # not a known name: put the reference back unchanged
    return '&' + s


# One character reference: decimal, hexadecimal, or a name of up to 32
# characters, each with an optional terminating ";".  Group 1 is everything
# after the "&".
_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')


def unescape(s):
    """
    Convert all named and numeric character references (e.g. &gt;, &#62;,
    &#x3e;) in the string s to the corresponding unicode characters.
    This function uses the rules defined by the HTML 5 standard
    for both valid and invalid character references, and the list of
    HTML 5 named character references defined in html.entities.html5.
    """
    # Fast path: without an "&" there is nothing to substitute.
    if '&' not in s:
        return s
    return _charref.sub(_replace_charref, s)