Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/openid/yadis/parsehtml.py: 62%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

87 statements  

# Public API of this module.
__all__ = ['findHTMLMeta', 'MetaNotFound']

2 

from html.parser import HTMLParser
import html.entities
import re
import sys

from openid.yadis.constants import YADIS_HEADER_NAME

9 

# Size of the chunks to search at a time (also the amount that gets
# read from the stream at a time)
CHUNK_SIZE = 1024 * 16  # 16 KB

13 

14 

class ParseDone(Exception):
    """Exception to hold the URI that was located when the parse is
    finished. If the parse finishes without finding the URI, set it to
    None.

    The URI (or None) is carried as the first positional argument, so
    it is available as ``exc.args[0]``.
    """

19 

20 

class MetaNotFound(Exception):
    """Exception to hold the content of the page if we did not find
    the appropriate <meta> tag.

    The page content is carried as the first positional argument
    (``exc.args[0]``).
    """

24 

25 

# Flags for the entity pattern: VERBOSE lets the pattern below be laid
# out with insignificant whitespace; IGNORECASE makes the hex digits and
# the ``x`` marker case-insensitive.
re_flags = re.IGNORECASE | re.UNICODE | re.VERBOSE

# Matches one HTML character/entity reference. Exactly one of the named
# groups participates in a match:
#   hex  -- digits of a hexadecimal numeric reference (&#x41;)
#   dec  -- digits of a decimal numeric reference (&#65;)
#   word -- name of a named entity (&amp;)
ent_pat = r'''
&

(?: \#x (?P<hex> [a-f0-9]+ )
|   \#  (?P<dec> \d+ )
|       (?P<word> \w+ )
)

;'''

ent_re = re.compile(ent_pat, re_flags)

38 

39 

def substituteMO(mo):
    """Return the replacement text for one matched entity reference.

    Intended for use as the replacement callback of a regular expression
    that has the named groups ``hex``, ``dec`` and ``word`` (exactly one
    of which participates in each match).

    @param mo: The match for a single entity reference
    @type mo: re.Match

    @return: The character the reference denotes, or the matched text
        unchanged when the reference cannot be resolved (unknown entity
        name, or a numeric value that is not a valid code point).
    @rtype: str
    """
    try:
        if mo.lastgroup == 'hex':
            codepoint = int(mo.group('hex'), 16)
        elif mo.lastgroup == 'dec':
            codepoint = int(mo.group('dec'))
        else:
            assert mo.lastgroup == 'word'
            codepoint = html.entities.name2codepoint.get(mo.group('word'))

        if codepoint is None:
            # Unknown entity name: leave the text as it was.
            return mo.group()

        return chr(codepoint)
    except (ValueError, OverflowError):
        # chr() rejects values above U+10FFFF (ValueError) or too large
        # for a C int (OverflowError), and int() may refuse extremely
        # long decimal strings. The input is untrusted HTML, so leave
        # such references untouched rather than aborting the parse.
        return mo.group()

53 

54 

def substituteEntities(s):
    """Replace the HTML entity references in C{s} that can be resolved.

    Every match of C{ent_re} in C{s} is passed through L{substituteMO}.

    @param s: Text that may contain entity references
    @type s: str

    @return: C{s} with each match replaced by the result of
        L{substituteMO}
    @rtype: str
    """
    return ent_re.sub(substituteMO, s)

57 

58 

class YadisHTMLParser(HTMLParser):
    """Parser that finds a meta http-equiv tag in the head of a html
    document.

    When feeding in data, if the tag is matched or it will never be
    found, the parser will raise ParseDone with the uri as the first
    attribute.

    Parsing state diagram
    =====================

    Any unlisted input does not affect the state::

                1, 2, 5                       8
               +--------------------------+  +-+
               |                          |  | |
            4  |    3       1, 2, 5, 7    v  | v
        TOP -> HTML -> HEAD ----------> TERMINATED
        | |            ^  |               ^  ^
        | | 3          |  |               |  |
        | +------------+  +-> FOUND ------+  |
        |                  6         8       |
        | 1, 2                               |
        +------------------------------------+

      1. any of </body>, </html>, </head> -> TERMINATE
      2. <body> -> TERMINATE
      3. <head> -> HEAD
      4. <html> -> HTML
      5. <html> -> TERMINATE
      6. <meta http-equiv='X-XRDS-Location'> -> FOUND
      7. <head> -> TERMINATE
      8. Any input -> TERMINATE
    """
    TOP = 0
    HTML = 1
    HEAD = 2
    FOUND = 3
    TERMINATED = 4

    def __init__(self):
        # Compare (major, minor) rather than the minor number alone so
        # that the check cannot misfire on a different major version.
        if sys.version_info[:2] <= (3, 2):
            # Python 3.2 and below actually require the `strict` argument
            # to `html.parser.HTMLParser` -- otherwise it's deprecated and
            # we don't want to pass it
            super().__init__(strict=False)
        else:
            super().__init__()
        self.phase = self.TOP

    def _terminate(self):
        """Enter the TERMINATED phase and signal that no URI was found."""
        self.phase = self.TERMINATED
        raise ParseDone(None)

    def handle_endtag(self, tag):
        # If we ever see an end of head, body, or html, bail out right away.
        # [1]
        if tag in ('head', 'body', 'html'):
            self._terminate()

    def handle_starttag(self, tag, attrs):
        # if we ever see a start body tag, bail out right away, since
        # we want to prevent the meta tag from appearing in the body
        # [2]
        if tag == 'body':
            self._terminate()

        if self.phase == self.TOP:
            # At the top level, allow a html tag or a head tag to move
            # to the head or html phase
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [4]
                self.phase = self.HTML

        elif self.phase == self.HTML:
            # if we are in the html tag, allow a head tag to move to
            # the HEAD phase. If we get another html tag, then bail
            # out
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [5]
                self._terminate()

        elif self.phase == self.HEAD:
            # If we are in the head phase, look for the appropriate
            # meta tag. If we get a head or html tag, bail out.
            if tag == 'meta':
                attrs_d = dict(attrs)
                http_equiv = attrs_d.get('http-equiv')
                raw_attr = attrs_d.get('content')
                # HTMLParser reports an attribute written without a
                # value (e.g. <meta http-equiv>) as None, and `content`
                # may be absent altogether. Such a tag cannot name a
                # location, so ignore it and keep scanning instead of
                # crashing on None.
                if http_equiv is None or raw_attr is None:
                    return

                if http_equiv.lower() == YADIS_HEADER_NAME.lower():
                    yadis_loc = substituteEntities(raw_attr)
                    # [6]
                    self.phase = self.FOUND
                    raise ParseDone(yadis_loc)

            elif tag in ('head', 'html'):
                # [5], [7]
                self._terminate()

    def feed(self, chars):
        """Feed text to the parser.

        Raises ParseDone(None) without parsing anything if a previous
        call already reached the FOUND or TERMINATED phase.
        """
        # [8]
        if self.phase in (self.TERMINATED, self.FOUND):
            self._terminate()

        return super().feed(chars)

170 

171 

def findHTMLMeta(stream):
    """Look for a meta http-equiv tag with the YADIS header name.

    The stream is read and parsed C{CHUNK_SIZE} characters at a time, so
    reading stops as soon as the tag has been found.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read. The chunks are fed to an HTML parser and joined
        with C{''}, so read() is expected to return text (str).

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    parser = YadisHTMLParser()
    chunks = []

    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            # End of file
            break

        chunks.append(chunk)
        try:
            parser.feed(chunk)
        except ParseDone as why:
            uri = why.args[0]
            if uri is None:
                # Parse finished, but we may need the rest of the file
                # so that MetaNotFound carries the complete content
                chunks.append(stream.read())
                break
            else:
                return uri

    content = ''.join(chunks)
    raise MetaNotFound(content)