Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/openid/yadis/parsehtml.py: 62%

1__all__ = ['findHTMLMeta', 'MetaNotFound']

3from html.parser import HTMLParser

4import html.entities

5import re

6import sys

8from openid.yadis.constants import YADIS_HEADER_NAME

# How much of the document is requested from the stream per read; each
# piece read is also the unit that is searched in one go.
CHUNK_SIZE = 16 * 1024  # 16 KB

class ParseDone(Exception):
    """Signal that parsing has finished.

    The first exception argument carries the URI that was located, or
    None when the parse ended without locating one.
    """

class MetaNotFound(Exception):
    """Signal that the appropriate <meta> tag was not found.

    The first exception argument carries the content of the page that
    was searched.
    """

# Flags shared by the entity pattern: case-insensitive, Unicode-aware and
# verbose (whitespace inside the pattern is not significant).
re_flags = re.VERBOSE | re.UNICODE | re.IGNORECASE

# One character/entity reference: "&", then a hexadecimal number
# (group "hex"), a decimal number (group "dec") or a name (group
# "word"), then ";".
ent_pat = r'''
&

(?: \#x (?P<hex> [a-f0-9]+ )
| \# (?P<dec> \d+ )
| (?P<word> \w+ )
)

;'''

ent_re = re.compile(pattern=ent_pat, flags=re_flags)

def substituteMO(mo):
    """Return the replacement text for a single entity match.

    ``mo`` is a match object whose last matched group is one of ``hex``
    (hexadecimal character reference), ``dec`` (decimal character
    reference) or ``word`` (named entity).

    Returns the character the reference stands for.  The matched text is
    returned unchanged when the reference cannot be resolved: an unknown
    entity name, or a number that is not a valid code point.
    """
    kind = mo.lastgroup
    try:
        if kind == 'hex':
            codepoint = int(mo.group('hex'), 16)
        elif kind == 'dec':
            # int() can reject extremely long decimal strings with
            # ValueError on newer Python versions; handled below.
            codepoint = int(mo.group('dec'))
        else:
            assert kind == 'word'
            codepoint = html.entities.name2codepoint.get(mo.group('word'))

        if codepoint is None:
            # Unknown entity name: leave the text as it was written.
            return mo.group()

        # chr() raises ValueError (or OverflowError for huge values) when
        # the number lies outside the Unicode range.
        return chr(codepoint)
    except (ValueError, OverflowError):
        # The input is untrusted HTML; a bogus numeric reference must not
        # abort the whole parse, so keep the original text instead.
        return mo.group()

def substituteEntities(s):
    """Return ``s`` with each entity matched by ``ent_re`` replaced.

    The replacement text for every match is produced by substituteMO.
    """
    decoded = ent_re.sub(substituteMO, s)
    return decoded

class YadisHTMLParser(HTMLParser):
    """Parser that finds a meta http-equiv tag in the head of a html
    document.

    While data is being fed in, the parser raises ParseDone as soon as
    the outcome is known: with the URI as the first argument when the
    tag is matched, or with None when it can no longer be found.

    Parsing state diagram
    =====================

    Any unlisted input does not affect the state::

                1, 2, 5                       8
               +--------------------------+  +-+
               |                          |  | |
            4  |    3       1, 2, 5, 7    v  | v
        TOP -> HTML -> HEAD ----------> TERMINATED
        | |            ^  |               ^  ^
        | | 3          |  |               |  |
        | +------------+  +-> FOUND ------+  |
        |                  6         8       |
        | 1, 2                               |
        +------------------------------------+

    1. any of </body>, </html>, </head> -> TERMINATE
    2. <body> -> TERMINATE
    3. <head> -> HEAD
    4. <html> -> HTML
    5. <html> -> TERMINATE
    6. <meta http-equiv='X-XRDS-Location'> -> FOUND
    7. <head> -> TERMINATE
    8. Any input -> TERMINATE
    """
    # Parser phases (the states of the diagram above).
    TOP = 0
    HTML = 1
    HEAD = 2
    FOUND = 3
    TERMINATED = 4

    def __init__(self):
        """Create a parser positioned in the TOP phase."""
        if sys.version_info < (3, 3):
            # Python 3.2 and below actually require the `strict` argument
            # to `html.parser.HTMLParser` -- otherwise it's deprecated and
            # we don't want to pass it.  The full version tuple is
            # compared so that only the major/minor pair decides, not the
            # minor number on its own.
            super().__init__(strict=False)
        else:
            super().__init__()
        self.phase = self.TOP

    def _terminate(self):
        """Enter the TERMINATED phase and raise ParseDone(None)."""
        self.phase = self.TERMINATED
        raise ParseDone(None)

    def handle_endtag(self, tag):
        """Stop parsing at the end of head, body or html.  [1]"""
        if tag in ('head', 'body', 'html'):
            self._terminate()

    def handle_starttag(self, tag, attrs):
        """Advance the state machine for one start tag.

        ``tag`` is the tag name and ``attrs`` the list of (name, value)
        pairs handed over by HTMLParser.

        Raises ParseDone with the located URI when the Yadis meta tag is
        found in the head, or ParseDone(None) when the tag makes finding
        it impossible.
        """
        # A start body tag ends the search right away, so that a meta
        # tag appearing in the body is never accepted.  [2]
        if tag == 'body':
            self._terminate()

        if self.phase == self.TOP:
            # At the top level a html tag or a head tag moves to the
            # corresponding phase.
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [4]
                self.phase = self.HTML

        elif self.phase == self.HTML:
            # Inside html, a head tag moves to the HEAD phase; a second
            # html tag ends the search.
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [5]
                self._terminate()

        elif self.phase == self.HEAD:
            # Inside head, look for the appropriate meta tag; another
            # head or html tag ends the search.
            if tag == 'meta':
                attrs_d = dict(attrs)

                # The value is None for a valueless attribute
                # (<meta http-equiv>) and absent when the attribute is
                # missing; neither can name the Yadis header, and calling
                # .lower() on None would raise AttributeError.
                http_equiv = attrs_d.get('http-equiv')
                if not http_equiv:
                    return
                if http_equiv.lower() != YADIS_HEADER_NAME.lower():
                    return

                raw_attr = attrs_d.get('content')
                if raw_attr is None:
                    # No content value means there is no location to
                    # report; skip this tag and keep looking rather than
                    # failing with TypeError on None.
                    return

                # NOTE(review): HTMLParser appears to have already
                # replaced references in attribute values, so this may
                # decode a second time -- confirm before changing.
                yadis_loc = substituteEntities(raw_attr)
                # [6]
                self.phase = self.FOUND
                raise ParseDone(yadis_loc)

            elif tag in ('head', 'html'):
                # [5], [7]
                self._terminate()

    def feed(self, chars):
        """Feed ``chars`` to the parser.

        Once the parser is in the FOUND or TERMINATED phase, any further
        input raises ParseDone(None).  [8]
        """
        if self.phase in (self.TERMINATED, self.FOUND):
            self._terminate()

        return super().feed(chars)

170

171

def findHTMLMeta(stream):
    """Search html text for the meta http-equiv tag carrying the YADIS
    header name.

    @param stream: Where the html text is read from
    @type stream: Object with a read() method that behaves like
        file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    yadis_parser = YadisHTMLParser()
    pieces = []

    while True:
        piece = stream.read(CHUNK_SIZE)
        if not piece:
            # Nothing left to read
            break

        pieces.append(piece)
        try:
            yadis_parser.feed(piece)
        except ParseDone as done:
            location = done.args[0]
            if location is not None:
                return location

            # The parser gave up without a URI; collect the remainder of
            # the stream so the exception carries the whole content.
            pieces.append(stream.read())
            break

    raise MetaNotFound(''.join(pieces))