# Names exported by ``from <this module> import *``.
__all__ = ['findHTMLMeta', 'MetaNotFound']
2
3from html.parser import HTMLParser
4import html.entities
5import re
6import sys
7
8from openid.yadis.constants import YADIS_HEADER_NAME
9
# Number of characters/bytes requested from the stream per read() call;
# each chunk that is read is also the unit handed to the parser in one
# feed() call.
CHUNK_SIZE = 1024 * 16  # 16 KB
13
14
class ParseDone(Exception):
    """Signal that parsing has finished.

    The first exception argument carries the URI that was located, or
    None when the parse ended without finding one.
    """
19
20
class MetaNotFound(Exception):
    """Raised when the appropriate <meta> tag was not found.

    The first exception argument carries the content of the page that
    was searched.
    """
24
25
# Flags shared by the entity pattern below.  VERBOSE makes the whitespace
# and line breaks inside ``ent_pat`` insignificant; IGNORECASE lets the
# ``x`` marker and the hex digits match in either case.
re_flags = re.IGNORECASE | re.UNICODE | re.VERBOSE

# Pattern for one character/entity reference: an ampersand, then exactly
# one of a hexadecimal reference (group ``hex``), a decimal reference
# (group ``dec``) or a named reference (group ``word``), then a semicolon.
ent_pat = r'''
&

(?: \#x (?P<hex> [a-f0-9]+ )
| \# (?P<dec> \d+ )
| (?P<word> \w+ )
)

;'''

# Compiled once at import time; used by substituteEntities().
ent_re = re.compile(ent_pat, re_flags)
38
39
def substituteMO(mo):
    """Return the replacement text for one matched entity reference.

    @param mo: A match object whose last matched named group is one of
        ``hex`` (hexadecimal digits), ``dec`` (decimal digits) or
        ``word`` (an entity name).

    @return: The single character the reference stands for.  If the
        name is not in C{html.entities.name2codepoint}, or the number
        cannot be turned into a character, the matched text is returned
        unchanged.
    @rtype: str
    """
    try:
        if mo.lastgroup == 'hex':
            codepoint = int(mo.group('hex'), 16)
        elif mo.lastgroup == 'dec':
            codepoint = int(mo.group('dec'))
        else:
            assert mo.lastgroup == 'word'
            codepoint = html.entities.name2codepoint.get(mo.group('word'))

        if codepoint is None:
            # Unknown entity name: leave the text as it was written.
            return mo.group()

        return chr(codepoint)
    except (ValueError, OverflowError):
        # The reference comes from untrusted markup.  chr() rejects
        # values outside the Unicode range (ValueError) or too large for
        # a C int (OverflowError), and int() may refuse extremely long
        # digit strings.  Treat all of these as "not a valid reference"
        # instead of aborting the whole parse.
        return mo.group()
53
54
def substituteEntities(s):
    """Replace every entity reference in C{s} that C{ent_re} matches.

    @param s: The text to process.

    @return: C{s} with each match replaced by the result of
        C{substituteMO}.
    """
    decoded = ent_re.sub(substituteMO, s)
    return decoded
57
58
class YadisHTMLParser(HTMLParser):
    """Parser that finds a meta http-equiv tag in the head of a html
    document.

    When feeding in data, if the tag is matched or it will never be
    found, the parser will raise ParseDone with the uri (or None) as
    its first argument.

    Parsing state diagram
    =====================

    Any unlisted input does not affect the state::

                1, 2, 5                       8
               +--------------------------+  +-+
               |                          |  | |
            4  |    3       1, 2, 5, 7    v  | v
        TOP -> HTML -> HEAD ----------> TERMINATED
        | |            ^  |               ^  ^
        | | 3          |  |               |  |
        | +------------+  +-> FOUND ------+  |
        |                  6         8       |
        | 1, 2                               |
        +------------------------------------+

      1. any of </body>, </html>, </head> -> TERMINATE
      2. <body> -> TERMINATE
      3. <head> -> HEAD
      4. <html> -> HTML
      5. <html> -> TERMINATE
      6. <meta http-equiv='X-XRDS-Location'> -> FOUND
      7. <head> -> TERMINATE
      8. Any input -> TERMINATE
    """
    # Parser phases; see the state diagram in the class docstring.
    TOP = 0
    HTML = 1
    HEAD = 2
    FOUND = 3
    TERMINATED = 4

    def __init__(self):
        """Create a parser positioned in the TOP phase."""
        # Compare the whole version tuple rather than only the minor
        # number, so the legacy branch cannot be taken by an unrelated
        # major version that happens to have a small minor number.
        if sys.version_info < (3, 3):
            # Python 3.2 and below actually require the `strict` argument
            # to `html.parser.HTMLParser` -- otherwise it's deprecated and
            # we don't want to pass it
            super().__init__(strict=False)
        else:
            super().__init__()
        self.phase = self.TOP

    def _terminate(self):
        """Enter the TERMINATED phase and raise ParseDone(None)."""
        self.phase = self.TERMINATED
        raise ParseDone(None)

    @staticmethod
    def _yadis_location(attrs):
        """Return the Yadis location carried by a <meta> tag, or None.

        @param attrs: The (name, value) pairs of the tag as passed to
            handle_starttag.  A value is None when the attribute was
            written without one.

        @return: The entity-decoded ``content`` value when ``http-equiv``
            names the Yadis header (compared case-insensitively) and a
            ``content`` value is present; otherwise None.
        """
        attrs_d = dict(attrs)

        http_equiv = attrs_d.get('http-equiv')
        if http_equiv is None:
            # Attribute missing, or present without a value.
            return None
        if http_equiv.lower() != YADIS_HEADER_NAME.lower():
            return None

        raw_attr = attrs_d.get('content')
        if raw_attr is None:
            # A matching tag with no usable content carries no location;
            # ignore it instead of failing on None.
            return None

        return substituteEntities(raw_attr)

    def handle_endtag(self, tag):
        """Terminate the parse on </head>, </body> or </html>."""
        # If we ever see an end of head, body, or html, bail out right away.
        # [1]
        if tag in ('head', 'body', 'html'):
            self._terminate()

    def handle_starttag(self, tag, attrs):
        """Advance the state machine for one start tag.

        @raises ParseDone: with the location when the Yadis meta tag is
            found in the head, or with None when the tag sequence means
            it can no longer be found.
        """
        # if we ever see a start body tag, bail out right away, since
        # we want to prevent the meta tag from appearing in the body
        # [2]
        if tag == 'body':
            self._terminate()

        if self.phase == self.TOP:
            # At the top level, allow a html tag or a head tag to move
            # to the head or html phase
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [4]
                self.phase = self.HTML

        elif self.phase == self.HTML:
            # if we are in the html tag, allow a head tag to move to
            # the HEAD phase. If we get another html tag, then bail
            # out
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [5]
                self._terminate()

        elif self.phase == self.HEAD:
            # If we are in the head phase, look for the appropriate
            # meta tag. If we get a head or html tag, bail out.
            if tag == 'meta':
                yadis_loc = self._yadis_location(attrs)
                # An empty content value is still reported; only a
                # non-matching or content-less tag is skipped.
                if yadis_loc is not None:
                    # [6]
                    self.phase = self.FOUND
                    raise ParseDone(yadis_loc)

            elif tag in ('head', 'html'):
                # [5], [7]
                self._terminate()

    def feed(self, chars):
        """Feed more text to the parser.

        @raises ParseDone: with None if the parser had already reached
            the FOUND or TERMINATED phase, otherwise as raised by the
            tag handlers while processing C{chars}.
        """
        # [8]
        if self.phase in (self.TERMINATED, self.FOUND):
            self._terminate()

        return super().feed(chars)
170
171
def findHTMLMeta(stream):
    """Search an HTML document for the YADIS meta http-equiv tag.

    The stream is read CHUNK_SIZE at a time and each chunk is fed to a
    YadisHTMLParser until the parser reports a result or the stream is
    exhausted.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    html_parser = YadisHTMLParser()
    pieces = []

    while True:
        piece = stream.read(CHUNK_SIZE)
        if not piece:
            # Nothing left to read.
            break

        pieces.append(piece)
        try:
            html_parser.feed(piece)
        except ParseDone as done:
            location = done.args[0]
            if location is not None:
                return location

            # The parser gave up without a result; collect the rest of
            # the document so the exception carries the whole content.
            pieces.append(stream.read())
            break

    raise MetaNotFound(''.join(pieces))