Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/genshi/input.py: 54%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2009 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
14"""Support for constructing markup streams from files, strings, or other
15sources.
16"""
18from itertools import chain
19import codecs
20from xml.parsers import expat
22from genshi.compat import html_entities, html_parser, text_type, unichr, \
23 StringIO, BytesIO
24from genshi.core import Attrs, QName, Stream, stripentities
25from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
26 END_NS, START_CDATA, END_CDATA, PI, COMMENT
29__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
30__docformat__ = 'restructuredtext en'
def ET(element):
    """Turn an ElementTree element (and its subtree) into markup events.

    The element is walked depth-first.  For every element a ``START`` event
    is produced, followed by its leading text (if any), the events of all
    child elements, an ``END`` event and finally the element's tail text
    (if any).  ElementTree carries no source location, so every event uses
    the position ``(None, -1, -1)``.

    :param element: an ElementTree element
    :return: a markup stream
    """
    no_position = (None, -1, -1)

    # ElementTree spells qualified names as ``{uri}local``; dropping the
    # leading brace leaves the ``uri}local`` form handed to `QName`.
    name = QName(element.tag.lstrip('{'))
    pairs = []
    for key, val in element.items():
        pairs.append((QName(key.lstrip('{')), val))

    yield START, (name, Attrs(pairs)), no_position

    if element.text:
        yield TEXT, element.text, no_position

    for child in element:
        for event in ET(child):
            yield event

    yield END, name, no_position

    if element.tail:
        yield TEXT, element.tail, no_position
class ParseError(Exception):
    """Error signalling a fatal syntax problem in the markup being parsed."""

    def __init__(self, message, filename=None, lineno=-1, offset=-1):
        """Create the exception.

        The unmodified parser message is kept in ``msg``; the text shown
        when the exception is converted to a string additionally names the
        file, if one was given.

        :param message: the error message from the parser
        :param filename: the path to the file that was parsed
        :param lineno: the number of the line on which the error was
                       encountered
        :param offset: the column number where the error was encountered
        """
        display = message
        if filename:
            display = display + ', in ' + filename
        super(ParseError, self).__init__(display)
        self.msg = message
        # Input without a file name is reported as coming from a string.
        self.filename = filename or '<string>'
        self.lineno = lineno
        self.offset = offset
class XMLParser(object):
    """Generator-based XML parser based on roughly equivalent code in
    Kid/ElementTree.

    The parsing is initiated by iterating over the parser object:

    >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
    >>> for kind, data, pos in parser:
    ...     print('%s %s' % (kind, data))
    START (QName('root'), Attrs([(QName('id'), '2')]))
    START (QName('child'), Attrs())
    TEXT Foo
    END child
    END root
    """

    # An artificial external DTD subset declaring every named HTML entity as
    # a numeric character reference; it is handed to Expat by
    # `_build_foreign`.
    _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
                   html_entities.name2codepoint.items()]
    _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')

    def __init__(self, source, filename=None, encoding=None):
        """Initialize the parser for the given XML input.

        :param source: the XML text as a file-like object
        :param filename: the name of the file, if appropriate
        :param encoding: the encoding of the file; if not specified, the
                         encoding is assumed to be ASCII, UTF-8, or UTF-16, or
                         whatever the encoding specified in the XML declaration
                         (if any)
        """
        self.source = source
        self.filename = filename

        # Setup the Expat parser.  '}' is passed as the namespace separator.
        parser = expat.ParserCreate(encoding, '}')
        parser.buffer_text = True
        # Python 3 does not have returns_unicode
        if hasattr(parser, 'returns_unicode'):
            parser.returns_unicode = True
        parser.ordered_attributes = True

        parser.StartElementHandler = self._handle_start
        parser.EndElementHandler = self._handle_end
        parser.CharacterDataHandler = self._handle_data
        parser.StartDoctypeDeclHandler = self._handle_doctype
        parser.StartNamespaceDeclHandler = self._handle_start_ns
        parser.EndNamespaceDeclHandler = self._handle_end_ns
        parser.StartCdataSectionHandler = self._handle_start_cdata
        parser.EndCdataSectionHandler = self._handle_end_cdata
        parser.ProcessingInstructionHandler = self._handle_pi
        parser.XmlDeclHandler = self._handle_xml_decl
        parser.CommentHandler = self._handle_comment

        # Tell Expat that we'll handle non-XML entities ourselves
        # (in _handle_other)
        parser.DefaultHandler = self._handle_other
        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        parser.UseForeignDTD()
        parser.ExternalEntityRefHandler = self._build_foreign

        self.expat = parser
        # Events produced by the handlers, waiting to be yielded by parse().
        self._queue = []

    def parse(self):
        """Generator that parses the XML source, yielding markup events.

        The source is read in 4K chunks.  Each chunk is fed to Expat, whose
        handlers append events to an internal queue; the queue is drained
        after every chunk that produced events.  Adjacent text events are
        merged by the `_coalesce` filter.

        :return: a markup event stream
        :raises ParseError: if the XML text is not well formed
        """
        def _generate():
            try:
                bufsize = 4 * 1024  # 4K
                done = False
                while 1:
                    # Keep feeding input until at least one event is queued
                    # or the source is exhausted.
                    while not done and len(self._queue) == 0:
                        data = self.source.read(bufsize)
                        if not data:  # end of data
                            if hasattr(self, 'expat'):
                                self.expat.Parse('', True)
                                del self.expat  # get rid of circular references
                            done = True
                        else:
                            if isinstance(data, text_type):
                                data = data.encode('utf-8')
                            self.expat.Parse(data, False)
                    for event in self._queue:
                        yield event
                    self._queue = []
                    if done:
                        break
            except expat.ExpatError as e:
                msg = str(e)
                raise ParseError(msg, self.filename, e.lineno, e.offset)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        """Iterate over the events produced by `parse`."""
        return iter(self.parse())

    def _build_foreign(self, context, base, sysid, pubid):
        """External entity handler: parse the generated HTML entity
        declarations in place of the external DTD subset.
        """
        parser = self.expat.ExternalEntityParserCreate(context)
        parser.ParseFile(BytesIO(self._external_dtd))
        return 1

    def _enqueue(self, kind, data=None, pos=None):
        """Append an event to the queue.

        :param kind: the event kind
        :param data: the event payload
        :param pos: the ``(filename, lineno, offset)`` of the event; taken
                    from the current Expat position when omitted
        """
        if pos is None:
            pos = self._getpos()
        if kind is TEXT:
            # Expat reports the *end* of the text event as current position. We
            # try to fix that up here as much as possible. Unfortunately, the
            # offset is only valid for single-line text. For multi-line text,
            # it is apparently not possible to determine at what offset it
            # started
            #
            # The start line is the end line minus the number of newlines in
            # the text.  Count the '\n' characters directly rather than using
            # len(data.splitlines()): splitlines() drops a trailing newline
            # (giving a start line that is one too high for text ending in
            # '\n') and also splits on other characters such as U+2028.
            newlines = data.count('\n')
            if newlines:
                lineno = pos[1] - newlines
                offset = -1
            else:
                lineno = pos[1]
                offset = pos[2] - len(data)
            pos = (pos[0], lineno, offset)
        self._queue.append((kind, data, pos))

    def _getpos_unknown(self):
        """Return a position tuple with unknown line and column."""
        return (self.filename, -1, -1)

    def _getpos(self):
        """Return the current Expat position as
        ``(filename, lineno, column)``.
        """
        return (self.filename, self.expat.CurrentLineNumber,
                self.expat.CurrentColumnNumber)

    def _handle_start(self, tag, attrib):
        # With ordered_attributes enabled, `attrib` is a flat list of
        # alternating names and values; zipping one iterator with itself
        # regroups it into (name, value) pairs.
        attrs = Attrs([(QName(name), value) for name, value in
                       zip(*[iter(attrib)] * 2)])
        self._enqueue(START, (QName(tag), attrs))

    def _handle_end(self, tag):
        self._enqueue(END, QName(tag))

    def _handle_data(self, text):
        self._enqueue(TEXT, text)

    def _handle_xml_decl(self, version, encoding, standalone):
        self._enqueue(XML_DECL, (version, encoding, standalone))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        # Note the order of the payload: public ID before system ID.
        self._enqueue(DOCTYPE, (name, pubid, sysid))

    def _handle_start_ns(self, prefix, uri):
        self._enqueue(START_NS, (prefix or '', uri))

    def _handle_end_ns(self, prefix):
        self._enqueue(END_NS, prefix or '')

    def _handle_start_cdata(self):
        self._enqueue(START_CDATA)

    def _handle_end_cdata(self):
        self._enqueue(END_CDATA)

    def _handle_pi(self, target, data):
        self._enqueue(PI, (target, data))

    def _handle_comment(self, text):
        self._enqueue(COMMENT, text)

    def _handle_other(self, text):
        """Default handler: resolve named HTML entities that Expat did not
        expand, and raise an Expat error for unknown ones.
        """
        if text.startswith('&'):
            # deal with undefined entities
            try:
                # text[1:-1] drops the first and last character of the
                # reference (the '&' and, presumably, the closing ';').
                text = unichr(html_entities.name2codepoint[text[1:-1]])
                self._enqueue(TEXT, text)
            except KeyError:
                filename, lineno, offset = self._getpos()
                error = expat.error('undefined entity "%s": line %d, column %d'
                                    % (text, lineno, offset))
                error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
                error.lineno = lineno
                error.offset = offset
                raise error
def XML(text):
    """Parse the given XML source and return a markup stream.

    All events are collected into a list before the stream is built, so,
    unlike with `XMLParser`, the returned stream is reusable, meaning it can
    be iterated over multiple times:

    >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
    >>> print(xml)
    <doc><elem>Foo</elem><elem>Bar</elem></doc>
    >>> print(xml.select('elem'))
    <elem>Foo</elem><elem>Bar</elem>
    >>> print(xml.select('elem/text()'))
    FooBar

    :param text: the XML source
    :return: the parsed XML event stream
    :raises ParseError: if the XML text is not well-formed
    """
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
class HTMLParser(html_parser.HTMLParser, object):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.

    The parsing is initiated by iterating over the parser object:

    >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
    >>> for kind, data, pos in parser:
    ...     print('%s %s' % (kind, data))
    START (QName('ul'), Attrs([(QName('compact'), 'compact')]))
    START (QName('li'), Attrs())
    TEXT Foo
    END li
    END ul
    """

    # Elements for which an END event is emitted right after the START event
    # and whose end tags are ignored.
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source, filename=None, encoding=None):
        """Initialize the parser for the given HTML input.

        :param source: the HTML text as a file-like object
        :param filename: the name of the file, if known
        :param encoding: encoding of the file; ignored if the input is unicode
        """
        html_parser.HTMLParser.__init__(self)
        self.source = source
        self.filename = filename
        self.encoding = encoding
        # Events waiting to be yielded by parse().
        self._queue = []
        # Names of the non-empty elements that are currently open.
        self._open_tags = []

    def parse(self):
        """Generator that parses the HTML source, yielding markup events.

        The source is read in 4K chunks, wrapped in a decoding stream reader
        when an encoding was given.  Once the input is exhausted, an ``END``
        event is emitted for every element that is still open.

        :return: a markup event stream
        :raises ParseError: if the HTML text is not well formed
        """
        def _generate():
            source = self.source
            if self.encoding:
                source = codecs.getreader(self.encoding)(source)
            try:
                chunk_size = 4 * 1024  # 4K
                finished = False
                while True:
                    # Feed input until an event is queued or input runs out.
                    while not (finished or self._queue):
                        chunk = source.read(chunk_size)
                        if chunk:
                            if not isinstance(chunk, text_type):
                                raise UnicodeError("source returned bytes, but no encoding specified")
                            self.feed(chunk)
                        else:  # end of data
                            self.close()
                            finished = True
                    for kind, data, pos in self._queue:
                        yield kind, data, pos
                    self._queue = []
                    if finished:
                        # Close whatever is still open, innermost first.  The
                        # position of the last yielded event is reused.
                        unclosed = self._open_tags
                        unclosed.reverse()
                        for tag in unclosed:
                            yield END, QName(tag), pos
                        break
            except Exception as e:
                # Python's simple HTMLParser does not raise detailed
                # errors except in strict mode which was deprecated
                # in Python 3.3 and removed in Python 3.5 and which in
                # any case is not used in this code.
                raise ParseError(str(e), self.filename)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        """Iterate over the events produced by `parse`."""
        return iter(self.parse())

    def _enqueue(self, kind, data, pos=None):
        """Queue an event, using the current parser position by default."""
        where = self._getpos() if pos is None else pos
        self._queue.append((kind, data, where))

    def _getpos(self):
        """Return the current position as ``(filename, lineno, column)``."""
        line, col = self.getpos()
        return (self.filename, line, col)

    def handle_starttag(self, tag, attrib):
        # Minimized attributes (no value) get their own name as the value.
        attrs = Attrs([
            (QName(name), stripentities(name if value is None else value))
            for name, value in attrib
        ])
        self._enqueue(START, (QName(tag), attrs))
        if tag in self._EMPTY_ELEMS:
            self._enqueue(END, QName(tag))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        if tag in self._EMPTY_ELEMS:
            # The END event was already emitted together with the START.
            return
        wanted = tag.lower()
        # Close open elements, innermost first, until the matching one has
        # been closed or nothing is left open.
        while self._open_tags:
            current = self._open_tags.pop()
            self._enqueue(END, QName(current))
            if current.lower() == wanted:
                break

    def handle_data(self, text):
        self._enqueue(TEXT, text)

    def handle_charref(self, name):
        # A leading 'x' or 'X' marks a hexadecimal character reference.
        if name.lower().startswith('x'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        self._enqueue(TEXT, unichr(codepoint))

    def handle_entityref(self, name):
        codepoint = html_entities.name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity: pass the reference through literally.
            text = '&%s;' % name
        else:
            text = unichr(codepoint)
        self._enqueue(TEXT, text)

    def handle_pi(self, data):
        if data.endswith('?'):
            data = data[:-1]
        parts = data.split(None, 1)
        if len(parts) == 2:
            target, body = parts
        else:
            # PI with no data
            target, body = data, ''
        self._enqueue(PI, (target.strip(), body.strip()))

    def handle_comment(self, text):
        self._enqueue(COMMENT, text)
def HTML(text, encoding=None):
    """Parse the given HTML source and return a markup stream.

    All events are collected into a list before the stream is built, so,
    unlike with `HTMLParser`, the returned stream is reusable, meaning it
    can be iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    >>> print(html)
    <body><h1>Foo</h1></body>
    >>> print(html.select('h1'))
    <h1>Foo</h1>
    >>> print(html.select('h1/text()'))
    Foo

    :param text: the HTML source
    :param encoding: the encoding used to decode `text` when it is not
                     unicode text; ignored for unicode text
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    if isinstance(text, text_type):
        # If it's unicode text the encoding should be set to None.
        # The option to pass in an incorrect encoding is for ease
        # of writing doctests that work in both Python 2.x and 3.x.
        source, encoding = StringIO(text), None
    else:
        source = BytesIO(text)
    return Stream(list(HTMLParser(source, encoding=encoding)))
def _coalesce(stream):
    """Coalesces adjacent TEXT events into a single event.

    The merged event carries the position of the first TEXT event of the
    run.  All other events are passed through unchanged.
    """
    pieces = []
    first_pos = None
    # The trailing sentinel (kind None) flushes text pending at the end of
    # the stream; it is never yielded itself.
    sentinel = (None, None, None)
    for kind, data, pos in chain(stream, [sentinel]):
        if kind is TEXT:
            if first_pos is None:
                first_pos = pos
            pieces.append(data)
            continue
        if pieces:
            yield TEXT, ''.join(pieces), first_pos
            pieces = []
            first_pos = None
        if kind:
            yield kind, data, pos