Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/genshi/input.py: 54%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

251 statements  

1# -*- coding: utf-8 -*- 

2# 

3# Copyright (C) 2006-2009 Edgewall Software 

4# All rights reserved. 

5# 

6# This software is licensed as described in the file COPYING, which 

7# you should have received as part of this distribution. The terms 

8# are also available at http://genshi.edgewall.org/wiki/License. 

9# 

10# This software consists of voluntary contributions made by many 

11# individuals. For the exact contribution history, see the revision 

12# history and logs, available at http://genshi.edgewall.org/log/. 

13 

14"""Support for constructing markup streams from files, strings, or other 

15sources. 

16""" 

17 

18from itertools import chain 

19import codecs 

20from xml.parsers import expat 

21 

22from genshi.compat import html_entities, html_parser, text_type, unichr, \ 

23 StringIO, BytesIO 

24from genshi.core import Attrs, QName, Stream, stripentities 

25from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \ 

26 END_NS, START_CDATA, END_CDATA, PI, COMMENT 

27 

28 

29__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] 

30__docformat__ = 'restructuredtext en' 

31 

32 

def ET(element):
    """Turn an ElementTree element (and its subtree) into markup events.

    The element is walked depth-first: a START event, the element's text
    (if any), the events of every child in document order, an END event and
    finally the element's tail text (if any). ElementTree carries no source
    location, so every event is reported at ``(None, -1, -1)``.

    :param element: an ElementTree element
    :return: a markup stream
    """
    # No file name, line or column is known for ElementTree input.
    nowhere = (None, -1, -1)

    # Drop the leading ``{`` of ElementTree's ``{namespace}local`` notation
    # before handing the name to QName.
    name = QName(element.tag.lstrip('{'))
    attributes = Attrs([
        (QName(key.lstrip('{')), val) for key, val in element.items()
    ])

    yield START, (name, attributes), nowhere

    text = element.text
    if text:
        yield TEXT, text, nowhere

    for child in element:
        for event in ET(child):
            yield event

    yield END, name, nowhere

    tail = element.tail
    if tail:
        yield TEXT, tail, nowhere

52 

53 

class ParseError(Exception):
    """Exception raised when fatal syntax errors are found in the input being
    parsed.

    Besides the usual exception message, instances expose:

    * ``msg`` -- the bare parser message, without the file name suffix
    * ``filename`` -- the file that was parsed, or ``'<string>'`` if unknown
    * ``lineno`` / ``offset`` -- line and column of the error, ``-1`` if
      unknown
    """

    def __init__(self, message, filename=None, lineno=-1, offset=-1):
        """Exception initializer.

        :param message: the error message from the parser
        :param filename: the path to the file that was parsed
        :param lineno: the number of the line on which the error was
                       encountered
        :param offset: the column number where the error was encountered
        """
        self.msg = message
        if filename:
            # Use %-formatting rather than ``+`` so that a filename that is
            # not a plain string (for example a pathlib path) cannot make
            # the constructor itself fail with a TypeError and thereby hide
            # the parse error being reported.
            message = '%s, in %s' % (message, filename)
        Exception.__init__(self, message)
        self.filename = filename or '<string>'
        self.lineno = lineno
        self.offset = offset

74 

75 

class XMLParser(object):
    """Generator-based XML parser based on roughly equivalent code in
    Kid/ElementTree.

    The parsing is initiated by iterating over the parser object:

    >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
    >>> for kind, data, pos in parser:
    ...     print('%s %s' % (kind, data))
    START (QName('root'), Attrs([(QName('id'), '2')]))
    START (QName('child'), Attrs())
    TEXT Foo
    END child
    END root
    """

    # One <!ENTITY> declaration per HTML named entity; joined and encoded,
    # they form the DTD that `_build_foreign` feeds to expat.
    _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
                   html_entities.name2codepoint.items()]
    _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')

    def __init__(self, source, filename=None, encoding=None):
        """Initialize the parser for the given XML input.

        :param source: the XML text as a file-like object
        :param filename: the name of the file, if appropriate
        :param encoding: the encoding of the file; if not specified, the
                         encoding is assumed to be ASCII, UTF-8, or UTF-16, or
                         whatever the encoding specified in the XML declaration
                         (if any)
        """
        self.source = source
        self.filename = filename

        # Setup the Expat parser; '}' separates namespace URI and local name
        parser = expat.ParserCreate(encoding, '}')
        parser.buffer_text = True
        # Python 3 does not have returns_unicode
        if hasattr(parser, 'returns_unicode'):
            parser.returns_unicode = True
        parser.ordered_attributes = True

        parser.StartElementHandler = self._handle_start
        parser.EndElementHandler = self._handle_end
        parser.CharacterDataHandler = self._handle_data
        parser.StartDoctypeDeclHandler = self._handle_doctype
        parser.StartNamespaceDeclHandler = self._handle_start_ns
        parser.EndNamespaceDeclHandler = self._handle_end_ns
        parser.StartCdataSectionHandler = self._handle_start_cdata
        parser.EndCdataSectionHandler = self._handle_end_cdata
        parser.ProcessingInstructionHandler = self._handle_pi
        parser.XmlDeclHandler = self._handle_xml_decl
        parser.CommentHandler = self._handle_comment

        # Tell Expat that we'll handle non-XML entities ourselves
        # (in _handle_other)
        parser.DefaultHandler = self._handle_other
        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        parser.UseForeignDTD()
        parser.ExternalEntityRefHandler = self._build_foreign

        self.expat = parser
        # Events produced by the expat callbacks, drained by `parse`
        self._queue = []

    def parse(self):
        """Generator that parses the XML source, yielding markup events.

        The source is read in 4K chunks; text chunks are encoded as UTF-8
        before being handed to expat, byte chunks are passed through as-is.

        :return: a markup event stream
        :raises ParseError: if the XML text is not well formed
        """
        def _generate():
            try:
                bufsize = 4 * 1024  # 4K
                done = False
                while True:
                    # Keep feeding expat until its callbacks have queued at
                    # least one event or the input is exhausted
                    while not done and not self._queue:
                        data = self.source.read(bufsize)
                        if not data:  # end of data
                            if hasattr(self, 'expat'):
                                self.expat.Parse('', True)
                                # get rid of circular references
                                del self.expat
                            done = True
                        else:
                            if isinstance(data, text_type):
                                data = data.encode('utf-8')
                            self.expat.Parse(data, False)
                    for event in self._queue:
                        yield event
                    self._queue = []
                    if done:
                        break
            except expat.ExpatError as e:
                msg = str(e)
                raise ParseError(msg, self.filename, e.lineno, e.offset)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        return iter(self.parse())

    def _build_foreign(self, context, base, sysid, pubid):
        """External entity handler: parse the built-in HTML entity
        declarations in place of the referenced external entity.

        Always returns 1 (a zero return value would make expat report an
        external entity handling error).
        """
        parser = self.expat.ExternalEntityParserCreate(context)
        parser.ParseFile(BytesIO(self._external_dtd))
        return 1

    def _enqueue(self, kind, data=None, pos=None):
        """Append an event to the queue, defaulting `pos` to the parser's
        current position and correcting it for TEXT events.
        """
        if pos is None:
            pos = self._getpos()
        if kind is TEXT:
            # Expat reports the *end* of the text event as current position. We
            # try to fix that up here as much as possible. Unfortunately, the
            # offset is only valid for single-line text. For multi-line text,
            # it is apparently not possible to determine at what offset it
            # started
            newlines = data.count('\n')
            if newlines:
                # The text started as many lines above the reported position
                # as it contains line breaks. Counting '\n' (instead of the
                # number of items returned by splitlines()) stays correct
                # when the text ends with a newline and is not thrown off by
                # the additional separators splitlines() recognizes.
                lineno = pos[1] - newlines
                offset = -1
            else:
                lineno = pos[1]
                offset = pos[2] - len(data)
            pos = (pos[0], lineno, offset)
        self._queue.append((kind, data, pos))

    def _getpos_unknown(self):
        """Return a position tuple with unknown line and column."""
        return (self.filename, -1, -1)

    def _getpos(self):
        """Return ``(filename, line, column)`` as currently reported by
        expat.
        """
        return (self.filename, self.expat.CurrentLineNumber,
                self.expat.CurrentColumnNumber)

    def _handle_start(self, tag, attrib):
        # With ordered_attributes enabled, expat passes the attributes as a
        # flat [name, value, name, value, ...] list; pair the items up again
        attrs = Attrs([(QName(name), value) for name, value in
                       zip(*[iter(attrib)] * 2)])
        self._enqueue(START, (QName(tag), attrs))

    def _handle_end(self, tag):
        self._enqueue(END, QName(tag))

    def _handle_data(self, text):
        self._enqueue(TEXT, text)

    def _handle_xml_decl(self, version, encoding, standalone):
        self._enqueue(XML_DECL, (version, encoding, standalone))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        # Note the order: the event carries (name, pubid, sysid)
        self._enqueue(DOCTYPE, (name, pubid, sysid))

    def _handle_start_ns(self, prefix, uri):
        self._enqueue(START_NS, (prefix or '', uri))

    def _handle_end_ns(self, prefix):
        self._enqueue(END_NS, prefix or '')

    def _handle_start_cdata(self):
        self._enqueue(START_CDATA)

    def _handle_end_cdata(self):
        self._enqueue(END_CDATA)

    def _handle_pi(self, target, data):
        self._enqueue(PI, (target, data))

    def _handle_comment(self, text):
        self._enqueue(COMMENT, text)

    def _handle_other(self, text):
        """Default expat handler: resolve entity references that expat left
        alone against the HTML entity table.

        :raises expat.error: if the entity name is not a known HTML entity
        """
        if text.startswith('&'):
            # deal with undefined entities
            try:
                # text[1:-1] strips the leading '&' and the trailing ';'
                text = unichr(html_entities.name2codepoint[text[1:-1]])
                self._enqueue(TEXT, text)
            except KeyError:
                filename, lineno, offset = self._getpos()
                error = expat.error('undefined entity "%s": line %d, column %d'
                                    % (text, lineno, offset))
                error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
                error.lineno = lineno
                error.offset = offset
                raise error

254 

255 

def XML(text):
    """Parse the given XML source and return a markup stream.

    Unlike with `XMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
    >>> print(xml)
    <doc><elem>Foo</elem><elem>Bar</elem></doc>
    >>> print(xml.select('elem'))
    <elem>Foo</elem><elem>Bar</elem>
    >>> print(xml.select('elem/text()'))
    FooBar

    :param text: the XML source, as either a text or a byte string
    :return: the parsed XML event stream
    :raises ParseError: if the XML text is not well-formed
    """
    if isinstance(text, text_type):
        source = StringIO(text)
    else:
        # Byte strings cannot be wrapped in a text buffer on Python 3;
        # XMLParser passes bytes read from its source straight to expat,
        # so hand them over undecoded.
        source = BytesIO(text)
    return Stream(list(XMLParser(source)))

275 

276 

class HTMLParser(html_parser.HTMLParser, object):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.

    The parsing is initiated by iterating over the parser object:

    >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
    >>> for kind, data, pos in parser:
    ...     print('%s %s' % (kind, data))
    START (QName('ul'), Attrs([(QName('compact'), 'compact')]))
    START (QName('li'), Attrs())
    TEXT Foo
    END li
    END ul
    """

    # Elements that never have content: their END event is emitted right
    # after the START event and explicit end tags for them are ignored.
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source, filename=None, encoding=None):
        """Initialize the parser for the given HTML input.

        :param source: the HTML text as a file-like object
        :param filename: the name of the file, if known
        :param encoding: encoding of the file; ignored if the input is unicode
        """
        html_parser.HTMLParser.__init__(self)
        self.source = source
        self.filename = filename
        self.encoding = encoding
        # Events produced by the handle_* callbacks, drained by `parse`
        self._queue = []
        # Stack of elements that were opened but not closed yet
        self._open_tags = []

    def parse(self):
        """Generator that parses the HTML source, yielding markup events.

        Elements still open when the input ends are closed automatically,
        innermost first.

        :return: a markup event stream
        :raises ParseError: if the HTML text is not well formed
        """
        def _generate():
            if self.encoding:
                reader = codecs.getreader(self.encoding)
                source = reader(self.source)
            else:
                source = self.source
            # Position of the most recently yielded event; reused for the
            # END events synthesized for unclosed elements. Initialized so
            # that it is always bound, even if no event was yielded.
            pos = (self.filename, -1, -1)
            try:
                bufsize = 4 * 1024  # 4K
                done = False
                while True:
                    # Keep feeding the parser until its callbacks have queued
                    # at least one event or the input is exhausted
                    while not done and not self._queue:
                        data = source.read(bufsize)
                        if not data:  # end of data
                            self.close()
                            done = True
                        else:
                            if not isinstance(data, text_type):
                                raise UnicodeError("source returned bytes, but no encoding specified")
                            self.feed(data)
                    for kind, data, pos in self._queue:
                        yield kind, data, pos
                    self._queue = []
                    if done:
                        # Close whatever is still open, innermost first.
                        # Popping empties the stack, so the elements are not
                        # closed a second time if parse() is run again.
                        while self._open_tags:
                            yield END, QName(self._open_tags.pop()), pos
                        break
            except Exception as e:
                # Python's simple HTMLParser does not raise detailed
                # errors except in strict mode which was deprecated
                # in Python 3.3 and removed in Python 3.5 and which in
                # any case is not used in this code.
                msg = str(e)
                raise ParseError(msg, self.filename)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        return iter(self.parse())

    def _enqueue(self, kind, data, pos=None):
        """Append an event to the queue, defaulting `pos` to the parser's
        current position.
        """
        if pos is None:
            pos = self._getpos()
        self._queue.append((kind, data, pos))

    def _getpos(self):
        """Return ``(filename, line, column)`` as currently reported by the
        underlying parser.
        """
        lineno, column = self.getpos()
        return (self.filename, lineno, column)

    def handle_starttag(self, tag, attrib):
        fixed_attrib = []
        for name, value in attrib:  # Fixup minimized attributes
            if value is None:
                value = name
            fixed_attrib.append((QName(name), stripentities(value)))

        self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
        if tag in self._EMPTY_ELEMS:
            self._enqueue(END, QName(tag))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        if tag not in self._EMPTY_ELEMS:
            # Close open elements, innermost first, up to and including the
            # one matching this end tag
            while self._open_tags:
                open_tag = self._open_tags.pop()
                self._enqueue(END, QName(open_tag))
                if open_tag.lower() == tag.lower():
                    break

    def handle_data(self, text):
        self._enqueue(TEXT, text)

    def handle_charref(self, name):
        # `name` is the reference without '&#' and ';'; a leading 'x' or 'X'
        # marks a hexadecimal code point
        if name.lower().startswith('x'):
            text = unichr(int(name[1:], 16))
        else:
            text = unichr(int(name))
        self._enqueue(TEXT, text)

    def handle_entityref(self, name):
        try:
            text = unichr(html_entities.name2codepoint[name])
        except KeyError:
            # Unknown entity: pass the reference through literally
            text = '&%s;' % name
        self._enqueue(TEXT, text)

    def handle_pi(self, data):
        if data.endswith('?'):
            data = data[:-1]
        try:
            target, data = data.split(None, 1)
        except ValueError:
            # PI with no data
            target = data
            data = ''
        self._enqueue(PI, (target.strip(), data.strip()))

    def handle_comment(self, text):
        self._enqueue(COMMENT, text)

419 

420 

def HTML(text, encoding=None):
    """Parse the given HTML source and return a markup stream.

    Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    >>> print(html)
    <body><h1>Foo</h1></body>
    >>> print(html.select('h1'))
    <h1>Foo</h1>
    >>> print(html.select('h1/text()'))
    Foo

    :param text: the HTML source
    :param encoding: the encoding of `text` if it is a byte string; not used
                     when `text` is a unicode string
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    if isinstance(text, text_type):
        # If it's unicode text the encoding should be set to None.
        # The option to pass in an incorrect encoding is for ease
        # of writing doctests that work in both Python 2.x and 3.x.
        source, encoding = StringIO(text), None
    else:
        source = BytesIO(text)
    parser = HTMLParser(source, encoding=encoding)
    return Stream(list(parser))

446 

447 

def _coalesce(stream):
    """Merge each run of adjacent TEXT events into a single TEXT event.

    The merged event carries the position of the first TEXT event of the run
    that has a position; all other events are passed through unchanged.
    """
    pending = []      # text fragments of the run currently being collected
    run_pos = None    # position to report for that run
    # A trailing sentinel guarantees that a run at the very end of the stream
    # is flushed; the sentinel itself is never yielded (its kind is falsy).
    sentinel = (None, None, None)
    for kind, data, pos in chain(stream, [sentinel]):
        if kind is TEXT:
            if run_pos is None:
                run_pos = pos
            pending.append(data)
            continue
        if pending:
            yield TEXT, ''.join(pending), run_pos
            pending = []
            run_pos = None
        if kind:
            yield kind, data, pos