Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/genshi/input.py: 54%

# -*- coding: utf-8 -*-
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://genshi.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://genshi.edgewall.org/log/.

"""Support for constructing markup streams from files, strings, or other
sources.
"""

18from itertools import chain

19import codecs

20from xml.parsers import expat

22from genshi.compat import html_entities, html_parser, text_type, unichr, \

23 StringIO, BytesIO

24from genshi.core import Attrs, QName, Stream, stripentities

25from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \

26 END_NS, START_CDATA, END_CDATA, PI, COMMENT

29__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']

30__docformat__ = 'restructuredtext en'

def ET(element):
    """Convert a given ElementTree element to a markup stream.

    The element is walked depth-first: a ``START`` event for the element,
    its leading text (if any), the events of every child in document order,
    an ``END`` event, and finally the element's tail text (if any).

    :param element: an ElementTree element
    :return: a markup stream
    """
    # ElementTree nodes carry no source location, so every event gets the
    # same "unknown" position.
    pos = (None, -1, -1)

    # ElementTree spells qualified names as ``{namespace}local``; only the
    # leading brace is removed before the name is handed to `QName`.
    tag_name = QName(element.tag.lstrip('{'))
    attrs = Attrs([(QName(attr.lstrip('{')), value)
                   for attr, value in element.items()])

    yield START, (tag_name, attrs), pos
    if element.text:
        yield TEXT, element.text, pos
    for child in element:
        # Explicit loop instead of ``yield from`` to stay compatible with
        # the Python 2 syntax used throughout this module.
        for item in ET(child):
            yield item
    yield END, tag_name, pos
    if element.tail:
        yield TEXT, element.tail, pos

class ParseError(Exception):
    """Exception raised when fatal syntax errors are found in the input being
    parsed.

    The original parser message is kept in `msg`; the string form of the
    exception additionally names the file when one is known.
    """

    def __init__(self, message, filename=None, lineno=-1, offset=-1):
        """Exception initializer.

        :param message: the error message from the parser
        :param filename: the path to the file that was parsed
        :param lineno: the number of the line on which the error was encountered
        :param offset: the column number where the error was encountered
        """
        # Keep the bare parser message separately from the decorated one
        # passed to the base class.
        self.msg = message
        if filename:
            message += ', in ' + filename
        Exception.__init__(self, message)
        # Fall back to a placeholder name when no file is associated with
        # the input.
        self.filename = filename or '<string>'
        self.lineno = lineno
        self.offset = offset

class XMLParser(object):
    """Generator-based XML parser based on roughly equivalent code in
    Kid/ElementTree.

    The parsing is initiated by iterating over the parser object:

    >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
    >>> for kind, data, pos in parser:
    ...     print('%s %s' % (kind, data))
    START (QName('root'), Attrs([(QName('id'), '2')]))
    START (QName('child'), Attrs())
    TEXT Foo
    END child
    END root
    """

    # A synthetic DTD declaring every HTML named entity as a numeric
    # character reference; it is fed to Expat as the "foreign" DTD (see
    # `_build_foreign`).
    _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
                   html_entities.name2codepoint.items()]
    _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')

    def __init__(self, source, filename=None, encoding=None):
        """Initialize the parser for the given XML input.

        :param source: the XML text as a file-like object
        :param filename: the name of the file, if appropriate
        :param encoding: the encoding of the file; if not specified, the
                         encoding is assumed to be ASCII, UTF-8, or UTF-16, or
                         whatever the encoding specified in the XML declaration
                         (if any)
        """
        self.source = source
        self.filename = filename

        # Setup the Expat parser; '}' separates namespace URI and local name
        # in the qualified names Expat reports.
        parser = expat.ParserCreate(encoding, '}')
        parser.buffer_text = True
        # Python 3 does not have returns_unicode
        if hasattr(parser, 'returns_unicode'):
            parser.returns_unicode = True
        parser.ordered_attributes = True

        parser.StartElementHandler = self._handle_start
        parser.EndElementHandler = self._handle_end
        parser.CharacterDataHandler = self._handle_data
        parser.StartDoctypeDeclHandler = self._handle_doctype
        parser.StartNamespaceDeclHandler = self._handle_start_ns
        parser.EndNamespaceDeclHandler = self._handle_end_ns
        parser.StartCdataSectionHandler = self._handle_start_cdata
        parser.EndCdataSectionHandler = self._handle_end_cdata
        parser.ProcessingInstructionHandler = self._handle_pi
        parser.XmlDeclHandler = self._handle_xml_decl
        parser.CommentHandler = self._handle_comment

        # Tell Expat that we'll handle non-XML entities ourselves
        # (in _handle_other)
        parser.DefaultHandler = self._handle_other
        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        parser.UseForeignDTD()
        parser.ExternalEntityRefHandler = self._build_foreign

        self.expat = parser
        # Events produced by the Expat callbacks, drained by `parse`.
        self._queue = []

    def parse(self):
        """Generator that parses the XML source, yielding markup events.

        :return: a markup event stream
        :raises ParseError: if the XML text is not well formed
        """
        def _generate():
            try:
                bufsize = 4 * 1024  # 4K
                done = False
                while True:
                    # Feed Expat until its callbacks have queued at least
                    # one event or the input is exhausted.
                    while not done and not self._queue:
                        data = self.source.read(bufsize)
                        if not data:  # end of data
                            if hasattr(self, 'expat'):
                                self.expat.Parse('', True)
                                del self.expat  # get rid of circular references
                            done = True
                        else:
                            # Text chunks are encoded to UTF-8 bytes before
                            # being handed to Expat.
                            if isinstance(data, text_type):
                                data = data.encode('utf-8')
                            self.expat.Parse(data, False)
                    for event in self._queue:
                        yield event
                    self._queue = []
                    if done:
                        break
            except expat.ExpatError as e:
                msg = str(e)
                raise ParseError(msg, self.filename, e.lineno, e.offset)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        """Iterate over the events produced by `parse`."""
        return iter(self.parse())

    def _build_foreign(self, context, base, sysid, pubid):
        """Expat external-entity callback: parse the built-in entity DTD.

        The `base`, `sysid` and `pubid` arguments are ignored; the
        class-level `_external_dtd` is always what gets parsed.

        :return: 1, the value this callback hands back to Expat
        """
        parser = self.expat.ExternalEntityParserCreate(context)
        parser.ParseFile(BytesIO(self._external_dtd))
        return 1

    def _enqueue(self, kind, data=None, pos=None):
        """Append an event to the queue, defaulting `pos` to the current
        parser position.
        """
        if pos is None:
            pos = self._getpos()
        if kind is TEXT:
            # Expat reports the *end* of the text event as current position. We
            # try to fix that up here as much as possible. Unfortunately, the
            # offset is only valid for single-line text. For multi-line text,
            # it is apparently not possible to determine at what offset it
            # started
            if '\n' in data:
                lines = data.splitlines()
                lineno = pos[1] - len(lines) + 1
                offset = -1
            else:
                lineno = pos[1]
                offset = pos[2] - len(data)
            pos = (pos[0], lineno, offset)
        self._queue.append((kind, data, pos))

    def _getpos_unknown(self):
        """Return a position tuple with unknown line and column."""
        return (self.filename, -1, -1)

    def _getpos(self):
        """Return ``(filename, lineno, column)`` as reported by Expat."""
        return (self.filename, self.expat.CurrentLineNumber,
                self.expat.CurrentColumnNumber)

    def _handle_start(self, tag, attrib):
        # With ``ordered_attributes`` enabled, Expat passes a flat
        # ``[name, value, name, value, ...]`` list; pair the items up.
        attrs = Attrs([(QName(name), value) for name, value in
                       zip(*[iter(attrib)] * 2)])
        self._enqueue(START, (QName(tag), attrs))

    def _handle_end(self, tag):
        self._enqueue(END, QName(tag))

    def _handle_data(self, text):
        self._enqueue(TEXT, text)

    def _handle_xml_decl(self, version, encoding, standalone):
        self._enqueue(XML_DECL, (version, encoding, standalone))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        # Note the reordering: the event carries (name, pubid, sysid).
        self._enqueue(DOCTYPE, (name, pubid, sysid))

    def _handle_start_ns(self, prefix, uri):
        self._enqueue(START_NS, (prefix or '', uri))

    def _handle_end_ns(self, prefix):
        self._enqueue(END_NS, prefix or '')

    def _handle_start_cdata(self):
        self._enqueue(START_CDATA)

    def _handle_end_cdata(self):
        self._enqueue(END_CDATA)

    def _handle_pi(self, target, data):
        self._enqueue(PI, (target, data))

    def _handle_comment(self, text):
        self._enqueue(COMMENT, text)

    def _handle_other(self, text):
        """Expat default handler: resolve HTML named entities.

        Text starting with ``&`` is looked up in the HTML entity table and
        queued as a ``TEXT`` event; an unknown name raises an Expat error
        carrying the current line and column. Any other text is ignored.
        """
        if text.startswith('&'):
            # deal with undefined entities
            try:
                # Strip the leading '&' and trailing ';' to get the name.
                text = unichr(html_entities.name2codepoint[text[1:-1]])
                self._enqueue(TEXT, text)
            except KeyError:
                filename, lineno, offset = self._getpos()
                error = expat.error('undefined entity "%s": line %d, column %d'
                                    % (text, lineno, offset))
                error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
                error.lineno = lineno
                error.offset = offset
                raise error

254

255

def XML(text):
    """Parse the given XML source and return a markup stream.

    Unlike with `XMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
    >>> print(xml)
    <doc><elem>Foo</elem><elem>Bar</elem></doc>
    >>> print(xml.select('elem'))
    <elem>Foo</elem><elem>Bar</elem>
    >>> print(xml.select('elem/text()'))
    FooBar

    :param text: the XML source
    :return: the parsed XML event stream
    :raises ParseError: if the XML text is not well-formed
    """
    # Materialise the events into a list so the stream can be replayed.
    return Stream(list(XMLParser(StringIO(text))))

275

276

class HTMLParser(html_parser.HTMLParser, object):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.

    The parsing is initiated by iterating over the parser object:

    >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
    >>> for kind, data, pos in parser:
    ...     print('%s %s' % (kind, data))
    START (QName('ul'), Attrs([(QName('compact'), 'compact')]))
    START (QName('li'), Attrs())
    TEXT Foo
    END li
    END ul
    """

    # Elements that never have content: an END event is emitted right after
    # their START event and their end tags are ignored.
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source, filename=None, encoding=None):
        """Initialize the parser for the given HTML input.

        :param source: the HTML text as a file-like object
        :param filename: the name of the file, if known
        :param encoding: encoding of the file; ignored if the input is unicode
        """
        html_parser.HTMLParser.__init__(self)
        self.source = source
        self.filename = filename
        self.encoding = encoding
        # Events produced by the handler callbacks, drained by `parse`.
        self._queue = []
        # Stack of currently open (non-empty) element names.
        self._open_tags = []

    def parse(self):
        """Generator that parses the HTML source, yielding markup events.

        :return: a markup event stream
        :raises ParseError: if the HTML text is not well formed
        """
        def _generate():
            # With an explicit encoding the source is wrapped in a decoding
            # reader; otherwise it must already yield text.
            if self.encoding:
                reader = codecs.getreader(self.encoding)
                source = reader(self.source)
            else:
                source = self.source
            try:
                bufsize = 4 * 1024  # 4K
                done = False
                while True:
                    # Feed the parser until a handler has queued at least
                    # one event or the input is exhausted.
                    while not done and not self._queue:
                        chunk = source.read(bufsize)
                        if not chunk:  # end of data
                            self.close()
                            done = True
                        else:
                            if not isinstance(chunk, text_type):
                                raise UnicodeError("source returned bytes, but no encoding specified")
                            self.feed(chunk)
                    for kind, data, pos in self._queue:
                        yield kind, data, pos
                    self._queue = []
                    if done:
                        # Close whatever is still open, innermost first.
                        # `pos` is the position of the last yielded event;
                        # it is always bound here because a tag can only
                        # be open after its START event was yielded.
                        open_tags = self._open_tags
                        open_tags.reverse()
                        for tag in open_tags:
                            yield END, QName(tag), pos
                        break
            except Exception as e:
                # Python's simple HTMLParser does not raise detailed
                # errors except in strict mode which was deprecated
                # in Python 3.3 and removed in Python 3.5 and which in
                # any case is not used in this code.
                msg = str(e)
                raise ParseError(msg, self.filename)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        """Iterate over the events produced by `parse`."""
        return iter(self.parse())

    def _enqueue(self, kind, data, pos=None):
        """Append an event to the queue, defaulting `pos` to the current
        parser position.
        """
        if pos is None:
            pos = self._getpos()
        self._queue.append((kind, data, pos))

    def _getpos(self):
        """Return ``(filename, lineno, column)`` for the current position."""
        lineno, column = self.getpos()
        return (self.filename, lineno, column)

    def handle_starttag(self, tag, attrib):
        fixed_attrib = []
        for name, value in attrib:  # Fixup minimized attributes
            if value is None:
                value = name
            fixed_attrib.append((QName(name), stripentities(value)))

        self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
        if tag in self._EMPTY_ELEMS:
            # Empty elements are closed immediately.
            self._enqueue(END, QName(tag))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        if tag not in self._EMPTY_ELEMS:
            # Close every open element up to and including the one matching
            # `tag`; if none matches, all open elements end up closed.
            while self._open_tags:
                open_tag = self._open_tags.pop()
                self._enqueue(END, QName(open_tag))
                if open_tag.lower() == tag.lower():
                    break

    def handle_data(self, text):
        self._enqueue(TEXT, text)

    def handle_charref(self, name):
        # ``&#xHH;`` is hexadecimal, ``&#DD;`` decimal.
        if name.lower().startswith('x'):
            text = unichr(int(name[1:], 16))
        else:
            text = unichr(int(name))
        self._enqueue(TEXT, text)

    def handle_entityref(self, name):
        try:
            text = unichr(html_entities.name2codepoint[name])
        except KeyError:
            # Unknown entities are passed through verbatim.
            text = '&%s;' % name
        self._enqueue(TEXT, text)

    def handle_pi(self, data):
        # Drop the trailing '?' of an XML-style ``<?target data?>`` PI.
        if data.endswith('?'):
            data = data[:-1]
        try:
            target, data = data.split(None, 1)
        except ValueError:
            # PI with no data
            target = data
            data = ''
        self._enqueue(PI, (target.strip(), data.strip()))

    def handle_comment(self, text):
        self._enqueue(COMMENT, text)

419

420

def HTML(text, encoding=None):
    """Parse the given HTML source and return a markup stream.

    Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    >>> print(html)
    <body><h1>Foo</h1></body>
    >>> print(html.select('h1'))
    <h1>Foo</h1>
    >>> print(html.select('h1/text()'))
    Foo

    :param text: the HTML source
    :param encoding: the encoding used to decode `text` when it is not
                     unicode text; ignored otherwise
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    if isinstance(text, text_type):
        # If it's unicode text the encoding should be set to None.
        # The option to pass in an incorrect encoding is for ease
        # of writing doctests that work in both Python 2.x and 3.x.
        return Stream(list(HTMLParser(StringIO(text), encoding=None)))
    return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))

446

447

def _coalesce(stream):
    """Coalesces adjacent TEXT events into a single event.

    A run of consecutive ``TEXT`` events is replaced by one ``TEXT`` event
    whose data is the concatenation of the run and whose position is that
    of the first event in the run. All other events pass through unchanged.

    :param stream: an iterable of ``(kind, data, pos)`` events
    :return: a generator over the coalesced events
    """
    textbuf = []
    textpos = None
    # A sentinel event with a false `kind` is appended so that text pending
    # at the very end of the stream is still flushed.
    for kind, data, pos in chain(stream, [(None, None, None)]):
        if kind is TEXT:
            textbuf.append(data)
            if textpos is None:
                textpos = pos
        else:
            if textbuf:
                yield TEXT, ''.join(textbuf), textpos
                del textbuf[:]
                textpos = None
            if kind:
                yield kind, data, pos