Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/future/backports/html/parser.py: 85%

1"""A parser for HTML and XHTML.

3Backported for python-future from Python 3.3.

4"""

6# This file is based on sgmllib.py, but the API is slightly different.

8# XXX There should be a way to distinguish between PCDATA (parsed

9# character data -- the normal case), RCDATA (replaceable character

10# data -- only char and entity references and end tags are special)

11# and CDATA (character data -- only end tags are special).

13from __future__ import (absolute_import, division,

14 print_function, unicode_literals)

15from future.builtins import *

16from future.backports import _markupbase

17import re

18import warnings

# Regular expressions used for parsing.
#
# All patterns containing regex escapes such as ``\s`` are written as raw
# strings so that Python does not try to interpret the backslash itself
# (non-raw '\s' is an invalid string escape and triggers a warning on
# modern interpreters).  The compiled patterns are unchanged.

# Characters that interrupt plain character data: a reference or a tag.
interesting_normal = re.compile('[&<]')
# Start of something that looks like an entity/char reference but may be
# cut off by the end of the buffer.
incomplete = re.compile('&[a-zA-Z#]')

# Named and numeric references; both require one trailing terminator
# character (';' or anything that cannot continue the reference).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note:
#   1) the strict attrfind isn't really strict, but we can't make it
#      correctly strict without breaking backward compatibility;
#   2) if you change attrfind remember to update locatestarttagend too;
#   3) if you change attrfind and/or locatestarttagend the parser will
#      explode, so don't do it.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

class HTMLParseError(Exception):
    """Error raised by the parser for every kind of parse failure.

    Attributes:
        msg: human-readable description of the problem.
        lineno: line of the failure, or None when not known.
        offset: zero-based column of the failure, or None when not known.
    """

    def __init__(self, msg, position=(None, None)):
        # ``position`` is indexed (not unpacked) so any sequence with at
        # least two items is accepted.
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        # Assemble the message from its optional location suffixes; the
        # column is reported one-based.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            pieces.append(", column %d" % (self.offset + 1))
        text = pieces[0]
        for suffix in pieces[1:]:
            text = text + suffix
        return text

class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is treated as raw text until the matching
    # end tag is seen.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=False):
        """Initialize and reset this instance.

        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        is deprecated.
        """
        if strict:
            warnings.warn("The strict mode is deprecated.",
                          DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (None if unknown).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Switch to raw-text mode: only the end tag of *elem* is special."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave raw-text mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # Still inside script/style: wait for the end tag.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # At EOF: emit the unterminated construct as data, up
                    # to and including the next '>' or up to the next '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref consumes one terminator character; give it
                    # back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        # Positions must be relative to i, not to the
                        # start of the buffer.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Step over the '&'; whatever follows it is
                            # flushed by the end-of-input handling below.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep whatever was not consumed for the next call.
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '=value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over.
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            # Character right after the matched tag prefix ('' at the end
            # of the buffer).
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                # NOTE(review): nextchar == "/" implies the check above
                # succeeds, so the rest of this branch looks unreachable.
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside script/style a malformed end tag is just text.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # An end tag for a different element is plain text here.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration; an error only in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        if '&' not in s:
            return s

        def replaceEntities(match):
            # ``ref`` is the reference text without the leading '&'.
            ref = match.groups()[0]
            try:
                if ref[0] == "#":
                    ref = ref[1:]
                    if ref[0] in ['x', 'X']:
                        c = int(ref[1:].rstrip(';'), 16)
                    else:
                        c = int(ref.rstrip(';'))
                    return chr(c)
            except (ValueError, OverflowError):
                # Not a number, or a code point chr() cannot represent
                # (chr raises OverflowError for very large values): leave
                # the reference in the text.
                return '&#' + ref
            else:
                from future.backports.html.entities import html5
                if ref in html5:
                    return html5[ref]
                elif ref.endswith(';'):
                    return '&' + ref
                # No exact match: try progressively longer prefixes and
                # keep the remainder as literal text.
                for x in range(2, len(ref)):
                    if ref[:x] in html5:
                        return html5[ref[:x]] + ref[x:]
                else:
                    return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replaceEntities, s)