# Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/html/parser.py: 15%

"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).

import re
import _markupbase

from html import unescape


# Public API of this module: only the parser class is exported.
__all__ = ['HTMLParser']

# Regular expressions used for parsing

# Next character that can start markup ('<') or a reference ('&').
interesting_normal = re.compile('[&<]')
# '&' followed by a letter or '#': the start of a reference that may not be
# complete yet.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference; group 1 is the name.  The final character class consumes
# one terminating character, which need not be ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal or hexadecimal numeric reference, again followed by one
# terminating character that need not be ';'.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by a letter: the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# End of a processing instruction.
piclose = re.compile('>')
# End of a comment: '--', optional whitespace, '>'.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# Tag name (group 1) plus any whitespace or '/' (not part of '/>') after it.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# One attribute: group 1 is the name, group 2 the whole '= value' part (if
# any) and group 3 the value itself, still including any quotes.
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
# Matches a start tag from '<' up to, but not including, its closing '>'
# or '/>'.
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
# The '>' that terminates an end tag.
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Complete, well-formed end tag; group 1 is the tag name.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Start tags that switch the parser into CDATA mode (see
    # set_cdata_mode()): until the matching end tag, everything is data.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    # Source text of the most recently parsed start tag (None until one has
    # been parsed completely).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is markup from now on.

        *elem* is lower-cased and stored in self.cdata_elem, and
        self.interesting becomes a case-insensitive pattern for its end tag.
        """
        self.cdata_elem = elem.lower()
        # Escape the name so that characters such as '.' or '+' in it are
        # matched literally instead of being read as regex syntax.
        self.interesting = re.compile(
            r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '<' and '&' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible and call the handlers.

        Whatever could not be processed yet is stored back in self.rawdata.
        If *end* is true, incomplete constructs are flushed as data instead
        of being kept for a later call.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        # the end tag has not arrived yet: keep buffering
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # the construct starting at i is not terminated
                    if not end:
                        break
                    # at EOF: emit it as data up to and including the next
                    # '>', else up to the next '<', else just the '<' itself
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matched the character after the
                    # reference; give it back unless it is the ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if rawdata.find(";", i) >= 0:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # same one-character give-back as for charrefs above
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        # at EOF the buffer is exactly this fragment: step
                        # over the '&' (the tail is flushed after the loop)
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # raised explicitly so the check also works under -O, where
                # an assert would vanish and this loop would never advance
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            # cdata_elem is known to be unset here, so only
            # convert_charrefs decides whether the tail is unescaped
            if self.convert_charrefs:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Dispatch the '<!...' construct at index *i* of self.rawdata.

        Returns the index just past the construct, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Consume the '<!...>' or '</...>' construct at index *i*.

        Everything between the two-character opener and the next '>' is
        passed to handle_comment() unless *report* is false.  Returns the
        index just past the '>', or -1 if there is no '>' yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' construct at index *i* and call handle_pi().

        Returns the index just past the closing '>', or -1 if it is missing.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index *i* and call the matching handler.

        Returns the index just past the tag, or a negative value if the tag
        is not complete yet.  Text that cannot be read as a tag is passed to
        handle_data() instead.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute given without '=': its value is None
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                # strip the surrounding quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # not a usable tag after all: hand the raw text over as data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means more input is needed before the tag can be delimited.
        """
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        next_char = rawdata[j:j+1]
        if next_char == ">":
            return j + 1
        if next_char == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # a '/' that is not part of '/>': treated as a buffer boundary
            return -1
        if next_char == "":
            # end of input
            return -1
        if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        # any other character: stop at it, consuming at least one character
        if j > i:
            return j
        return i + 1

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the '</...' construct at index *i* of self.rawdata.

        Returns the index just past the construct, or -1 if no '>' has been
        seen yet.  In CDATA mode anything except the end tag of the current
        CDATA element is passed to handle_data().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # an end tag for some other element is plain data here
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag .../>' by calling handle_starttag then handle_endtag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Hook: *tag* is the lower-cased name, *attrs* a list of
        (lower-cased name, value-or-None) pairs.  Does nothing by default."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Hook: *tag* is the lower-cased name.  Does nothing by default."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Hook: *name* is the text between '&#' and the terminator.
        Does nothing by default."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Hook: *name* is the entity name without '&'.  Does nothing by
        default."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Hook: *data* is a piece of text content.  Does nothing by default."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Hook: *data* is the text of a comment.  Does nothing by default."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Hook: *decl* is the text between '<!' and '>'.  Does nothing by
        default."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Hook: *data* is the text between '<?' and '>'.  Does nothing by
        default."""
        pass

    def unknown_decl(self, data):
        """Hook that does nothing by default.

        Not called anywhere in this class; presumably invoked by the
        _markupbase.ParserBase base class -- TODO confirm.
        """
        pass