Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/future/backports/

"""Shared support for scanning document type declarations in HTML and XHTML.

Backported for python-future from Python 3.3. Reason: ParserBase is an
old-style class in the Python 2.7 source of markupbase.py, which I suspect
might be the cause of sporadic unit-test failures on travis-ci.org with
test_htmlparser.py. The test failures look like this:

    ======================================================================

ERROR: test_attr_entity_replacement (future.tests.test_htmlparser.AttributesStrictTestCase)

----------------------------------------------------------------------

Traceback (most recent call last):
  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 661, in test_attr_entity_replacement
    [("starttag", "a", [("b", "&><\"'")])])
  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 93, in _run_check
    collector = self.get_collector()
  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 617, in get_collector
    return EventCollector(strict=True)
  File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 27, in __init__
    html.parser.HTMLParser.__init__(self, *args, **kw)
  File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 135, in __init__
    self.reset()
  File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 143, in reset
    _markupbase.ParserBase.reset(self)

TypeError: unbound method reset() must be called with ParserBase instance as first argument (got EventCollector instance instead)

This module is used as a foundation for the html.parser module. It has no
documented public API and should not be used directly.

"""

import re

# Matches a name token (a letter followed by letters, digits, '-', '_' or
# '.') together with any whitespace that follows it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matches a single- or double-quoted string literal plus trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Comment terminator: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')
# Standard marked-section terminator "]]>", whitespace allowed in between.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# MS Office marked-section terminator "]>", whitespace allowed in between.
_msmarkedsectionclose = re.compile(r']\s*>')

# Only the compiled patterns are needed from here on.
del re


class ParserBase(object):
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The class itself never assigns ``self.rawdata`` and does not define
    ``handle_decl()`` or ``handle_comment()``; the methods below read and
    call them, so a subclass has to provide them.

    Scanning methods return the index just past what they consumed, or a
    negative number when the buffer ends before the construct is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created.

        Raises:
            RuntimeError: if the instance's class is exactly ParserBase.
        """
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def _raise_error(self, message):
        """Report *message* through error() and never return normally.

        Used at the call sites where continuing after error() is impossible:
        the scan position would not advance (endless loop) or a value the
        following code needs would be missing.  Whatever error() raises
        propagates unchanged.

        Raises:
            AssertionError: if an overriding error() returns instead of
                raising.
        """
        self.error(message)
        raise AssertionError(message)

    def reset(self):
        """Set the tracked position back to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over ``self.rawdata[i:j]`` and return j.

        Nothing is updated when ``i >= j``.
        """
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: at least one newline was counted in [i, j).
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters accepted inside a declaration; empty by default.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the ``<!...>`` declaration starting at index *i*.

        A completed "doctype" declaration is passed to handle_decl(); any
        other completed declaration is passed to unknown_decl().  In both
        cases the argument is the text between "<!" and the closing ">".

        Returns:
            The index just past the closing ">", or a negative number if
            the buffer ends before the declaration is complete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Kept from the original; the "-" check above already returned,
            # so this branch cannot be reached.
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the
            # marked section, where statusWord is one of TEMP, CDATA, IGNORE,
            # INCLUDE, RCDATA.  Note that this is extended by the Microsoft
            # Office "Save as Web" function to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an
                    # element declaration, also in data attribute
                    # specifications of attlist declaration, also link type
                    # declaration subsets in linktype declarations, also link
                    # attribute specification lists in link declarations
                    self._raise_error(
                        "unsupported '[' char in %s declaration" % decltype)
                else:
                    self._raise_error("unexpected '[' char in declaration")
            else:
                # j is not advanced here, so this must not fall through.
                self._raise_error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section that starts with "<![" at index *i*.

        Args:
            i: index of the opening "<![".
            report: when true, the text between "<![" and the terminator is
                passed to unknown_decl().

        Returns:
            The index just past the terminator, or a negative number if the
            status keyword or the terminator is not yet in the buffer.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            # No terminator pattern is known for this keyword, so there is
            # nothing to continue with.
            self._raise_error(
                'unknown status keyword %r in marked section' % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment that starts with "<!--" at index *i*.

        Args:
            i: index of the opening "<!--".
            report: when true, the comment text is passed to
                handle_comment().

        Returns:
            The index just past the comment terminator, or -1 if the
            terminator is not in the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset of a doctype declaration from index *i*.

        Args:
            i: index of the first character after the opening "[".
            declstartpos: start of the enclosing declaration, used for
                position updates before an error is reported.

        Returns:
            The index of the ">" that follows the closing "]" (and any
            whitespace), or a negative number if the buffer is incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                # j is not advanced here, so this must not fall through.
                self._raise_error(
                    "unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an element declaration whose name starts at index *i*.

        Returns:
            The index just past the next ">", or -1 if the name or the ">"
            is not yet in the buffer.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an attlist declaration whose element name starts at *i*.

        Returns:
            The index just past the closing ">", or a negative number if
            the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a notation declaration whose name starts at index *i*.

        Returns:
            The index just past the closing ">", or a negative number if
            the buffer ends first.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an entity declaration whose body starts at index *i*.

        A leading "%" and the whitespace after it are skipped before the
        entity name is scanned.

        Returns:
            The index just past the closing ">", or a negative number if
            the buffer ends first.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new position,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token (and trailing whitespace) at index *i*.

        Args:
            i: index at which the name is expected.
            declstartpos: start of the enclosing declaration, used for the
                position update and the error message.

        Returns:
            ``(name, end)`` with the name lower-cased and *end* just past
            the trailing whitespace, or ``(None, -1)`` when *i* is at the
            end of the buffer or the match runs up to the end of the buffer
            (more input might extend the token).
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        self.updatepos(declstartpos, i)
        # Every caller unpacks a 2-tuple, so returning nothing is not an
        # option here.
        self._raise_error("expected name token at %r"
                          % rawdata[declstartpos:declstartpos+20])

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Hook for declarations with no dedicated handler; does nothing."""
        pass

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/future/backports/_markupbase.py: 27%

277 statements