Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/future/backports/_markupbase.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

277 statements  

1"""Shared support for scanning document type declarations in HTML and XHTML. 

2 

3Backported for python-future from Python 3.3. Reason: ParserBase is an 

4old-style class in the Python 2.7 source of markupbase.py, which I suspect 

5might be the cause of sporadic unit-test failures on travis-ci.org with 

6test_htmlparser.py. The test failures look like this: 

7 

8 ====================================================================== 

9 

10ERROR: test_attr_entity_replacement (future.tests.test_htmlparser.AttributesStrictTestCase) 

11 

12---------------------------------------------------------------------- 

13 

14Traceback (most recent call last): 

15 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 661, in test_attr_entity_replacement 

16 [("starttag", "a", [("b", "&><\"'")])]) 

17 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 93, in _run_check 

18 collector = self.get_collector() 

19 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 617, in get_collector 

20 return EventCollector(strict=True) 

21 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 27, in __init__ 

22 html.parser.HTMLParser.__init__(self, *args, **kw) 

23 File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 135, in __init__ 

24 self.reset() 

25 File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 143, in reset 

26 _markupbase.ParserBase.reset(self) 

27 

28TypeError: unbound method reset() must be called with ParserBase instance as first argument (got EventCollector instance instead) 

29 

30This module is used as a foundation for the html.parser module. It has no 

31documented public API and should not be used directly. 

32 

33""" 

34 

35import re 

36 

_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re

# Lookup tables used by the scanners below.  They are built once at import
# time instead of on every call.
_ASCII_LETTERS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Declaration types for which a '[' group is recognised but not supported.
_BRACKETED_DECL_TYPES = frozenset(["attlist", "linktype", "link", "element"])
# Marked-section status keywords terminated by "]]>".
_STANDARD_SECTION_KEYWORDS = frozenset(
    ["temp", "cdata", "ignore", "include", "rcdata"])
# MS Office "Save as Web" keywords, terminated by "]>".
_MSOFFICE_SECTION_KEYWORDS = frozenset(["if", "else", "endif"])
# Declarations understood inside a DOCTYPE internal subset; each one maps to
# a ``_parse_doctype_<name>`` method.
_SUBSET_DECL_TYPES = frozenset(["attlist", "element", "entity", "notation"])


class ParserBase(object):
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The scanning methods read the buffer from ``self.rawdata`` and call
    ``self.handle_decl()`` and ``self.handle_comment()``; none of these are
    defined here, so subclasses must supply them.

    Convention used by every ``parse_*`` / ``_parse_*`` method: the return
    value is the index just past the construct that was consumed, or a
    negative number when the buffer ends before the construct is complete.

    ``error()`` is expected to raise.  If a subclass overrides it with a
    version that returns, the scanners report "incomplete" (-1) instead of
    continuing with undefined state.
    """

    def __init__(self):
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position counters to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over ``rawdata[i:j]`` and return ``j``."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: count() just found at least one newline.
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra single characters accepted inside a declaration; cleared
    # whenever a DOCTYPE declaration is started.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the ``<!...>`` construct starting at ``rawdata[i]``.

        Dispatches comments to parse_comment() and marked sections to
        parse_marked_section().  For any other declaration, handle_decl() is
        called for DOCTYPE and unknown_decl() for everything else, with the
        text between ``<!`` and ``>``.

        Returns the index just past the closing ``>``, or -1 if the
        declaration is incomplete.  Raises AssertionError when
        ``rawdata[i:i+2]`` is not ``"<!"``.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        if rawdata[i:j] != "<!":
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError("unexpected call to parse_declaration")
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] == '--':
            # Locate --.*-- as the body of the comment.  This has to be
            # tested before the single "-" check below, which would
            # otherwise swallow every comment and make this unreachable.
            return self.parse_comment(i)
        if rawdata[j:j+1] in ("-", ""):
            # A lone "-" (possibly a comment start cut off by the buffer
            # boundary), or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j] == '[':
            # Locate [statusWord [...arbitrary SGML...]] as the body of the
            # marked section, where statusWord is one of TEMP, CDATA, IGNORE,
            # INCLUDE, RCDATA.  Note that this is extended by the Microsoft
            # Office "Save as Web" function to include [if...] and [endif].
            return self.parse_marked_section(i)
        # all other declaration elements
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in _ASCII_LETTERS:
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in _BRACKETED_DECL_TYPES:
                    # must tolerate []'d groups in a content model in an
                    # element declaration; also in data attribute
                    # specifications of attlist declarations, link type
                    # declaration subsets in linktype declarations and link
                    # attribute specification lists in link declarations
                    self.error(
                        "unsupported '[' char in %s declaration" % decltype)
                    # Only reached if error() returned; j was not advanced,
                    # so looping again would never terminate.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the ``<![keyword ...`` marked section at ``rawdata[i]``.

        Standard keywords are closed by ``]]>``, the MS Office ones by
        ``]>``.  When ``report`` is true, unknown_decl() receives the text
        between ``<![`` and the closing delimiter.

        Returns the index just past the closing delimiter, or -1 when the
        section is unterminated.  Raises AssertionError when
        ``rawdata[i:i+3]`` is not ``"<!["``.
        """
        rawdata = self.rawdata
        if rawdata[i:i+3] != '<![':
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError("unexpected call to parse_marked_section()")
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in _STANDARD_SECTION_KEYWORDS:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in _MSOFFICE_SECTION_KEYWORDS:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section'
                       % rawdata[i+3:j])
            # Only reached if error() returned; there is no terminator
            # pattern to search for, so ``match`` would be unbound.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return the index just past the closing
    # "-->" or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the ``<!-- ... -->`` comment starting at ``rawdata[i]``.

        When ``report`` is true, handle_comment() receives the text between
        ``<!--`` and the closing ``--``.  Returns the index just past the
        closing ``>``, or -1 when the comment is unterminated.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset that starts at ``rawdata[i]``.

        ``declstartpos`` is the index of the enclosing declaration and is
        only used for position bookkeeping before an error is reported.
        Returns the index of the ``>`` that follows the closing ``]``, or -1
        when the subset is incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error(
                        "unexpected char in internal subset (in %r)" % s)
                    # Only reached if error() returned.
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete (this also covers the case
                    # where "<!" are the last two characters)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in _SUBSET_DECL_TYPES:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # Only reached if error() returned; there is no
                    # _parse_doctype_<name> method to dispatch to.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j >= n:
                    return -1
                if rawdata[j] == ">":
                    return j
                self.updatepos(declstartpos, j)
                self.error("unexpected char after internal subset")
                # Only reached if error() returned.
                return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # Only reached if error() returned; j was not advanced.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an ELEMENT declaration body; return the index past ``>``."""
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an ATTLIST declaration body; return the index past ``>``."""
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a NOTATION declaration body; return the index past ``>``."""
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an ENTITY declaration body; return the index past ``>``."""
        rawdata = self.rawdata
        j = i
        if rawdata[i:i+1] == "%":
            # parameter entity: skip the '%' and the whitespace after it
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if not c.isspace():
                    break
                j = j + 1
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the lower-cased token and the
    # index just past it (and any trailing whitespace), or (None, -1) if
    # we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at ``rawdata[i]``.

        Returns ``(name, end)`` with ``name`` lower-cased and ``end`` the
        index after the token and its trailing whitespace.  Returns
        ``(None, -1)`` when the token touches the end of the buffer, because
        more characters of the name may still arrive.  When no name starts
        at ``i``, error() is called after updating the position.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            if m.end() == n:
                return None, -1  # end of buffer
            return m.group().strip().lower(), m.end()
        self.updatepos(declstartpos, i)
        self.error("expected name token at %r"
                   % rawdata[declstartpos:declstartpos+20])
        # Only reached if error() returned.  Callers unpack a 2-tuple, so
        # falling through to an implicit None would raise TypeError.
        return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Hook for declarations that are not DOCTYPE; does nothing here."""
        pass