Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/_markupbase.py: 7%

272 statements  

« prev     ^ index     » next       coverage.py v7.0.5, created at 2023-01-17 06:13 +0000

1"""Shared support for scanning document type declarations in HTML and XHTML. 

2 

3This module is used as a foundation for the html.parser module. It has no 

4documented public API and should not be used directly. 

5 

6""" 

7 

8import re 

9 

10_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match 

11_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match 

12_commentclose = re.compile(r'--\s*>') 

13_markedsectionclose = re.compile(r']\s*]\s*>') 

14 

15# An analysis of the MS-Word extensions is available at 

16# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf 

17 

18_msmarkedsectionclose = re.compile(r']\s*>') 

19 

20del re 

21 

22 

class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    This class cannot be instantiated directly.  It reads ``self.rawdata``
    (the text being scanned) and calls ``self.handle_decl()`` and
    ``self.handle_comment()``, none of which it defines; they have to be
    supplied by the subclass.

    The scanning methods take an index into ``rawdata`` and return a
    non-negative index on success or a negative number when the buffer
    ends before the construct is complete.

    Problems are reported through error(), which is expected to raise.
    If an override returns instead, the scanning methods abandon the
    construct and return -1.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse problem; must be overridden by subclasses."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset counts the characters after the last newline.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters accepted (and skipped) inside a declaration.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" declaration starting at rawdata[i].

        Calls handle_decl() for a doctype declaration and unknown_decl()
        for any other named declaration, passing the text between "<!"
        and ">".  Returns the index just past the closing ">", or a
        negative number if the declaration is incomplete.  Also returns
        -1 if error() was called and returned.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            # NOTE(review): this test is true for any "-" at j, so the
            # '--' branch below is never reached from here; kept as is
            # because changing it would alter results for callers.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # error() returned: j was not advanced, so stop
                    # instead of re-reading the same character forever.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1  # same reason as above
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1  # same reason as above
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting with "<![" at rawdata[i].

        When *report* is true, unknown_decl() is called with the text
        between "<![" and the closing delimiter.  Returns the index just
        past the closing delimiter, or a negative number if the section
        is incomplete.  Also returns -1 if error() was called for an
        unknown status keyword and returned.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in {"if", "else", "endif"}:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # error() returned: there is no closing pattern to search
            # for, so give up instead of using an unbound 'match'.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return the index past it or -1 if not
    # terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting with "<!--" at rawdata[i].

        When *report* is true, handle_comment() is called with the text
        between "<!--" and the closing delimiter.  Returns the index
        just past the closing delimiter, or -1 if the comment is not
        terminated within the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the doctype internal subset starting at rawdata[i].

        *declstartpos* is the index of the enclosing declaration; it is
        passed to updatepos() before a problem is reported.  Returns the
        index of the ">" that follows the closing "]", or a negative
        number if the subset is incomplete.  Also returns -1 if error()
        was called and returned.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    # error() returned: do not scan the unexpected
                    # markup as if it were a declaration.
                    return -1
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # error() returned: there is no _parse_doctype_<name>
                    # handler to dispatch to.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    # error() returned: stop rather than keep scanning
                    # past the end of the subset.
                    return -1
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # error() returned: j was not advanced, so stop instead
                # of re-reading the same character forever.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an element declaration whose name starts at rawdata[i].

        Returns the index just past the next ">", or -1 if the buffer
        ends first.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an attlist declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative
        number if the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            # the name ran into the end of the buffer
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a notation declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative
        number if the buffer ends first.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an entity declaration whose body starts at rawdata[i].

        A leading "%" and the whitespace after it are skipped before the
        name is scanned.  Returns the index just past the closing ">",
        or a negative number if the buffer ends first.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new
    # position, or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at rawdata[i].

        Returns ``(name, j)`` where *name* is the lower-cased token and
        *j* is the index just past the token and its trailing
        whitespace.  Returns ``(None, -1)`` when *i* is the end of the
        buffer or the token runs up to the end of the buffer.  If no
        name starts at *i*, error() is called; should it return,
        ``(None, -1)`` is returned as well.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # error() returned: keep the (name, index) contract so that
            # callers can still unpack the result.
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle an unrecognized declaration; does nothing by default."""
        pass