# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/_markupbase.py

"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the html.parser module.  It has no
documented public API and should not be used directly.

"""

import re

# Bound ``match`` methods: a declaration name token (plus trailing
# whitespace) and a quoted string literal (plus trailing whitespace).
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Terminators for comments ("-->") and standard marked sections ("]]>").
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# Terminator for MS Office style marked sections ("]>").
_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods below read the input buffer from ``self.rawdata`` and call
    ``self.handle_decl()`` and ``self.handle_comment()``; none of the three
    is defined in this class, so a subclass has to supply them.

    All ``parse_*`` / ``_parse_*`` / ``_scan_name`` methods use -1 as the
    "could not complete with the data currently in the buffer" result.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this.

        The base implementation always raises NotImplementedError.  The
        scanners in this class return -1 after calling error() wherever
        carrying on would otherwise crash or never terminate, so an
        override that returns instead of raising is tolerated.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position tracking to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." construct starting at rawdata[i].

        Returns the index just past the construct, or -1 if it cannot be
        completed with the data currently in the buffer.  Comments and
        marked sections are delegated to parse_comment() and
        parse_marked_section().  For other declarations the text between
        "<!" and ">" is passed to handle_decl() when the declaration name
        is "doctype" and to unknown_decl() otherwise.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] == "--":
            # Comment.  This test has to come before the single "-" test
            # below, which would otherwise swallow every comment and make
            # this branch unreachable.
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        if rawdata[j] == "[":
            # Marked section.
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)

        # All other declaration elements.
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                _, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # j has not advanced; stop rather than spin forever if
                    # error() returned.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                # Same as above: no progress is possible past this char.
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section "<![..." starting at rawdata[i].

        Returns the index just past the closing "]]>" (standard status
        keywords) or "]>" (MS Office "if"/"else"/"endif"), or -1 if the
        section is not terminated in the buffer.  When *report* is true the
        text between "<![" and the terminator is passed to unknown_decl().
        An unrecognised status keyword is reported through error(); if
        error() returns, the result is -1.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in {"if", "else", "endif"}:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # Keep ``match`` bound so a returning error() yields -1 below
            # instead of an UnboundLocalError.
            match = None
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment "<!--...-->" starting at rawdata[i].

        Returns the index just past the closing "-->", or -1 if the comment
        is not terminated in the buffer.  When *report* is true the comment
        body is passed to handle_comment().
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset beginning at rawdata[i].

        *declstartpos* is the index of the enclosing declaration; it is
        used for position updates and error messages.  Returns the index of
        the ">" that follows the closing "]", or -1 if the subset cannot be
        completed with the data currently in the buffer.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 4) > n:
                    # end of buffer; incomplete (this also covers the case
                    # where "<!" is the very end of the buffer)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name, None)
                if meth is None:
                    # Only reachable when error() returned for an unknown
                    # declaration name that has no handler method.
                    return -1
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # j has not advanced; stop rather than spin forever if
                # error() returned.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next ">", or -1 if the name or the
        ">" is not in the buffer yet.
        """
        _, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if the
        declaration cannot be completed with the data in the buffer.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            _, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 >= n:
                    # end of buffer
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if the
        declaration cannot be completed with the data in the buffer.
        """
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading "%" and the whitespace after it are skipped first.
        Returns the index just past the closing ">", or -1 if the
        declaration cannot be completed with the data in the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        _, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token at rawdata[i].

        Returns ``(name, end)`` where *name* is the lower-cased token and
        *end* is the index just past the token and any whitespace after it.
        Returns ``(None, -1)`` when the token runs up to the end of the
        buffer (it might continue in later data).  When no name starts at
        *i*, error() is called; if it returns, the result is ``(None, -1)``.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        self.updatepos(declstartpos, i)
        self.error("expected name token at %r"
                   % rawdata[declstartpos:declstartpos+20])
        # Callers unpack a 2-tuple, so never fall off the end with None.
        return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration this class does not interpret; no-op here."""
        pass

# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/_markupbase.py: 7% (272 statements)