Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tinycss2/tokenizer.py: 5%

297 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1import re 

2import sys 

3 

4from webencodings import ascii_lower 

5 

6from .ast import ( 

7 AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock, 

8 HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, 

9 ParseError, PercentageToken, SquareBracketsBlock, StringToken, 

10 UnicodeRangeToken, URLToken, WhitespaceToken) 

11from .serializer import serialize_string_value, serialize_url 

12 

# Matches a CSS number at the current position: an optional sign, an optional
# "digits then dot" prefix (group 1), one or more digits, and an optional
# exponent (group 2).  Both groups are empty for a plain integer.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')
# Matches the body of a hexadecimal escape: one to six hex digits (group 1)
# followed by at most one space, newline or tab, which is part of the match.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')

15 

16 

def parse_component_value_list(css, skip_comments=False):
    """Parse a list of component values.

    The input is tokenized in a single pass.  ``{}``, ``[]``, ``()`` and
    function blocks are nested as they are encountered, using an explicit
    stack rather than recursion.  Problems found on the way (unmatched
    closing brackets, bad strings, bad URLs) are reported as
    :class:`~tinycss2.ast.ParseError` objects appended to the output at the
    place where they occur; nothing is raised for malformed CSS.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        The return values (and recursively its blocks and functions)
        will not contain any :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    # Preprocess the input: NUL becomes U+FFFD, and CRLF, CR and form feed
    # all become a single '\n', so the rest of the code only has to deal
    # with one kind of newline.
    css = (css.replace('\0', '\uFFFD')
           # This turns out to be faster than a regexp:
           .replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n'))
    length = len(css)
    token_start_pos = pos = 0  # Character index in the css source.
    line = 1  # First line is line 1.
    last_newline = -1
    # ``tokens`` is the list currently being filled; it starts out as the
    # top-level list and is swapped for a block's content list on nesting.
    root = tokens = []
    end_char = None  # Pop the stack when encountering this character.
    stack = []  # Stack of nested blocks: (tokens, end_char) tuples.

    while pos < length:
        # Update line/column by looking only at the text of the previous
        # token (token_start_pos..pos): find its last newline, if any, and
        # count the newlines before that one.
        newline = css.rfind('\n', token_start_pos, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start_pos, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start_pos = pos
        c = css[pos]

        # The branches of this first chain each end with ``continue``; if
        # none matches, control falls through to the number check below.
        if c in ' \n\t':
            # A run of whitespace becomes a single token.
            pos += 1
            while css.startswith((' ', '\n', '\t'), pos):
                pos += 1
            value = css[token_start_pos:pos]
            tokens.append(WhitespaceToken(line, column, value))
            continue
        elif (c in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
              css[pos + 2] in '0123456789abcdefABCDEF?'):
            # 'U+' followed by a hex digit or '?': unicode-range.
            start, end, pos = _consume_unicode_range(css, pos + 2)
            tokens.append(UnicodeRangeToken(line, column, start, end))
            continue
        elif css.startswith('-->', pos):  # Check before identifiers
            tokens.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue
        elif _is_ident_start(css, pos):
            value, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                tokens.append(IdentToken(line, column, value))
                continue
            pos += 1  # Skip the '('
            if ascii_lower(value) == 'url':
                # Look past whitespace without consuming it: 'url(' followed
                # by a quote is handled as an ordinary function block below,
                # anything else is an unquoted URL token.
                url_pos = pos
                while css.startswith((' ', '\n', '\t'), url_pos):
                    url_pos += 1
                if url_pos >= length or css[url_pos] not in ('"', "'"):
                    value, pos, error = _consume_url(css, pos)
                    if value is not None:
                        # NOTE(review): ``repr`` shadows the builtin of the
                        # same name inside this function (the number branch
                        # uses ``repr_`` instead).
                        repr = 'url({})'.format(serialize_url(value))
                        if error is not None:
                            # The input ended early, so trim the tail of
                            # the serialization -- presumably the closing
                            # quote and/or ')' that the source did not
                            # contain; confirm against serialize_url.
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                repr = repr[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                repr = repr[:-1]
                        tokens.append(URLToken(line, column, value, repr))
                    if error is not None:
                        tokens.append(ParseError(line, column, *error))
                    continue
            # Function block: subsequent tokens go into ``arguments`` until
            # the matching ')' pops the stack.
            arguments = []
            tokens.append(FunctionBlock(line, column, value, arguments))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = arguments
            continue

        match = _NUMBER_RE.match(css, pos)
        if match:
            pos = match.end()
            repr_ = css[token_start_pos:pos]
            value = float(repr_)
            # An integer value is only provided when the number has neither
            # a fractional part nor an exponent (both regex groups empty).
            int_value = int(repr_) if not any(match.groups()) else None
            if pos < length and _is_ident_start(css, pos):
                unit, pos = _consume_ident(css, pos)
                tokens.append(DimensionToken(
                    line, column, value, int_value, repr_, unit))
            elif css.startswith('%', pos):
                pos += 1
                tokens.append(PercentageToken(
                    line, column, value, int_value, repr_))
            else:
                tokens.append(NumberToken(
                    line, column, value, int_value, repr_))
        elif c == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                value, pos = _consume_ident(css, pos)
                tokens.append(AtKeywordToken(line, column, value))
            else:
                # A lone '@' is just a literal.
                tokens.append(LiteralToken(line, column, '@'))
        elif c == '#':
            pos += 1
            if pos < length and (
                    css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                                '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                    ord(css[pos]) > 0x7F or  # Non-ASCII
                    # Valid escape:
                    (css[pos] == '\\' and not css.startswith('\\\n', pos))):
                # Record whether the text after '#' would also be a valid
                # identifier (e.g. it does not start with a digit).
                is_identifier = _is_ident_start(css, pos)
                value, pos = _consume_ident(css, pos)
                tokens.append(HashToken(line, column, value, is_identifier))
            else:
                tokens.append(LiteralToken(line, column, '#'))
        elif c == '{':
            # Opening bracket: push the current list and expected closer,
            # then collect into the new block's content list.
            content = []
            tokens.append(CurlyBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = '}'
            tokens = content
            pos += 1
        elif c == '[':
            content = []
            tokens.append(SquareBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ']'
            tokens = content
            pos += 1
        elif c == '(':
            content = []
            tokens.append(ParenthesesBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = content
            pos += 1
        elif c == end_char:  # Matching }, ] or )
            # The top-level end_char is None (never equal to a character),
            # so we never get here if the stack is empty.
            tokens, end_char = stack.pop()
            pos += 1
        elif c in '}])':
            # A closing bracket that does not match the innermost open
            # block is reported and skipped; the block stays open.
            tokens.append(ParseError(line, column, c, 'Unmatched ' + c))
            pos += 1
        elif c in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            if value is not None:
                # NOTE(review): ``repr`` shadows the builtin here as well.
                repr = '"{}"'.format(serialize_string_value(value))
                if error is not None:
                    # Unterminated string: drop the closing quote that was
                    # just added by the format string.
                    repr = repr[:-1]
                tokens.append(StringToken(line, column, value, repr))
            if error is not None:
                tokens.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            pos = css.find('*/', pos + 2)
            if pos == -1:
                # Unterminated comment: it runs to the end of the input,
                # so tokenizing stops here.
                if not skip_comments:
                    tokens.append(
                        Comment(line, column, css[token_start_pos + 2:]))
                break
            if not skip_comments:
                tokens.append(
                    Comment(line, column, css[token_start_pos + 2:pos]))
            pos += 2
        elif css.startswith('<!--', pos):
            tokens.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            tokens.append(LiteralToken(line, column, '||'))
            pos += 2
        elif c in '~|^$*':
            # These characters form a two-character literal when directly
            # followed by '=' (e.g. '~='), and a one-character one otherwise.
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                tokens.append(LiteralToken(line, column, c + '='))
            else:
                tokens.append(LiteralToken(line, column, c))
        else:
            # Any other single character is a literal.
            tokens.append(LiteralToken(line, column, c))
            pos += 1
    # Blocks still open at the end of the input are returned as they are;
    # no ParseError is added for them here.
    return root

202 

203 

def _is_name_start(css, pos):
    """Tell whether ``css[pos]`` is a name-start code point.

    That is: an ASCII letter, an underscore, or any non-ASCII character.

    """
    # https://www.w3.org/TR/css-syntax-3/#name-start-code-point
    char = css[pos]
    if ord(char) > 0x7F:  # Non-ASCII
        return True
    return char == '_' or 'a' <= char <= 'z' or 'A' <= char <= 'Z'

211 

212 

def _is_ident_start(css, pos):
    """Tell whether a CSS identifier starts at index ``pos`` of ``css``.

    An identifier starts with a name-start code point, with a valid escape,
    or with a hyphen that is itself followed by a name-start code point,
    another hyphen or a valid escape.  A backslash followed by a newline is
    not a valid escape.

    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True
    first = css[pos]
    if first == '\\':
        return not css.startswith('\\\n', pos)
    if first != '-':
        return False
    # Leading hyphen: the decision depends on the character after it.
    after = pos + 1
    if after >= len(css):
        return False
    second = css[after]
    if second == '-' or _is_name_start(css, after):
        return True
    return second == '\\' and not css.startswith('\\\n', after)

229 

230 

def _consume_ident(css, pos):
    """Read an identifier and return ``(unescaped_value, new_pos)``.

    Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.
    Reading stops at the first character that is neither a name character
    (ASCII letter, digit, hyphen, underscore or non-ASCII) nor the start of
    a valid escape; ``new_pos`` is the index of that character.

    """
    # http://dev.w3.org/csswg/css-syntax/#consume-a-name
    end = len(css)
    pieces = []
    run_start = pos  # Start of the current run of unescaped characters.
    while pos < end:
        char = css[pos]
        if char == '\\':
            if css.startswith('\\\n', pos):
                # Backslash-newline is not a valid escape: the name ends.
                break
            # Flush the literal run, then add the unescaped character.
            pieces.append(css[run_start:pos])
            unescaped, pos = _consume_escape(css, pos + 1)
            pieces.append(unescaped)
            run_start = pos
        elif ord(char) > 0x7F or char in (
                'abcdefghijklmnopqrstuvwxyz-_0123456789'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            pos += 1
        else:
            break
    pieces.append(css[run_start:pos])
    return ''.join(pieces), pos

256 

257 

def _consume_quoted_string(css, pos):
    """Read a quoted string and return ``(unescaped_value, new_pos, error)``.

    ``css[pos]`` must be the opening quote (``"`` or ``'``).

    * On success, ``error`` is ``None`` and ``new_pos`` is just after the
      closing quote.
    * If the input ends before the closing quote, the text read so far is
      returned together with an ``('eof-in-string', ...)`` error.
    * If an unescaped newline is met, the value is ``None``, ``new_pos`` is
      the index of that newline and the error is ``('bad-string', ...)``.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-string-token
    quote = css[pos]
    assert quote in ('"', "'")
    pos += 1
    end = len(css)
    pieces = []
    run_start = pos  # Start of the current run of unescaped characters.
    while True:
        if pos >= end:
            # Ran out of input before the closing quote.
            pieces.append(css[run_start:pos])
            return ''.join(pieces), pos, ('eof-in-string', 'EOF in string')
        char = css[pos]
        if char == quote:
            pieces.append(css[run_start:pos])
            return ''.join(pieces), pos + 1, None
        if char == '\n':  # Unescaped newline
            return None, pos, ('bad-string', 'Bad string token')
        if char != '\\':
            pos += 1
            continue
        # Backslash: flush the literal run, then handle what follows.
        pieces.append(css[run_start:pos])
        pos += 1
        if pos < end:
            if css[pos] == '\n':
                # An escaped newline is a line continuation: drop it.
                pos += 1
            else:
                unescaped, pos = _consume_escape(css, pos)
                pieces.append(unescaped)
        # Otherwise the backslash is the last character: nothing to add.
        run_start = pos

293 

294 

def _consume_escape(css, pos):
    r"""Read an escape sequence and return ``(unescaped_char, new_pos)``.

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    :param css: The CSS source string.
    :param pos: Index of the first character after the backslash.
    :returns:
        A tuple of the character the escape stands for and the index just
        after the escape.  For a hexadecimal escape (one to six hex digits,
        optionally followed by one whitespace character that is consumed
        too) the character is the code point with that number, or U+FFFD
        when the number is zero, is a surrogate (U+D800 to U+DFFF) or is
        greater than the largest code point.  For any other character, the
        character itself is returned.  At the end of the input, U+FFFD is
        returned and ``pos`` is left unchanged.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        # Surrogates must be replaced as well: chr() would accept them, but
        # the resulting lone surrogate is not a valid character and cannot
        # be encoded (e.g. to UTF-8) later on.
        if (codepoint == 0 or 0xD800 <= codepoint <= 0xDFFF or
                codepoint > sys.maxunicode):
            return '\uFFFD', hex_match.end()
        return chr(codepoint), hex_match.end()
    if pos < len(css):
        return css[pos], pos + 1
    # Escaped end of input.
    return '\uFFFD', pos

312 

313 

def _consume_url(css, pos):
    """Read the contents of ``url(...)`` and return ``(value, new_pos, error)``.

    The given pos is assumed to be just after the '(' of 'url('.

    :param css: The CSS source string.
    :param pos: Index of the first character after ``url(``.
    :returns:
        A tuple ``(value, new_pos, error)``:

        * ``value`` is the unescaped URL, or ``None`` for a bad URL;
        * ``new_pos`` is the index where tokenizing should resume (after
          the closing ``)`` if there is one);
        * ``error`` is ``None`` or a ``(kind, message)`` tuple.  The kinds
          produced are ``'eof-in-url'``, ``'bad-url'`` and, when the URL is
          an unterminated quoted string, ``'eof-in-string'``.

    """
    error = None
    length = len(css)
    # https://drafts.csswg.org/css-syntax/#consume-a-url-token
    # Skip whitespace
    while css.startswith((' ', '\n', '\t'), pos):
        pos += 1
    if pos >= length:  # EOF
        return '', pos, ('eof-in-url', 'EOF in URL')
    c = css[pos]
    if c in ('"', "'"):
        value, pos, error = _consume_quoted_string(css, pos)
    elif c == ')':
        return '', pos + 1, error
    else:
        chunks = []
        start_pos = pos  # Start of the current run of unescaped characters.
        while True:
            if pos >= length:  # EOF
                chunks.append(css[start_pos:pos])
                return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
            c = css[pos]
            if c == ')':
                chunks.append(css[start_pos:pos])
                pos += 1
                return ''.join(chunks), pos, error
            elif c in ' \n\t':
                # Whitespace ends the value; only more whitespace and then
                # ')' or the end of the input may follow (checked below).
                chunks.append(css[start_pos:pos])
                value = ''.join(chunks)
                pos += 1
                break
            elif c == '\\':
                if css.startswith('\\\n', pos):
                    # A backslash followed by a newline is not a valid
                    # escape.  This is a parse error: the URL is bad.
                    value = None
                    pos += 1
                    break
                # Valid escape
                chunks.append(css[start_pos:pos])
                c, pos = _consume_escape(css, pos + 1)
                chunks.append(c)
                start_pos = pos
            elif (c in
                  '"\'('
                  # https://drafts.csswg.org/css-syntax/#non-printable-character
                  '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
                  '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
                  '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
                value = None  # Parse error
                pos += 1
                break
            else:
                pos += 1

    if value is not None:
        while css.startswith((' ', '\n', '\t'), pos):
            pos += 1
        if pos < length:
            if css[pos] == ')':
                return value, pos + 1, error
            # Anything else after the value makes the URL bad: fall
            # through to the clean-up loop below.
        else:
            if error is None:
                error = ('eof-in-url', 'EOF in URL')
            return value, pos, error

    # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
    # Skip to just after the next unescaped ')' (or to the end of the input).
    while pos < length:
        c = css[pos]
        if c == ')':
            pos += 1
            break
        if c == '\\' and not css.startswith('\\\n', pos):
            # Valid escape: skip the backslash together with the escaped
            # character, so that neither an escaped ')' nor an escaped
            # backslash followed by ')' is misread.  Further hex digits of
            # a longer escape are harmless and are skipped one by one.
            pos = min(pos + 2, length)
        else:
            pos += 1
    return None, pos, ('bad-url', 'bad URL token')

389 

390 

def _consume_unicode_range(css, pos):
    """Read a unicode-range and return ``(start, end, new_pos)``.

    The given pos is assumed to be just after the '+' of 'U+' or 'u+'.
    ``start`` and ``end`` are the integer bounds of the range.  Three forms
    are recognized: a single value (``26``), a value with trailing ``?``
    wildcards (``4??``, i.e. 400 to 4FF) and an explicit interval
    (``0-7F``).

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
    hex_digits = '0123456789abcdefABCDEF'
    total = len(css)

    def scan(index, limit, allowed):
        # Advance over characters of `allowed`, without going past `limit`.
        while index < limit and css[index] in allowed:
            index += 1
        return index

    # Hex digits and question marks share one budget of six characters.
    limit = min(pos + 6, total)
    digits_end = scan(pos, limit, hex_digits)
    low = css[pos:digits_end]
    pos = scan(digits_end, limit, '?')
    wildcards = pos - digits_end

    if wildcards:
        # Each '?' stands for any hex digit: 0 at the low end, F at the high.
        high = low + 'F' * wildcards
        low = low + '0' * wildcards
    elif (pos + 1 < total and css[pos] == '-' and
          css[pos + 1] in hex_digits):
        # Explicit interval: up to six more hex digits after the hyphen.
        high_start = pos + 1
        pos = scan(high_start, min(high_start + 6, total), hex_digits)
        high = css[high_start:pos]
    else:
        high = low
    return int(low, 16), int(high, 16), pos

423 end = start 

424 return int(start, 16), int(end, 16), pos