Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/tinycss2/tokenizer.py: 99%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

302 statements  

1import re 

2import sys 

3 

4from webencodings import ascii_lower 

5 

6from .ast import ( # isort: skip 

7 AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock, 

8 HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, ParseError, 

9 PercentageToken, SquareBracketsBlock, StringToken, UnicodeRangeToken, URLToken, 

10 WhitespaceToken) 

11from .serializer import serialize_string_value, serialize_url 

12 

# CSS numeric literal: optional sign, optional "digits then dot" prefix,
# at least one digit, and an optional exponent. Neither capture group
# participates in the match when the literal is a plain integer.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')

# Body of a hexadecimal escape (what follows the backslash): one to six hex
# digits, captured, plus at most one trailing whitespace character.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')

15 

16 

def parse_component_value_list(css, skip_comments=False):
    """Parse a list of component values.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        The return values (and recursively its blocks and functions)
        will not contain any :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    # Preprocessing: NULL becomes U+FFFD, every newline flavor becomes '\n'.
    css = (css.replace('\0', '\uFFFD')
           # This turns out to be faster than a regexp:
           .replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n'))
    length = len(css)
    token_start_pos = pos = 0  # Character index in the css source.
    line = 1  # First line is line 1.
    last_newline = -1
    # ``tokens`` is the list currently being filled: the root list, or the
    # content of the innermost open block or function.
    root = tokens = []
    end_char = None  # Pop the stack when encountering this character.
    stack = []  # Stack of nested blocks: (tokens, end_char) tuples.

    while pos < length:
        # Update the line counter with the newlines of the previous token.
        newline = css.rfind('\n', token_start_pos, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start_pos, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start_pos = pos
        c = css[pos]

        if c in ' \n\t':
            pos += 1
            while css.startswith((' ', '\n', '\t'), pos):
                pos += 1
            value = css[token_start_pos:pos]
            tokens.append(WhitespaceToken(line, column, value))
            continue
        elif (c in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
              css[pos + 2] in '0123456789abcdefABCDEF?'):
            start, end, pos = _consume_unicode_range(css, pos + 2)
            tokens.append(UnicodeRangeToken(line, column, start, end))
            continue
        elif css.startswith('-->', pos):  # Check before identifiers
            tokens.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue
        elif _is_ident_start(css, pos):
            value, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                tokens.append(IdentToken(line, column, value))
                continue
            pos += 1  # Skip the '('
            try:
                is_url = ascii_lower(value) == 'url'
            except UnicodeEncodeError:
                is_url = False
            if is_url:
                # Look past leading whitespace: a quoted argument makes this
                # an ordinary function block, handled below.
                url_pos = pos
                while css.startswith((' ', '\n', '\t'), url_pos):
                    url_pos += 1
                if url_pos >= length or css[url_pos] not in ('"', "'"):
                    value, pos, error = _consume_url(css, pos)
                    if value is not None:
                        repr_ = f'url({serialize_url(value)})'
                        if error is not None:
                            # Drop from the representation the closing
                            # characters that are missing from the source.
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                repr_ = repr_[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                repr_ = repr_[:-1]
                        tokens.append(URLToken(line, column, value, repr_))
                    if error is not None:
                        tokens.append(ParseError(line, column, *error))
                    continue
            arguments = []
            tokens.append(FunctionBlock(line, column, value, arguments))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = arguments
            continue

        match = _NUMBER_RE.match(css, pos)
        if match:
            pos = match.end()
            repr_ = css[token_start_pos:pos]
            value = float(repr_)
            if any(match.groups()):
                # A fractional part or an exponent: not an integer.
                int_value = None
            else:
                try:
                    int_value = int(repr_)
                except ValueError:
                    # ``repr_`` only holds a sign and digits, so the only
                    # possible failure is the interpreter's limit on the
                    # length of integer strings (see
                    # ``sys.set_int_max_str_digits``). Do not let an
                    # oversized literal crash the tokenizer: report the
                    # number without an integer value instead.
                    int_value = None
            if pos < length and _is_ident_start(css, pos):
                unit, pos = _consume_ident(css, pos)
                tokens.append(DimensionToken(
                    line, column, value, int_value, repr_, unit))
            elif css.startswith('%', pos):
                pos += 1
                tokens.append(
                    PercentageToken(line, column, value, int_value, repr_))
            else:
                tokens.append(
                    NumberToken(line, column, value, int_value, repr_))
        elif c == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                value, pos = _consume_ident(css, pos)
                tokens.append(AtKeywordToken(line, column, value))
            else:
                tokens.append(LiteralToken(line, column, '@'))
        elif c == '#':
            pos += 1
            if pos < length and (
                    css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                                '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                    ord(css[pos]) > 0x7F or  # Non-ASCII
                    # Valid escape:
                    (css[pos] == '\\' and not css.startswith('\\\n', pos))):
                is_identifier = _is_ident_start(css, pos)
                value, pos = _consume_ident(css, pos)
                tokens.append(HashToken(line, column, value, is_identifier))
            else:
                tokens.append(LiteralToken(line, column, '#'))
        elif c == '{':
            content = []
            tokens.append(CurlyBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = '}'
            tokens = content
            pos += 1
        elif c == '[':
            content = []
            tokens.append(SquareBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ']'
            tokens = content
            pos += 1
        elif c == '(':
            content = []
            tokens.append(ParenthesesBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = content
            pos += 1
        elif c == end_char:  # Matching }, ] or )
            # The top-level end_char is None (never equal to a character),
            # so we never get here if the stack is empty.
            tokens, end_char = stack.pop()
            pos += 1
        elif c in '}])':
            tokens.append(ParseError(line, column, c, 'Unmatched ' + c))
            pos += 1
        elif c in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            if value is not None:
                repr_ = f'"{serialize_string_value(value)}"'
                if error is not None:
                    # Unterminated string: no closing quote in the source.
                    repr_ = repr_[:-1]
                tokens.append(StringToken(line, column, value, repr_))
            if error is not None:
                tokens.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            pos = css.find('*/', pos + 2)
            if pos == -1:
                # Unterminated comment: it runs to the end of the input.
                if not skip_comments:
                    tokens.append(
                        Comment(line, column, css[token_start_pos + 2:]))
                break
            if not skip_comments:
                tokens.append(
                    Comment(line, column, css[token_start_pos + 2:pos]))
            pos += 2
        elif css.startswith('<!--', pos):
            tokens.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            tokens.append(LiteralToken(line, column, '||'))
            pos += 2
        elif c in '~|^$*':
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                tokens.append(LiteralToken(line, column, c + '='))
            else:
                tokens.append(LiteralToken(line, column, c))
        else:
            # Any other character is a single-character literal.
            tokens.append(LiteralToken(line, column, c))
            pos += 1
    return root

202 

203 

def _is_name_start(css, pos):
    """Tell whether ``css[pos]`` is a name-start code point.

    Name-start code points are the ASCII letters, the underscore, and every
    character above U+007F.

    """
    # https://www.w3.org/TR/css-syntax-3/#name-start-code-point
    char = css[pos]
    if ord(char) > 0x7F:
        return True
    return char == '_' or 'a' <= char <= 'z' or 'A' <= char <= 'Z'

211 

212 

def _is_ident_start(css, pos):
    """Tell whether a CSS identifier starts at index ``pos`` of ``css``.

    An identifier starts with a name-start code point, with a valid escape,
    or with a hyphen followed by a name-start code point, another hyphen or
    a valid escape.

    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True
    first = css[pos]
    if first == '\\':
        # A backslash is a valid escape unless a newline follows it.
        return not css.startswith('\\\n', pos)
    if first != '-':
        return False
    after = pos + 1
    if after >= len(css):
        # A lone hyphen at the end of the input.
        return False
    second = css[after]
    if second == '-' or _is_name_start(css, after):
        return True
    return second == '\\' and not css.startswith('\\\n', after)

228 

229 

def _consume_ident(css, pos):
    """Read an identifier and return ``(unescaped_value, new_pos)``.

    The caller must make sure that an identifier starts at ``pos``, see
    :func:`_is_ident_start`. ``new_pos`` is the index of the first character
    that is not part of the identifier.

    """
    # http://dev.w3.org/csswg/css-syntax/#consume-a-name
    end = len(css)
    pieces = []
    run_start = pos  # Start of the current run of unescaped characters.
    while pos < end:
        char = css[pos]
        if char == '\\':
            if css.startswith('\\\n', pos):
                # Backslash-newline is not a valid escape: the name stops.
                break
            # Flush the literal run, then add the unescaped character.
            pieces.append(css[run_start:pos])
            unescaped, pos = _consume_escape(css, pos + 1)
            pieces.append(unescaped)
            run_start = pos
        elif ord(char) > 0x7F or char.isalnum() or char in '-_':
            pos += 1
        else:
            break
    pieces.append(css[run_start:pos])
    return ''.join(pieces), pos

255 

256 

def _consume_quoted_string(css, pos):
    """Read a quoted string and return ``(unescaped_value, new_pos, error)``.

    ``css[pos]`` must be the opening quote. ``error`` is ``None`` when the
    string is properly closed, otherwise a ``(kind, message)`` tuple:

    * ``'eof-in-string'`` when the input ends before the closing quote; the
      value read so far is still returned.
    * ``'bad-string'`` when an unescaped newline is found; the value is then
      ``None`` and ``new_pos`` is the index of that newline.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-string-token
    delimiter = css[pos]
    assert delimiter in ('"', "'")
    pos += 1
    end = len(css)
    pieces = []
    run_start = pos  # Start of the current run of unescaped characters.
    while True:
        if pos >= end:
            pieces.append(css[run_start:pos])
            return ''.join(pieces), pos, ('eof-in-string', 'EOF in string')
        char = css[pos]
        if char == delimiter:
            pieces.append(css[run_start:pos])
            return ''.join(pieces), pos + 1, None
        if char == '\n':  # Unescaped newline
            return None, pos, ('bad-string', 'Bad string token')
        if char != '\\':
            pos += 1
            continue
        # Backslash: flush the literal run before handling the escape.
        pieces.append(css[run_start:pos])
        pos += 1
        if pos < end:
            if css[pos] == '\n':
                # An escaped newline adds nothing to the value.
                pos += 1
            else:
                unescaped, pos = _consume_escape(css, pos)
                pieces.append(unescaped)
        # A backslash right before the end of the input is dropped.
        run_start = pos

292 

293 

def _consume_escape(css, pos):
    r"""Return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (one to six hex digits, optionally followed by one
    whitespace character) yields the character with that code point, or
    U+FFFD when the code point is zero, is a surrogate, or is above the
    maximum code point. Any other character is returned as is. An escape at
    the very end of the input yields U+FFFD.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        if (codepoint == 0 or codepoint > sys.maxunicode or
                # Surrogates are not valid on their own: a lone surrogate
                # in the output could not even be encoded to UTF-8.
                0xD800 <= codepoint <= 0xDFFF):
            return '\uFFFD', hex_match.end()
        return chr(codepoint), hex_match.end()
    elif pos < len(css):
        return css[pos], pos + 1
    else:
        return '\uFFFD', pos

311 

312 

def _consume_url(css, pos):
    """Return (unescaped_url, new_pos, error).

    The given pos is assumed to be just after the '(' of 'url('.

    ``unescaped_url`` is ``None`` for a bad URL. ``error`` is ``None`` for a
    properly closed URL, otherwise a ``(kind, message)`` tuple whose kind is
    ``'eof-in-url'``, ``'eof-in-string'`` or ``'bad-url'``.

    """
    error = None
    length = len(css)
    # https://drafts.csswg.org/css-syntax/#consume-a-url-token
    # Skip whitespace
    while css.startswith((' ', '\n', '\t'), pos):
        pos += 1
    if pos >= length:  # EOF
        return '', pos, ('eof-in-url', 'EOF in URL')
    c = css[pos]
    if c in ('"', "'"):
        value, pos, error = _consume_quoted_string(css, pos)
    elif c == ')':
        return '', pos + 1, error
    else:
        chunks = []
        start_pos = pos
        while 1:
            if pos >= length:  # EOF
                chunks.append(css[start_pos:pos])
                return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
            c = css[pos]
            if c == ')':
                chunks.append(css[start_pos:pos])
                pos += 1
                return ''.join(chunks), pos, error
            elif c in ' \n\t':
                # Only whitespace and ')' may follow: checked below.
                chunks.append(css[start_pos:pos])
                value = ''.join(chunks)
                pos += 1
                break
            elif c == '\\':
                if css.startswith('\\\n', pos):
                    # A backslash before a newline is not a valid escape,
                    # which makes the whole token a bad URL.
                    value = None
                    pos += 1
                    break
                # Valid escape
                chunks.append(css[start_pos:pos])
                c, pos = _consume_escape(css, pos + 1)
                chunks.append(c)
                start_pos = pos
            elif (c in
                  '"\'('
                  # https://drafts.csswg.org/css-syntax/#non-printable-character
                  '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
                  '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
                  '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
                value = None  # Parse error
                pos += 1
                break
            else:
                pos += 1

    if value is not None:
        while css.startswith((' ', '\n', '\t'), pos):
            pos += 1
        if pos < length:
            if css[pos] == ')':
                return value, pos + 1, error
            # Anything else after the value makes this a bad URL: fall
            # through to the clean-up loop below.
        else:
            if error is None:
                error = ('eof-in-url', 'EOF in URL')
            return value, pos, error

    # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
    while pos < length:
        c = css[pos]
        if c == ')':
            pos += 1
            break
        elif c == '\\':
            # Skip the backslash together with the character it escapes, so
            # that an escaped ')' or '\' cannot end or extend the token.
            # Skipping only one character of a longer hexadecimal escape is
            # harmless: the rest is neither ')' nor '\'.
            pos = min(pos + 2, length)
        else:
            pos += 1
    return None, pos, ('bad-url', 'bad URL token')

388 

389 

def _consume_unicode_range(css, pos):
    """Read a unicode range and return ``(start, end, new_pos)``.

    The given pos is assumed to be just after the '+' of 'U+' or 'u+'.
    ``start`` and ``end`` are the integer bounds of the range; they are
    equal when the source gives a single code point.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
    hex_digits = '0123456789abcdefABCDEF'
    total = len(css)

    def scan(index, stop, accepted):
        """Return the first index in [index, stop) not in ``accepted``."""
        while index < stop and css[index] in accepted:
            index += 1
        return index

    # Hex digits and question marks share one budget of six characters.
    limit = min(pos + 6, total)
    digits_end = scan(pos, limit, hex_digits)
    low = css[pos:digits_end]
    pos = scan(digits_end, limit, '?')
    wildcards = pos - digits_end

    if wildcards:
        # Each '?' stands for any hex digit: 0 at the low end, F at the high.
        high = low + 'F' * wildcards
        low += '0' * wildcards
    elif (pos + 1 < total and css[pos] == '-' and
            css[pos + 1] in hex_digits):
        # Explicit upper bound, with its own budget of six hex digits.
        bound_start = pos + 1
        pos = scan(bound_start, min(bound_start + 6, total), hex_digits)
        high = css[bound_start:pos]
    else:
        high = low
    return int(low, 16), int(high, 16), pos