Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/tinycss2/tokenizer.py: 5%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

298 statements  

1import re 

2import sys 

3 

4from webencodings import ascii_lower 

5 

6from .ast import ( # isort: skip 

7 AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock, 

8 HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, ParseError, 

9 PercentageToken, SquareBracketsBlock, StringToken, UnicodeRangeToken, URLToken, 

10 WhitespaceToken) 

11from .serializer import serialize_string_value, serialize_url 

12 

# Matches a CSS numeric literal: optional sign, optional "digits and a dot"
# prefix (group 1), mandatory digits, optional exponent (group 2).
# Both groups being empty is what the tokenizer uses to recognise an integer.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')

# Matches the hexadecimal part of an escape sequence (after the backslash):
# one to six hex digits (group 1), then at most one whitespace character that
# is consumed together with the escape.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')

15 

16 

def parse_component_value_list(css, skip_comments=False):
    """Parse a list of component values.

    The input is first normalized: NUL characters become U+FFFD and every
    ``\\r\\n``, ``\\r`` and ``\\f`` becomes ``\\n``. Token positions (line and
    column, both starting at 1) refer to the normalized text.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        The return values (and recursively its blocks and functions)
        will not contain any :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    css = (css.replace('\0', '\uFFFD')
           # This turns out to be faster than a regexp:
           .replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n'))
    length = len(css)
    token_start_pos = pos = 0  # Character index in the css source.
    line = 1  # First line is line 1.
    last_newline = -1
    # ``tokens`` is the list currently being filled; it starts out as the
    # root list and is swapped for a block's content list on '{', '[', '('.
    root = tokens = []
    end_char = None  # Pop the stack when encountering this character.
    stack = []  # Stack of nested blocks: (tokens, end_char) tuples.

    while pos < length:
        # Update the line counter with the newlines crossed by the
        # previous token (between its start and the current position).
        newline = css.rfind('\n', token_start_pos, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start_pos, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start_pos = pos
        c = css[pos]

        if c in ' \n\t':
            pos += 1
            while css.startswith((' ', '\n', '\t'), pos):
                pos += 1
            value = css[token_start_pos:pos]
            tokens.append(WhitespaceToken(line, column, value))
            continue
        elif (c in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
                css[pos + 2] in '0123456789abcdefABCDEF?'):
            start, end, pos = _consume_unicode_range(css, pos + 2)
            tokens.append(UnicodeRangeToken(line, column, start, end))
            continue
        elif css.startswith('-->', pos):  # Check before identifiers
            tokens.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue
        elif _is_ident_start(css, pos):
            value, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                tokens.append(IdentToken(line, column, value))
                continue
            pos += 1  # Skip the '('
            if ascii_lower(value) == 'url':
                # Look past whitespace: "url(" followed by a quote is an
                # ordinary function whose argument is a string token.
                url_pos = pos
                while css.startswith((' ', '\n', '\t'), url_pos):
                    url_pos += 1
                if url_pos >= length or css[url_pos] not in ('"', "'"):
                    value, pos, error = _consume_url(css, pos)
                    if value is not None:
                        repr_ = 'url({})'.format(serialize_url(value))
                        if error is not None:
                            # The source ended early: drop the closing
                            # characters that were never actually written.
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                repr_ = repr_[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                repr_ = repr_[:-1]
                        tokens.append(URLToken(line, column, value, repr_))
                    if error is not None:
                        tokens.append(ParseError(line, column, *error))
                    continue
            arguments = []
            tokens.append(FunctionBlock(line, column, value, arguments))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = arguments
            continue

        match = _NUMBER_RE.match(css, pos)
        if match:
            pos = match.end()
            repr_ = css[token_start_pos:pos]
            value = float(repr_)
            if any(match.groups()):
                # A fractional part or an exponent: not an integer.
                int_value = None
            else:
                try:
                    int_value = int(repr_)
                except ValueError:
                    # The digit run is longer than the interpreter's
                    # int-from-string limit. Tokenizing must not raise on
                    # any input, so give up on the exact integer value.
                    int_value = None
            if pos < length and _is_ident_start(css, pos):
                unit, pos = _consume_ident(css, pos)
                tokens.append(DimensionToken(
                    line, column, value, int_value, repr_, unit))
            elif css.startswith('%', pos):
                pos += 1
                tokens.append(
                    PercentageToken(line, column, value, int_value, repr_))
            else:
                tokens.append(
                    NumberToken(line, column, value, int_value, repr_))
        elif c == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                value, pos = _consume_ident(css, pos)
                tokens.append(AtKeywordToken(line, column, value))
            else:
                tokens.append(LiteralToken(line, column, '@'))
        elif c == '#':
            pos += 1
            if pos < length and (
                    css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                                '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                    ord(css[pos]) > 0x7F or  # Non-ASCII
                    # Valid escape:
                    (css[pos] == '\\' and not css.startswith('\\\n', pos))):
                # Must be checked before consuming the name.
                is_identifier = _is_ident_start(css, pos)
                value, pos = _consume_ident(css, pos)
                tokens.append(HashToken(line, column, value, is_identifier))
            else:
                tokens.append(LiteralToken(line, column, '#'))
        elif c == '{':
            content = []
            tokens.append(CurlyBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = '}'
            tokens = content
            pos += 1
        elif c == '[':
            content = []
            tokens.append(SquareBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ']'
            tokens = content
            pos += 1
        elif c == '(':
            content = []
            tokens.append(ParenthesesBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = content
            pos += 1
        elif c == end_char:  # Matching }, ] or )
            # The top-level end_char is None (never equal to a character),
            # so we never get here if the stack is empty.
            tokens, end_char = stack.pop()
            pos += 1
        elif c in '}])':
            tokens.append(ParseError(line, column, c, 'Unmatched ' + c))
            pos += 1
        elif c in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            if value is not None:
                repr_ = '"{}"'.format(serialize_string_value(value))
                if error is not None:
                    # EOF inside the string: there was no closing quote.
                    repr_ = repr_[:-1]
                tokens.append(StringToken(line, column, value, repr_))
            if error is not None:
                tokens.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            pos = css.find('*/', pos + 2)
            if pos == -1:
                # Unterminated comment: it runs to the end of the input.
                if not skip_comments:
                    tokens.append(
                        Comment(line, column, css[token_start_pos + 2:]))
                break
            if not skip_comments:
                tokens.append(
                    Comment(line, column, css[token_start_pos + 2:pos]))
            pos += 2
        elif css.startswith('<!--', pos):
            tokens.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            tokens.append(LiteralToken(line, column, '||'))
            pos += 2
        elif c in '~|^$*':
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                tokens.append(LiteralToken(line, column, c + '='))
            else:
                tokens.append(LiteralToken(line, column, c))
        else:
            tokens.append(LiteralToken(line, column, c))
            pos += 1
    return root

198 

199 

200def _is_name_start(css, pos): 

201 """Return true if the given character is a name-start code point.""" 

202 # https://www.w3.org/TR/css-syntax-3/#name-start-code-point 

203 c = css[pos] 

204 return ( 

205 c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' or 

206 ord(c) > 0x7F) 

207 

208 

def _is_ident_start(css, pos):
    """Tell whether a CSS identifier starts at ``pos``.

    An identifier starts with a name-start code point, with a valid escape,
    or with a hyphen that is itself followed by a name-start code point,
    another hyphen or a valid escape.
    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True

    first = css[pos]
    if first == '\\':
        # A backslash starts an escape unless a newline follows it.
        return not css.startswith('\\\n', pos)
    if first != '-':
        return False

    # Leading hyphen: the decision depends on what comes right after it.
    after_hyphen = pos + 1
    if after_hyphen < len(css) and (
            css[after_hyphen] == '-' or _is_name_start(css, after_hyphen)):
        return True
    return (css.startswith('\\', after_hyphen) and
            not css.startswith('\\\n', after_hyphen))

224 

225 

def _consume_ident(css, pos):
    """Read a name starting at ``pos``; return ``(unescaped_value, new_pos)``.

    The caller is expected to have checked that an identifier really starts
    at ``pos``. See :func:`_is_ident_start`.

    """
    # http://dev.w3.org/csswg/css-syntax/#consume-a-name
    name_chars = ('abcdefghijklmnopqrstuvwxyz-_0123456789'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    end = len(css)
    pieces = []
    run_start = pos  # Start of the current run of unescaped characters.
    while pos < end:
        char = css[pos]
        if char == '\\':
            if css.startswith('\\\n', pos):
                # Backslash-newline is not a valid escape: the name ends.
                break
            # Flush the literal run, then append the unescaped character.
            pieces.append(css[run_start:pos])
            unescaped, pos = _consume_escape(css, pos + 1)
            pieces.append(unescaped)
            run_start = pos
        elif char in name_chars or ord(char) > 0x7F:
            pos += 1
        else:
            break
    pieces.append(css[run_start:pos])
    return ''.join(pieces), pos

251 

252 

def _consume_quoted_string(css, pos):
    """Read a quoted string; return ``(unescaped_value, new_pos, error)``.

    ``pos`` must point at the opening quote. ``error`` is ``None`` on
    success, otherwise a ``(kind, message)`` tuple. On an unescaped newline
    the value is ``None`` and ``new_pos`` points at that newline. When the
    input ends before the closing quote, the text read so far is returned
    together with an ``'eof-in-string'`` error.
    """
    # https://drafts.csswg.org/css-syntax/#consume-a-string-token
    delimiter = css[pos]
    assert delimiter in ('"', "'")
    pos += 1
    end = len(css)
    pieces = []
    run_start = pos  # Start of the current run of unescaped characters.
    while True:
        if pos >= end:
            pieces.append(css[run_start:pos])
            return ''.join(pieces), pos, ('eof-in-string', 'EOF in string')
        char = css[pos]
        if char == '\n':  # Unescaped newline
            return None, pos, ('bad-string', 'Bad string token')
        if char == delimiter:
            pieces.append(css[run_start:pos])
            return ''.join(pieces), pos + 1, None
        if char != '\\':
            pos += 1
            continue
        # Backslash: flush the literal run, then handle what follows.
        pieces.append(css[run_start:pos])
        pos += 1
        if pos < end:
            if css[pos] == '\n':  # Ignore escaped newlines
                pos += 1
            else:
                unescaped, pos = _consume_escape(css, pos)
                pieces.append(unescaped)
        # A backslash right before EOF contributes nothing.
        run_start = pos

288 

289 

def _consume_escape(css, pos):
    r"""Consume an escaped code point and return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (one to six hex digits, plus one optional trailing
    whitespace character) yields the matching character. Zero, surrogate
    code points and values above the Unicode maximum yield U+FFFD instead.
    Any other character stands for itself. At the end of the input, U+FFFD
    is returned and pos is left unchanged.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        if (codepoint == 0 or
                # Surrogates are not characters: chr() would build a lone
                # surrogate that cannot be encoded (e.g. to UTF-8) later on.
                0xD800 <= codepoint <= 0xDFFF or
                codepoint > sys.maxunicode):
            char = '\uFFFD'
        else:
            char = chr(codepoint)
        return char, hex_match.end()
    elif pos < len(css):
        return css[pos], pos + 1
    else:
        # Escaped EOF.
        return '\uFFFD', pos

307 

308 

def _consume_url(css, pos):
    """Consume the content of ``url(...)``; return (value, new_pos, error).

    The given pos is assumed to be just after the '(' of 'url('.

    ``value`` is the unescaped URL, or ``None`` for a bad URL. ``error`` is
    ``None`` on success, otherwise a ``(kind, message)`` tuple whose kind is
    ``'eof-in-url'``, ``'bad-url'`` or an error kind reported by
    :func:`_consume_quoted_string`. ``new_pos`` is just after the closing
    ')' when there is one, otherwise where consumption stopped.

    """
    error = None
    length = len(css)
    # https://drafts.csswg.org/css-syntax/#consume-a-url-token
    # Skip whitespace
    while css.startswith((' ', '\n', '\t'), pos):
        pos += 1
    if pos >= length:  # EOF
        return '', pos, ('eof-in-url', 'EOF in URL')
    c = css[pos]
    if c in ('"', "'"):
        value, pos, error = _consume_quoted_string(css, pos)
    elif c == ')':
        return '', pos + 1, error
    else:
        chunks = []
        start_pos = pos  # Start of the current run of unescaped characters.
        while 1:
            if pos >= length:  # EOF
                chunks.append(css[start_pos:pos])
                return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
            c = css[pos]
            if c == ')':
                chunks.append(css[start_pos:pos])
                pos += 1
                return ''.join(chunks), pos, error
            elif c in ' \n\t':
                # Whitespace ends the value; only ')' may follow (see below).
                chunks.append(css[start_pos:pos])
                value = ''.join(chunks)
                pos += 1
                break
            elif c == '\\' and not css.startswith('\\\n', pos):
                # Valid escape
                chunks.append(css[start_pos:pos])
                c, pos = _consume_escape(css, pos + 1)
                chunks.append(c)
                start_pos = pos
            elif (c in
                  '"\'('
                  # https://drafts.csswg.org/css-syntax/#non-printable-character
                  '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
                  '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
                  '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
                value = None  # Parse error
                pos += 1
                break
            else:
                pos += 1

    if value is not None:
        while css.startswith((' ', '\n', '\t'), pos):
            pos += 1
        if pos < length:
            if css[pos] == ')':
                return value, pos + 1, error
            # Anything else after the value makes this a bad URL: fall
            # through to the remnants loop below.
        else:
            if error is None:
                error = ('eof-in-url', 'EOF in URL')
            return value, pos, error

    # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
    while pos < length:
        c = css[pos]
        if c == '\\' and pos + 1 < length and css[pos + 1] != '\n':
            # Any valid escape, not only an escaped ')': skip the backslash
            # together with the character it escapes. Otherwise the second
            # backslash of '\\' would be taken as escaping a following ')'.
            pos += 2
        elif c == ')':
            pos += 1
            break
        else:
            pos += 1
    return None, pos, ('bad-url', 'bad URL token')

384 

385 

def _consume_unicode_range(css, pos):
    """Read a unicode-range; return ``(start, end, new_pos)``.

    The given pos is assumed to be just after the '+' of 'U+' or 'u+'.
    ``start`` and ``end`` are the integer bounds of the range.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
    hex_digits = '0123456789abcdefABCDEF'
    length = len(css)

    # Hex digits and '?' wildcards share one budget of six characters.
    limit = min(pos + 6, length)
    digits_from = pos
    while pos < limit and css[pos] in hex_digits:
        pos += 1
    start = css[digits_from:pos]

    wildcards_from = pos
    while pos < limit and css[pos] == '?':
        pos += 1
    wildcards = pos - wildcards_from

    if wildcards:
        # Each '?' spans 0..F for its digit.
        end = start + 'F' * wildcards
        start = start + '0' * wildcards
        return int(start, 16), int(end, 16), pos

    explicit_end = (
        pos + 1 < length and css[pos] == '-' and css[pos + 1] in hex_digits)
    if not explicit_end:
        # A single code point: the range starts and ends on it.
        return int(start, 16), int(start, 16), pos

    pos += 1  # Skip the '-'
    digits_from = pos
    limit = min(pos + 6, length)
    while pos < limit and css[pos] in hex_digits:
        pos += 1
    end = css[digits_from:pos]
    return int(start, 16), int(end, 16), pos