Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/tinycss2/tokenizer.py: 99%

1import re

2import sys

4from webencodings import ascii_lower

6from .ast import ( # isort: skip

7 AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock,

8 HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, ParseError,

9 PercentageToken, SquareBracketsBlock, StringToken, UnicodeRangeToken, URLToken,

10 WhitespaceToken)

11from .serializer import serialize_string_value, serialize_url

# Matches a CSS number at a given position: optional sign, optional
# "digits followed by a dot" prefix (group 1), mandatory digits, and an
# optional exponent (group 2).  The tokenizer treats a match in which neither
# group participated as an integer.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')

# Matches the hexadecimal part of an escape sequence: one to six hex digits
# (group 1) followed by at most one whitespace character, which is consumed
# as part of the escape.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')

def parse_component_value_list(css, skip_comments=False):
    """Tokenize a CSS string into a tree of component values.

    Blocks (``{}``, ``[]``, ``()``) and functions are returned as single
    nodes whose content is itself a list of component values.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        When true, neither the returned list nor any nested block or
        function contains a :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    # Input preprocessing: NUL becomes U+FFFD and every newline flavour is
    # normalized to '\n'.  '\r\n' must be handled before the lone '\r'.
    css = css.replace('\0', '\uFFFD')
    for newline_variant in ('\r\n', '\r', '\f'):
        css = css.replace(newline_variant, '\n')

    end = len(css)
    blanks = (' ', '\n', '\t')
    root = []
    current = root  # Token list of the innermost block still open.
    closer = None  # Character closing that block; None at the top level.
    parents = []  # (token list, closer) pairs of the enclosing blocks.
    line = 1  # Lines are numbered from 1.
    newline_pos = -1  # Index of the last newline seen before the token.
    token_start = pos = 0  # Character indexes in the css source.

    def descend(block, children, closing):
        """Append ``block`` and continue tokenizing into ``children``."""
        nonlocal current, closer
        current.append(block)
        parents.append((current, closer))
        current, closer = children, closing

    while pos < end:
        # Update the line counter with the newlines of the previous token.
        found = css.rfind('\n', token_start, pos)
        if found != -1:
            line += css.count('\n', token_start, found) + 1
            newline_pos = found
        column = pos - newline_pos  # Columns are numbered from 1.
        token_start = pos
        char = css[pos]

        if char in ' \n\t':
            pos += 1
            while css.startswith(blanks, pos):
                pos += 1
            current.append(
                WhitespaceToken(line, column, css[token_start:pos]))
            continue

        if (char in 'Uu' and pos + 2 < end and css[pos + 1] == '+' and
                css[pos + 2] in '0123456789abcdefABCDEF?'):
            low, high, pos = _consume_unicode_range(css, pos + 2)
            current.append(UnicodeRangeToken(line, column, low, high))
            continue

        # Must be tested before identifiers, which may also start with '--'.
        if css.startswith('-->', pos):
            current.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue

        if _is_ident_start(css, pos):
            name, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):
                # A plain identifier, not a function.
                current.append(IdentToken(line, column, name))
                continue
            pos += 1  # Step over the '('.
            try:
                is_url = ascii_lower(name) == 'url'
            except UnicodeEncodeError:
                is_url = False
            if is_url:
                # Look past leading whitespace: a quoted argument makes this
                # an ordinary function, anything else is an unquoted URL.
                probe = pos
                while css.startswith(blanks, probe):
                    probe += 1
                if probe >= end or css[probe] not in ('"', "'"):
                    url, pos, error = _consume_url(css, pos)
                    if url is not None:
                        text = f'url({serialize_url(url)})'
                        if error is not None:
                            # Drop the closing characters that the source
                            # did not actually contain.
                            kind = error[0]
                            if kind != 'eof-in-string':
                                assert kind == 'eof-in-url'
                                text = text[:-1]
                            else:
                                text = text[:-2]
                        current.append(URLToken(line, column, url, text))
                    if error is not None:
                        current.append(ParseError(line, column, *error))
                    continue
            arguments = []
            descend(
                FunctionBlock(line, column, name, arguments), arguments, ')')
            continue

        number = _NUMBER_RE.match(css, pos)
        if number:
            pos = number.end()
            text = css[token_start:pos]
            value = float(text)
            # Only a number without fraction and exponent is an integer.
            int_value = None if any(number.groups()) else int(text)
            if pos < end and _is_ident_start(css, pos):
                unit, pos = _consume_ident(css, pos)
                current.append(DimensionToken(
                    line, column, value, int_value, text, unit))
            elif css.startswith('%', pos):
                pos += 1
                current.append(
                    PercentageToken(line, column, value, int_value, text))
            else:
                current.append(
                    NumberToken(line, column, value, int_value, text))
        elif char == '@':
            pos += 1
            if pos < end and _is_ident_start(css, pos):
                keyword, pos = _consume_ident(css, pos)
                current.append(AtKeywordToken(line, column, keyword))
            else:
                current.append(LiteralToken(line, column, '@'))
        elif char == '#':
            pos += 1
            starts_name = pos < end and (
                css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                            '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                ord(css[pos]) > 0x7F or  # Non-ASCII
                # Valid escape:
                (css[pos] == '\\' and not css.startswith('\\\n', pos)))
            if starts_name:
                is_identifier = _is_ident_start(css, pos)
                name, pos = _consume_ident(css, pos)
                current.append(HashToken(line, column, name, is_identifier))
            else:
                current.append(LiteralToken(line, column, '#'))
        elif char == '{':
            children = []
            descend(CurlyBracketsBlock(line, column, children), children, '}')
            pos += 1
        elif char == '[':
            children = []
            descend(
                SquareBracketsBlock(line, column, children), children, ']')
            pos += 1
        elif char == '(':
            children = []
            descend(ParenthesesBlock(line, column, children), children, ')')
            pos += 1
        elif char == closer:
            # Matching }, ] or ).  At the top level ``closer`` is None and
            # never equals a character, so ``parents`` is not empty here.
            current, closer = parents.pop()
            pos += 1
        elif char in '}])':
            current.append(
                ParseError(line, column, char, 'Unmatched ' + char))
            pos += 1
        elif char in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            if value is not None:
                text = f'"{serialize_string_value(value)}"'
                if error is not None:
                    # The closing quote is missing from the source.
                    text = text[:-1]
                current.append(StringToken(line, column, value, text))
            if error is not None:
                current.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):
            close = css.find('*/', pos + 2)
            unterminated = close == -1
            if not skip_comments:
                if unterminated:
                    body = css[token_start + 2:]
                else:
                    body = css[token_start + 2:close]
                current.append(Comment(line, column, body))
            if unterminated:
                # The comment runs to the end of the input.
                break
            pos = close + 2
        elif css.startswith('<!--', pos):
            current.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            current.append(LiteralToken(line, column, '||'))
            pos += 2
        elif char in '~|^$*':
            # These characters fuse with a following '=' into one literal.
            pos += 1
            literal = char
            if css.startswith('=', pos):
                pos += 1
                literal += '='
            current.append(LiteralToken(line, column, literal))
        else:
            current.append(LiteralToken(line, column, char))
            pos += 1
    return root

202

203

204def _is_name_start(css, pos):

205 """Return true if the given character is a name-start code point."""

206 # https://www.w3.org/TR/css-syntax-3/#name-start-code-point

207 c = css[pos]

208 return (

209 c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' or

210 ord(c) > 0x7F)

211

212

def _is_ident_start(css, pos):
    """Tell whether a CSS identifier starts at position ``pos``.

    An identifier starts with a name-start code point, with a valid escape,
    or with a hyphen followed by a name-start code point, another hyphen or
    a valid escape.

    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True
    first = css[pos]
    if first == '\\':
        # A backslash is a valid escape unless a newline follows it.
        return not css.startswith('\\\n', pos)
    if first != '-':
        return False
    after = pos + 1
    if after < len(css) and (
            _is_name_start(css, after) or css[after] == '-'):
        return True
    return css.startswith('\\', after) and not css.startswith('\\\n', after)

228

229

230def _consume_ident(css, pos):

231 """Return (unescaped_value, new_pos).

232

233 Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.

234

235 """

236 # http://dev.w3.org/csswg/css-syntax/#consume-a-name

237 chunks = []

238 length = len(css)

239 start_pos = pos

240 while pos < length:

241 c = css[pos]

242 if c in ('abcdefghijklmnopqrstuvwxyz-_0123456789'

243 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') or ord(c) > 0x7F:

244 pos += 1

245 elif c == '\\' and not css.startswith('\\\n', pos):

246 # Valid escape

247 chunks.append(css[start_pos:pos])

248 c, pos = _consume_escape(css, pos + 1)

249 chunks.append(c)

250 start_pos = pos

251 else:

252 break

253 chunks.append(css[start_pos:pos])

254 return ''.join(chunks), pos

255

256

257def _consume_quoted_string(css, pos):

258 """Return (unescaped_value, new_pos)."""

259 # https://drafts.csswg.org/css-syntax/#consume-a-string-token

260 error = None

261 quote = css[pos]

262 assert quote in ('"', "'")

263 pos += 1

264 chunks = []

265 length = len(css)

266 start_pos = pos

267 while pos < length:

268 c = css[pos]

269 if c == quote:

270 chunks.append(css[start_pos:pos])

271 pos += 1

272 break

273 elif c == '\\':

274 chunks.append(css[start_pos:pos])

275 pos += 1

276 if pos < length:

277 if css[pos] == '\n': # Ignore escaped newlines

278 pos += 1

279 else:

280 c, pos = _consume_escape(css, pos)

281 chunks.append(c)

282 # else: Escaped EOF, do nothing

283 start_pos = pos

284 elif c == '\n': # Unescaped newline

285 return None, pos, ('bad-string', 'Bad string token')

286 else:

287 pos += 1

288 else:

289 error = ('eof-in-string', 'EOF in string')

290 chunks.append(css[start_pos:pos])

291 return ''.join(chunks), pos, error

292

293

def _consume_escape(css, pos):
    r"""Return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (one to six hex digits, optionally followed by one
    whitespace character) yields the character with that code point. Zero,
    surrogates (U+D800 to U+DFFF) and values above the maximum code point
    yield U+FFFD instead, as the specification requires. Any other
    character stands for itself, and an escape at the very end of the input
    yields U+FFFD without advancing.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        # chr() accepts surrogate code points and returns a lone surrogate,
        # which is not a valid character and cannot be encoded later on, so
        # the surrogate range has to be excluded explicitly.
        is_valid = (
            0 < codepoint <= sys.maxunicode and
            not 0xD800 <= codepoint <= 0xDFFF)
        return chr(codepoint) if is_valid else '\uFFFD', hex_match.end()
    if pos < len(css):
        return css[pos], pos + 1
    return '\uFFFD', pos

311

312

313def _consume_url(css, pos):

314 """Return (unescaped_url, new_pos)

315

316 The given pos is assumed to be just after the '(' of 'url('.

317

318 """

319 error = None

320 length = len(css)

321 # https://drafts.csswg.org/css-syntax/#consume-a-url-token

322 # Skip whitespace

323 while css.startswith((' ', '\n', '\t'), pos):

324 pos += 1

325 if pos >= length: # EOF

326 return '', pos, ('eof-in-url', 'EOF in URL')

327 c = css[pos]

328 if c in ('"', "'"):

329 value, pos, error = _consume_quoted_string(css, pos)

330 elif c == ')':

331 return '', pos + 1, error

332 else:

333 chunks = []

334 start_pos = pos

335 while 1:

336 if pos >= length: # EOF

337 chunks.append(css[start_pos:pos])

338 return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')

339 c = css[pos]

340 if c == ')':

341 chunks.append(css[start_pos:pos])

342 pos += 1

343 return ''.join(chunks), pos, error

344 elif c in ' \n\t':

345 chunks.append(css[start_pos:pos])

346 value = ''.join(chunks)

347 pos += 1

348 break

349 elif c == '\\' and not css.startswith('\\\n', pos):

350 # Valid escape

351 chunks.append(css[start_pos:pos])

352 c, pos = _consume_escape(css, pos + 1)

353 chunks.append(c)

354 start_pos = pos

355 elif (c in

356 '"\'('

357 # https://drafts.csswg.org/css-syntax/#non-printable-character

358 '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'

359 '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'

360 '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):

361 value = None # Parse error

362 pos += 1

363 break

364 else:

365 pos += 1

366

367 if value is not None:

368 while css.startswith((' ', '\n', '\t'), pos):

369 pos += 1

370 if pos < length:

371 if css[pos] == ')':

372 return value, pos + 1, error

373 else:

374 if error is None:

375 error = ('eof-in-url', 'EOF in URL')

376 return value, pos, error

377

378 # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0

379 while pos < length:

380 if css.startswith('\\)', pos):

381 pos += 2

382 elif css[pos] == ')':

383 pos += 1

384 break

385 else:

386 pos += 1

387 return None, pos, ('bad-url', 'bad URL token')

388

389

390def _consume_unicode_range(css, pos):

391 """Return (range, new_pos)

392

393 The given pos is assume to be just after the '+' of 'U+' or 'u+'.

394

395 """

396 # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token

397 length = len(css)

398 start_pos = pos

399 max_pos = min(pos + 6, length)

400 while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':

401 pos += 1

402 start = css[start_pos:pos]

403

404 start_pos = pos

405 # Same max_pos as before: total of hex digits and question marks <= 6

406 while pos < max_pos and css[pos] == '?':

407 pos += 1

408 question_marks = pos - start_pos

409

410 if question_marks:

411 end = start + 'F' * question_marks

412 start = start + '0' * question_marks

413 elif (pos + 1 < length and css[pos] == '-' and

414 css[pos + 1] in '0123456789abcdefABCDEF'):

415 pos += 1

416 start_pos = pos

417 max_pos = min(pos + 6, length)

418 while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':

419 pos += 1

420 end = css[start_pos:pos]

421 else:

422 end = start

423 return int(start, 16), int(end, 16), pos