# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/tinycss2/tokenizer.py: 5%

import re
import sys

from webencodings import ascii_lower

from .ast import (  # isort: skip
    AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock,
    HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, ParseError,
    PercentageToken, SquareBracketsBlock, StringToken, UnicodeRangeToken, URLToken,
    WhitespaceToken)
from .serializer import serialize_string_value, serialize_url

# A CSS number: optional sign, optional "digits then dot" prefix, at least one
# digit, optional exponent.  Both groups are empty for a plain integer, which
# is how the tokenizer tells integers from other numbers.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')
# The hexadecimal form of an escape (after the backslash): one to six hex
# digits, followed by at most one whitespace character that is consumed too.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')

def parse_component_value_list(css, skip_comments=False):
    """Parse a list of component values.

    The input is tokenized in a single pass.  Blocks (``{}``, ``[]``, ``()``)
    and functions are nested as they are met: their content list becomes the
    current output list until the matching closing character is found.
    A closing character that does not match the innermost open block is
    reported as a :class:`~tinycss2.ast.ParseError` in the output.
    Blocks still open at the end of the input are left as they are.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        The return values (and recursively its blocks and functions)
        will not contain any :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    # Preprocessing: NULL becomes U+FFFD, every newline flavour becomes '\n'.
    css = (css.replace('\0', '\uFFFD')
           # This turns out to be faster than a regexp:
           .replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n'))
    length = len(css)
    token_start_pos = pos = 0  # Character index in the css source.
    line = 1  # First line is line 1.
    last_newline = -1
    root = tokens = []
    end_char = None  # Pop the stack when encountering this character.
    stack = []  # Stack of nested blocks: (tokens, end_char) tuples.

    while pos < length:
        # Update line/column from the newlines contained in the previous
        # token, i.e. in css[token_start_pos:pos].
        newline = css.rfind('\n', token_start_pos, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start_pos, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start_pos = pos
        c = css[pos]

        if c in ' \n\t':
            pos += 1
            while css.startswith((' ', '\n', '\t'), pos):
                pos += 1
            value = css[token_start_pos:pos]
            tokens.append(WhitespaceToken(line, column, value))
            continue
        elif (c in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
                css[pos + 2] in '0123456789abcdefABCDEF?'):
            start, end, pos = _consume_unicode_range(css, pos + 2)
            tokens.append(UnicodeRangeToken(line, column, start, end))
            continue
        elif css.startswith('-->', pos):  # Check before identifiers
            tokens.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue
        elif _is_ident_start(css, pos):
            value, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                tokens.append(IdentToken(line, column, value))
                continue
            pos += 1  # Skip the '('
            if ascii_lower(value) == 'url':
                # Look past whitespace: url( followed by a quote is parsed
                # as a regular function, anything else as a URL token.
                url_pos = pos
                while css.startswith((' ', '\n', '\t'), url_pos):
                    url_pos += 1
                if url_pos >= length or css[url_pos] not in ('"', "'"):
                    value, pos, error = _consume_url(css, pos)
                    if value is not None:
                        repr_ = 'url({})'.format(serialize_url(value))
                        if error is not None:
                            # The source ended early: drop the closing
                            # characters that the serialization added.
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                repr_ = repr_[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                repr_ = repr_[:-1]
                        tokens.append(URLToken(line, column, value, repr_))
                    if error is not None:
                        tokens.append(ParseError(line, column, *error))
                    continue
            arguments = []
            tokens.append(FunctionBlock(line, column, value, arguments))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = arguments
            continue

        match = _NUMBER_RE.match(css, pos)
        if match:
            pos = match.end()
            repr_ = css[token_start_pos:pos]
            value = float(repr_)
            # No regex group matched: neither a fractional part nor an
            # exponent, so the number is an integer.
            int_value = int(repr_) if not any(match.groups()) else None
            if pos < length and _is_ident_start(css, pos):
                unit, pos = _consume_ident(css, pos)
                tokens.append(DimensionToken(
                    line, column, value, int_value, repr_, unit))
            elif css.startswith('%', pos):
                pos += 1
                tokens.append(
                    PercentageToken(line, column, value, int_value, repr_))
            else:
                tokens.append(
                    NumberToken(line, column, value, int_value, repr_))
        elif c == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                value, pos = _consume_ident(css, pos)
                tokens.append(AtKeywordToken(line, column, value))
            else:
                tokens.append(LiteralToken(line, column, '@'))
        elif c == '#':
            pos += 1
            if pos < length and (
                    css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                                '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                    ord(css[pos]) > 0x7F or  # Non-ASCII
                    # Valid escape:
                    (css[pos] == '\\' and not css.startswith('\\\n', pos))):
                is_identifier = _is_ident_start(css, pos)
                value, pos = _consume_ident(css, pos)
                tokens.append(HashToken(line, column, value, is_identifier))
            else:
                tokens.append(LiteralToken(line, column, '#'))
        elif c == '{':
            content = []
            tokens.append(CurlyBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = '}'
            tokens = content
            pos += 1
        elif c == '[':
            content = []
            tokens.append(SquareBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ']'
            tokens = content
            pos += 1
        elif c == '(':
            content = []
            tokens.append(ParenthesesBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = content
            pos += 1
        elif c == end_char:  # Matching }, ] or )
            # The top-level end_char is None (never equal to a character),
            # so we never get here if the stack is empty.
            tokens, end_char = stack.pop()
            pos += 1
        elif c in '}])':
            tokens.append(ParseError(line, column, c, 'Unmatched ' + c))
            pos += 1
        elif c in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            if value is not None:
                repr_ = '"{}"'.format(serialize_string_value(value))
                if error is not None:
                    # Unterminated string: drop the closing quote that the
                    # serialization added.
                    repr_ = repr_[:-1]
                tokens.append(StringToken(line, column, value, repr_))
            if error is not None:
                tokens.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            pos = css.find('*/', pos + 2)
            if pos == -1:
                # Unterminated comment: it runs to the end of the input.
                if not skip_comments:
                    tokens.append(
                        Comment(line, column, css[token_start_pos + 2:]))
                break
            if not skip_comments:
                tokens.append(
                    Comment(line, column, css[token_start_pos + 2:pos]))
            pos += 2
        elif css.startswith('<!--', pos):
            tokens.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            tokens.append(LiteralToken(line, column, '||'))
            pos += 2
        elif c in '~|^$*':
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                tokens.append(LiteralToken(line, column, c + '='))
            else:
                tokens.append(LiteralToken(line, column, c))
        else:
            tokens.append(LiteralToken(line, column, c))
            pos += 1
    return root


def _is_name_start(css, pos):
    """Return true if the given character is a name-start code point.

    A name-start code point is an ASCII letter, an underscore, or any
    non-ASCII character.  ``pos`` must be a valid index in ``css``: no bound
    check is done here.

    """
    # https://www.w3.org/TR/css-syntax-3/#name-start-code-point
    c = css[pos]
    return (
        c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' or
        ord(c) > 0x7F)


def _is_ident_start(css, pos):
    """Return True if the given position is the start of a CSS identifier.

    An identifier starts with a name-start code point, with a hyphen
    followed by a name-start code point, another hyphen or a valid escape,
    or with a valid escape (a backslash not followed by a newline).
    ``pos`` must be a valid index in ``css``.

    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True
    elif css[pos] == '-':
        pos += 1
        return (
            # Name-start code point or hyphen:
            (pos < len(css) and (
                _is_name_start(css, pos) or css[pos] == '-')) or
            # Valid escape:
            (css.startswith('\\', pos) and not css.startswith('\\\n', pos)))
    elif css[pos] == '\\':
        return not css.startswith('\\\n', pos)
    return False


def _consume_ident(css, pos):
    """Return (unescaped_value, new_pos).

    Consume name code points (ASCII letters and digits, hyphens, underscores
    and non-ASCII characters) and valid escapes from ``pos``, stopping at the
    first character that is neither, or at the end of the input.

    Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.

    """
    # http://dev.w3.org/csswg/css-syntax/#consume-a-name
    chunks = []
    length = len(css)
    # Start of the current run of literal characters, copied as one slice.
    start_pos = pos
    while pos < length:
        c = css[pos]
        if c in ('abcdefghijklmnopqrstuvwxyz-_0123456789'
                 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') or ord(c) > 0x7F:
            pos += 1
        elif c == '\\' and not css.startswith('\\\n', pos):
            # Valid escape
            chunks.append(css[start_pos:pos])
            c, pos = _consume_escape(css, pos + 1)
            chunks.append(c)
            start_pos = pos
        else:
            break
    chunks.append(css[start_pos:pos])
    return ''.join(chunks), pos


def _consume_quoted_string(css, pos):
    """Return (unescaped_value, new_pos, error).

    ``pos`` must be the index of the opening quote (``"`` or ``'``).

    ``error`` is ``None`` for a properly closed string, or a
    ``(kind, message)`` tuple:

    * ``'eof-in-string'``: the input ended before the closing quote;
      the value read so far is still returned.
    * ``'bad-string'``: an unescaped newline was found; the value is
      ``None`` and ``new_pos`` is the position of that newline.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-string-token
    error = None
    quote = css[pos]
    assert quote in ('"', "'")
    pos += 1
    chunks = []
    length = len(css)
    # Start of the current run of literal characters, copied as one slice.
    start_pos = pos
    while pos < length:
        c = css[pos]
        if c == quote:
            chunks.append(css[start_pos:pos])
            pos += 1
            break
        elif c == '\\':
            chunks.append(css[start_pos:pos])
            pos += 1
            if pos < length:
                if css[pos] == '\n':  # Ignore escaped newlines
                    pos += 1
                else:
                    c, pos = _consume_escape(css, pos)
                    chunks.append(c)
            # else: Escaped EOF, do nothing
            start_pos = pos
        elif c == '\n':  # Unescaped newline
            return None, pos, ('bad-string', 'Bad string token')
        else:
            pos += 1
    else:
        # The loop ended without finding the closing quote.
        error = ('eof-in-string', 'EOF in string')
        chunks.append(css[start_pos:pos])
    return ''.join(chunks), pos, error


def _consume_escape(css, pos):
    r"""Return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (one to six hex digits, plus one optional trailing
    whitespace character) gives the character with that code point, or
    U+FFFD when the code point is zero, a surrogate, or above the maximum
    code point.  Any other character stands for itself.  At the end of the
    input, U+FFFD is returned and the position is unchanged.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        if (codepoint == 0 or
                # Surrogates are not characters: a lone one in a str cannot
                # even be encoded to UTF-8.
                0xD800 <= codepoint <= 0xDFFF or
                codepoint > sys.maxunicode):
            return '\uFFFD', hex_match.end()
        return chr(codepoint), hex_match.end()
    elif pos < len(css):
        return css[pos], pos + 1
    else:
        return '\uFFFD', pos


def _consume_url(css, pos):
    """Return (unescaped_url, new_pos, error).

    The given pos is assumed to be just after the '(' of 'url('.

    ``error`` is ``None`` for a properly closed URL, or a
    ``(kind, message)`` tuple:

    * ``'eof-in-url'``: the input ended before the closing ``)``;
      the value read so far is still returned.
    * ``'bad-url'``: the URL is invalid; the value is ``None`` and
      ``new_pos`` is after the closing ``)`` if there is one, at the end of
      the input otherwise.
    * an error reported by :func:`_consume_quoted_string` when the URL is
      a quoted string.

    """
    error = None
    length = len(css)
    # https://drafts.csswg.org/css-syntax/#consume-a-url-token
    # Skip whitespace
    while css.startswith((' ', '\n', '\t'), pos):
        pos += 1
    if pos >= length:  # EOF
        return '', pos, ('eof-in-url', 'EOF in URL')
    c = css[pos]
    if c in ('"', "'"):
        value, pos, error = _consume_quoted_string(css, pos)
    elif c == ')':
        return '', pos + 1, error
    else:
        chunks = []
        # Start of the current run of literal characters.
        start_pos = pos
        while True:
            if pos >= length:  # EOF
                chunks.append(css[start_pos:pos])
                return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
            c = css[pos]
            if c == ')':
                chunks.append(css[start_pos:pos])
                pos += 1
                return ''.join(chunks), pos, error
            elif c in ' \n\t':
                # End of the value: only whitespace may come before ')'.
                chunks.append(css[start_pos:pos])
                value = ''.join(chunks)
                pos += 1
                break
            elif c == '\\' and not css.startswith('\\\n', pos):
                # Valid escape
                chunks.append(css[start_pos:pos])
                c, pos = _consume_escape(css, pos + 1)
                chunks.append(c)
                start_pos = pos
            elif (c in
                  '"\'('
                  # https://drafts.csswg.org/css-syntax/#non-printable-character
                  '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
                  '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
                  '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
                value = None  # Parse error
                pos += 1
                break
            else:
                pos += 1

    if value is not None:
        while css.startswith((' ', '\n', '\t'), pos):
            pos += 1
        if pos < length:
            if css[pos] == ')':
                return value, pos + 1, error
            # Anything else after the value makes this a bad URL: fall
            # through to the code below.
        else:
            if error is None:
                error = ('eof-in-url', 'EOF in URL')
            return value, pos, error

    # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
    while pos < length:
        if css.startswith('\\)', pos):
            # An escaped ')' does not end the bad URL.
            pos += 2
        elif css[pos] == ')':
            pos += 1
            break
        else:
            pos += 1
    return None, pos, ('bad-url', 'bad URL token')


def _consume_unicode_range(css, pos):
    """Return (start, end, new_pos)

    The given pos is assumed to be just after the '+' of 'U+' or 'u+',
    and to be followed by at least one hexadecimal digit or question mark.

    ``start`` and ``end`` are the integer bounds of the range.  Trailing
    question marks are wildcards: they count as ``0`` in ``start`` and as
    ``F`` in ``end``.  Without question marks, ``end`` is read after a
    hyphen if there is one, and is the same as ``start`` otherwise.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
    length = len(css)
    start_pos = pos
    max_pos = min(pos + 6, length)
    while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':
        pos += 1
    start = css[start_pos:pos]

    start_pos = pos
    # Same max_pos as before: total of hex digits and question marks <= 6
    while pos < max_pos and css[pos] == '?':
        pos += 1
    question_marks = pos - start_pos

    if question_marks:
        end = start + 'F' * question_marks
        start = start + '0' * question_marks
    elif (pos + 1 < length and css[pos] == '-' and
            css[pos + 1] in '0123456789abcdefABCDEF'):
        pos += 1
        start_pos = pos
        max_pos = min(pos + 6, length)
        while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':
            pos += 1
        end = css[start_pos:pos]
    else:
        end = start
    return int(start, 16), int(end, 16), pos