Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tinycss2/tokenizer.py: 5%

1import re

2import sys

4from webencodings import ascii_lower

6from .ast import (

7 AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock,

8 HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock,

9 ParseError, PercentageToken, SquareBracketsBlock, StringToken,

10 UnicodeRangeToken, URLToken, WhitespaceToken)

11from .serializer import serialize_string_value, serialize_url

# Matches a CSS number: optional sign, optional "digits." prefix, mandatory
# digits, optional exponent. The two capturing groups are the "digits."
# prefix and the exponent; the tokenizer treats a match where neither group
# captured anything as an integer.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')

# Matches the body of a hexadecimal escape: one to six hex digits (captured)
# followed by at most one whitespace character, which is part of the escape.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')

def parse_component_value_list(css, skip_comments=False):
    """Parse a list of component values.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        The returned values (and, recursively, their blocks and functions)
        will not contain any :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    # Input preprocessing: NULL becomes U+FFFD and CRLF, CR and form feed
    # all become LF. Chained replace() turns out to be faster than a regexp.
    css = css.replace('\0', '\uFFFD')
    css = css.replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n')
    length = len(css)

    whitespace = (' ', '\n', '\t')
    # Characters allowed right after '#' for it to start a hash token
    # (besides non-ASCII characters and valid escapes).
    name_chars = ('0123456789abcdefghijklmnopqrstuvwxyz'
                  '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    # Opening bracket -> (block node class, matching closing bracket).
    block_openers = {
        '{': (CurlyBracketsBlock, '}'),
        '[': (SquareBracketsBlock, ']'),
        '(': (ParenthesesBlock, ')'),
    }

    pos = 0  # Character index in the css source.
    token_start_pos = 0
    line = 1  # First line is line 1.
    last_newline = -1
    root = []
    tokens = root  # The list currently being filled: root or a block body.
    end_char = None  # Pop the stack when encountering this character.
    stack = []  # Stack of nested blocks: (tokens, end_char) tuples.

    while pos < length:
        # Account for the newlines contained in the previous token.
        newline = css.rfind('\n', token_start_pos, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start_pos, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start_pos = pos
        char = css[pos]

        # Every branch in this first group ends with ``continue``.
        if char in whitespace:
            pos += 1
            while css.startswith(whitespace, pos):
                pos += 1
            tokens.append(
                WhitespaceToken(line, column, css[token_start_pos:pos]))
            continue

        if (char in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
                css[pos + 2] in '0123456789abcdefABCDEF?'):
            range_start, range_end, pos = _consume_unicode_range(
                css, pos + 2)
            tokens.append(
                UnicodeRangeToken(line, column, range_start, range_end))
            continue

        if css.startswith('-->', pos):  # Check before identifiers
            tokens.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue

        if _is_ident_start(css, pos):
            value, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                tokens.append(IdentToken(line, column, value))
                continue
            pos += 1  # Skip the '('
            if ascii_lower(value) == 'url':
                # Look past leading whitespace: a quoted argument makes this
                # an ordinary function block, anything else a URL token.
                url_pos = pos
                while css.startswith(whitespace, url_pos):
                    url_pos += 1
                if url_pos >= length or css[url_pos] not in ('"', "'"):
                    value, pos, error = _consume_url(css, pos)
                    if value is not None:
                        representation = 'url({})'.format(
                            serialize_url(value))
                        if error is not None:
                            # Drop the closing characters that the source
                            # did not actually contain.
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                representation = representation[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                representation = representation[:-1]
                        tokens.append(
                            URLToken(line, column, value, representation))
                    if error is not None:
                        tokens.append(ParseError(line, column, *error))
                    continue
            arguments = []
            tokens.append(FunctionBlock(line, column, value, arguments))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = arguments
            continue

        number_match = _NUMBER_RE.match(css, pos)
        if number_match:
            pos = number_match.end()
            number_repr = css[token_start_pos:pos]
            value = float(number_repr)
            # A captured group means a fractional part or an exponent.
            if any(number_match.groups()):
                int_value = None
            else:
                int_value = int(number_repr)
            if pos < length and _is_ident_start(css, pos):
                unit, pos = _consume_ident(css, pos)
                tokens.append(DimensionToken(
                    line, column, value, int_value, number_repr, unit))
            elif css.startswith('%', pos):
                pos += 1
                tokens.append(PercentageToken(
                    line, column, value, int_value, number_repr))
            else:
                tokens.append(NumberToken(
                    line, column, value, int_value, number_repr))
        elif char == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                value, pos = _consume_ident(css, pos)
                tokens.append(AtKeywordToken(line, column, value))
            else:
                tokens.append(LiteralToken(line, column, '@'))
        elif char == '#':
            pos += 1
            starts_name = pos < length and (
                css[pos] in name_chars or
                ord(css[pos]) > 0x7F or  # Non-ASCII
                # Valid escape:
                (css[pos] == '\\' and not css.startswith('\\\n', pos)))
            if starts_name:
                is_identifier = _is_ident_start(css, pos)
                value, pos = _consume_ident(css, pos)
                tokens.append(HashToken(line, column, value, is_identifier))
            else:
                tokens.append(LiteralToken(line, column, '#'))
        elif char in block_openers:
            # Open a nested block: later tokens go into its content list
            # until the matching closing bracket is found.
            block_class, closing = block_openers[char]
            content = []
            tokens.append(block_class(line, column, content))
            stack.append((tokens, end_char))
            end_char = closing
            tokens = content
            pos += 1
        elif char == end_char:  # Matching }, ] or )
            # The top-level end_char is None (never equal to a character),
            # so we never get here if the stack is empty.
            tokens, end_char = stack.pop()
            pos += 1
        elif char in '}])':
            tokens.append(
                ParseError(line, column, char, 'Unmatched ' + char))
            pos += 1
        elif char in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            if value is not None:
                representation = '"{}"'.format(serialize_string_value(value))
                if error is not None:
                    # EOF in string: the closing quote was never there.
                    representation = representation[:-1]
                tokens.append(
                    StringToken(line, column, value, representation))
            if error is not None:
                tokens.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            comment_end = css.find('*/', pos + 2)
            if comment_end == -1:
                # Unterminated comment: it runs to the end of the input.
                if not skip_comments:
                    tokens.append(
                        Comment(line, column, css[token_start_pos + 2:]))
                break
            if not skip_comments:
                tokens.append(Comment(
                    line, column, css[token_start_pos + 2:comment_end]))
            pos = comment_end + 2
        elif css.startswith('<!--', pos):
            tokens.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            tokens.append(LiteralToken(line, column, '||'))
            pos += 2
        elif char in '~|^$*':
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                tokens.append(LiteralToken(line, column, char + '='))
            else:
                tokens.append(LiteralToken(line, column, char))
        else:
            tokens.append(LiteralToken(line, column, char))
            pos += 1
    return root

202

203

204def _is_name_start(css, pos):

205 """Return true if the given character is a name-start code point."""

206 # https://www.w3.org/TR/css-syntax-3/#name-start-code-point

207 c = css[pos]

208 return (

209 c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' or

210 ord(c) > 0x7F)

211

212

def _is_ident_start(css, pos):
    """Tell whether a CSS identifier starts at ``css[pos]``.

    An identifier starts with a name-start code point, with a valid escape
    (a backslash not followed by a newline), or with a hyphen that is itself
    followed by a name-start code point, another hyphen or a valid escape.
    The caller must make sure that ``pos`` is a valid index in ``css``.

    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True

    first = css[pos]
    if first == '\\':
        # Valid escape unless the backslash is followed by a newline.
        return not css.startswith('\\\n', pos)
    if first != '-':
        return False

    # A leading hyphen: the answer depends on what comes right after it.
    after = pos + 1
    if after < len(css) and (
            _is_name_start(css, after) or css[after] == '-'):
        return True
    return css.startswith('\\', after) and not css.startswith('\\\n', after)

229

230

231def _consume_ident(css, pos):

232 """Return (unescaped_value, new_pos).

233

234 Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.

235

236 """

237 # http://dev.w3.org/csswg/css-syntax/#consume-a-name

238 chunks = []

239 length = len(css)

240 start_pos = pos

241 while pos < length:

242 c = css[pos]

243 if c in ('abcdefghijklmnopqrstuvwxyz-_0123456789'

244 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') or ord(c) > 0x7F:

245 pos += 1

246 elif c == '\\' and not css.startswith('\\\n', pos):

247 # Valid escape

248 chunks.append(css[start_pos:pos])

249 c, pos = _consume_escape(css, pos + 1)

250 chunks.append(c)

251 start_pos = pos

252 else:

253 break

254 chunks.append(css[start_pos:pos])

255 return ''.join(chunks), pos

256

257

258def _consume_quoted_string(css, pos):

259 """Return (unescaped_value, new_pos)."""

260 # https://drafts.csswg.org/css-syntax/#consume-a-string-token

261 error = None

262 quote = css[pos]

263 assert quote in ('"', "'")

264 pos += 1

265 chunks = []

266 length = len(css)

267 start_pos = pos

268 while pos < length:

269 c = css[pos]

270 if c == quote:

271 chunks.append(css[start_pos:pos])

272 pos += 1

273 break

274 elif c == '\\':

275 chunks.append(css[start_pos:pos])

276 pos += 1

277 if pos < length:

278 if css[pos] == '\n': # Ignore escaped newlines

279 pos += 1

280 else:

281 c, pos = _consume_escape(css, pos)

282 chunks.append(c)

283 # else: Escaped EOF, do nothing

284 start_pos = pos

285 elif c == '\n': # Unescaped newline

286 return None, pos, ('bad-string', 'Bad string token')

287 else:

288 pos += 1

289 else:

290 error = ('eof-in-string', 'EOF in string')

291 chunks.append(css[start_pos:pos])

292 return ''.join(chunks), pos, error

293

294

def _consume_escape(css, pos):
    r"""Return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (one to six hex digits, optionally followed by a
    single whitespace character that is consumed too) yields the matching
    character, or U+FFFD when the number is zero, names a surrogate
    (U+D800 to U+DFFF) or is above the maximum code point.
    Any other character stands for itself, and an escape at the very end of
    the input yields U+FFFD.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        # Surrogates must be replaced as well: chr() would happily build a
        # lone surrogate, which is not a valid character and cannot be
        # encoded to UTF-8 later on.
        if (codepoint == 0 or 0xD800 <= codepoint <= 0xDFFF or
                codepoint > sys.maxunicode):
            return '\uFFFD', hex_match.end()
        return chr(codepoint), hex_match.end()
    if pos < len(css):
        return css[pos], pos + 1
    # Escaped EOF
    return '\uFFFD', pos

312

313

314def _consume_url(css, pos):

315 """Return (unescaped_url, new_pos)

316

317 The given pos is assumed to be just after the '(' of 'url('.

318

319 """

320 error = None

321 length = len(css)

322 # https://drafts.csswg.org/css-syntax/#consume-a-url-token

323 # Skip whitespace

324 while css.startswith((' ', '\n', '\t'), pos):

325 pos += 1

326 if pos >= length: # EOF

327 return '', pos, ('eof-in-url', 'EOF in URL')

328 c = css[pos]

329 if c in ('"', "'"):

330 value, pos, error = _consume_quoted_string(css, pos)

331 elif c == ')':

332 return '', pos + 1, error

333 else:

334 chunks = []

335 start_pos = pos

336 while 1:

337 if pos >= length: # EOF

338 chunks.append(css[start_pos:pos])

339 return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')

340 c = css[pos]

341 if c == ')':

342 chunks.append(css[start_pos:pos])

343 pos += 1

344 return ''.join(chunks), pos, error

345 elif c in ' \n\t':

346 chunks.append(css[start_pos:pos])

347 value = ''.join(chunks)

348 pos += 1

349 break

350 elif c == '\\' and not css.startswith('\\\n', pos):

351 # Valid escape

352 chunks.append(css[start_pos:pos])

353 c, pos = _consume_escape(css, pos + 1)

354 chunks.append(c)

355 start_pos = pos

356 elif (c in

357 '"\'('

358 # https://drafts.csswg.org/css-syntax/#non-printable-character

359 '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'

360 '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'

361 '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):

362 value = None # Parse error

363 pos += 1

364 break

365 else:

366 pos += 1

367

368 if value is not None:

369 while css.startswith((' ', '\n', '\t'), pos):

370 pos += 1

371 if pos < length:

372 if css[pos] == ')':

373 return value, pos + 1, error

374 else:

375 if error is None:

376 error = ('eof-in-url', 'EOF in URL')

377 return value, pos, error

378

379 # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0

380 while pos < length:

381 if css.startswith('\\)', pos):

382 pos += 2

383 elif css[pos] == ')':

384 pos += 1

385 break

386 else:

387 pos += 1

388 return None, pos, ('bad-url', 'bad URL token')

389

390

391def _consume_unicode_range(css, pos):

392 """Return (range, new_pos)

393

394 The given pos is assume to be just after the '+' of 'U+' or 'u+'.

395

396 """

397 # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token

398 length = len(css)

399 start_pos = pos

400 max_pos = min(pos + 6, length)

401 while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':

402 pos += 1

403 start = css[start_pos:pos]

404

405 start_pos = pos

406 # Same max_pos as before: total of hex digits and question marks <= 6

407 while pos < max_pos and css[pos] == '?':

408 pos += 1

409 question_marks = pos - start_pos

410

411 if question_marks:

412 end = start + 'F' * question_marks

413 start = start + '0' * question_marks

414 elif (pos + 1 < length and css[pos] == '-' and

415 css[pos + 1] in '0123456789abcdefABCDEF'):

416 pos += 1

417 start_pos = pos

418 max_pos = min(pos + 6, length)

419 while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':

420 pos += 1

421 end = css[start_pos:pos]

422 else:

423 end = start

424 return int(start, 16), int(end, 16), pos