1import re
2import sys
3
4from webencodings import ascii_lower
5
6from .ast import ( # isort: skip
7 AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock,
8 HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, ParseError,
9 PercentageToken, SquareBracketsBlock, StringToken, UnicodeRangeToken, URLToken,
10 WhitespaceToken)
11from .serializer import serialize_string_value, serialize_url
12
# A CSS number: optional sign, optional "digits followed by a dot" part,
# mandatory digits, optional exponent. The two capturing groups (fractional
# prefix and exponent) are both empty only when the number is written as a
# plain integer.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')
# The part of a hexadecimal escape that follows the backslash: 1 to 6 hex
# digits (captured), then at most one whitespace character that is consumed
# together with the escape.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')
15
16
def parse_component_value_list(css, skip_comments=False):
    """Parse a list of component values.

    The input is normalized before tokenizing: NUL characters are replaced
    by U+FFFD, and CRLF, CR and form feed are all turned into a line feed.
    Tokens are then read in a single pass. Functions and bracketed blocks
    are nested by keeping a stack of the enclosing token lists; a block that
    is still open when the input ends is returned with the content collected
    so far. Unmatched closing brackets and the errors reported by the string
    and URL helpers are inserted in the token lists as
    :class:`~tinycss2.ast.ParseError` objects.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        The return values (and recursively its blocks and functions)
        will not contain any :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    css = (css.replace('\0', '\uFFFD')
           # This turns out to be faster than a regexp:
           .replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n'))
    length = len(css)
    token_start_pos = pos = 0  # Character index in the css source.
    line = 1  # First line is line 1.
    last_newline = -1
    # ``tokens`` is the list currently being filled: ``root`` at the top
    # level, or the content list of the innermost open block or function.
    root = tokens = []
    end_char = None  # Pop the stack when encountering this character.
    stack = []  # Stack of nested blocks: (tokens, end_char) tuples.

    while pos < length:
        # Update the line bookkeeping for the text consumed by the previous
        # iteration, css[token_start_pos:pos]. ``newline`` is the last line
        # feed of that span; the count adds the ones before it.
        newline = css.rfind('\n', token_start_pos, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start_pos, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start_pos = pos
        c = css[pos]

        # First group of tests. Their order matters: unicode ranges and
        # '-->' would otherwise be read as identifiers.
        if c in ' \n\t':
            # A run of whitespace is a single token.
            pos += 1
            while css.startswith((' ', '\n', '\t'), pos):
                pos += 1
            value = css[token_start_pos:pos]
            tokens.append(WhitespaceToken(line, column, value))
            continue
        elif (c in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
                css[pos + 2] in '0123456789abcdefABCDEF?'):
            start, end, pos = _consume_unicode_range(css, pos + 2)
            tokens.append(UnicodeRangeToken(line, column, start, end))
            continue
        elif css.startswith('-->', pos):  # Check before identifiers
            tokens.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue
        elif _is_ident_start(css, pos):
            value, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                tokens.append(IdentToken(line, column, value))
                continue
            pos += 1  # Skip the '('
            # A name for which ascii_lower raises UnicodeEncodeError is
            # simply treated as not being 'url'.
            # NOTE(review): presumably triggered by characters that cannot
            # be encoded, such as lone surrogates -- confirm.
            try:
                is_url = ascii_lower(value) == 'url'
            except UnicodeEncodeError:
                is_url = False
            if is_url:
                # Look past the whitespace without consuming it: when a
                # quote follows, this falls through to the generic function
                # code below and the string is tokenized on its own.
                url_pos = pos
                while css.startswith((' ', '\n', '\t'), url_pos):
                    url_pos += 1
                if url_pos >= length or css[url_pos] not in ('"', "'"):
                    value, pos, error = _consume_url(css, pos)
                    # A None value is a bad URL: only the error is added.
                    if value is not None:
                        repr = f'url({serialize_url(value)})'
                        if error is not None:
                            # The input ended early: trim the end of the
                            # serialization (presumably the closing
                            # parenthesis, and the closing quote as well
                            # for an unterminated string).
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                repr = repr[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                repr = repr[:-1]
                        tokens.append(URLToken(line, column, value, repr))
                    if error is not None:
                        tokens.append(ParseError(line, column, *error))
                    continue
            # Open a function block: following tokens go into its argument
            # list until the matching ')'.
            arguments = []
            tokens.append(FunctionBlock(line, column, value, arguments))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = arguments
            continue

        # Second group of tests: numbers first, then everything else.
        match = _NUMBER_RE.match(css, pos)
        if match:
            pos = match.end()
            repr_ = css[token_start_pos:pos]
            value = float(repr_)
            # Only numbers written with neither a fractional part nor an
            # exponent (both regex groups empty) get an integer value.
            int_value = int(repr_) if not any(match.groups()) else None
            if pos < length and _is_ident_start(css, pos):
                # A number immediately followed by an identifier: the
                # identifier is the unit.
                unit, pos = _consume_ident(css, pos)
                tokens.append(DimensionToken(
                    line, column, value, int_value, repr_, unit))
            elif css.startswith('%', pos):
                pos += 1
                tokens.append(PercentageToken(line, column, value, int_value, repr_))
            else:
                tokens.append(NumberToken(line, column, value, int_value, repr_))
        elif c == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                value, pos = _consume_ident(css, pos)
                tokens.append(AtKeywordToken(line, column, value))
            else:
                # A lone '@' is a plain literal.
                tokens.append(LiteralToken(line, column, '@'))
        elif c == '#':
            pos += 1
            # A hash needs at least one name character (or a valid escape)
            # after the '#'.
            if pos < length and (
                    css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                                '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                    ord(css[pos]) > 0x7F or  # Non-ASCII
                    # Valid escape:
                    (css[pos] == '\\' and not css.startswith('\\\n', pos))):
                # Record whether the name would also be a valid identifier
                # (for instance, it does not start with a digit).
                is_identifier = _is_ident_start(css, pos)
                value, pos = _consume_ident(css, pos)
                tokens.append(HashToken(line, column, value, is_identifier))
            else:
                tokens.append(LiteralToken(line, column, '#'))
        elif c == '{':
            # Opening brackets: push the current list and continue filling
            # the content list of the new block.
            content = []
            tokens.append(CurlyBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = '}'
            tokens = content
            pos += 1
        elif c == '[':
            content = []
            tokens.append(SquareBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ']'
            tokens = content
            pos += 1
        elif c == '(':
            content = []
            tokens.append(ParenthesesBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = content
            pos += 1
        elif c == end_char:  # Matching }, ] or )
            # The top-level end_char is None (never equal to a character),
            # so we never get here if the stack is empty.
            tokens, end_char = stack.pop()
            pos += 1
        elif c in '}])':
            # A closing bracket that does not close the innermost block.
            tokens.append(ParseError(line, column, c, 'Unmatched ' + c))
            pos += 1
        elif c in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            # A None value is a bad string: only the error is added.
            if value is not None:
                repr = f'"{serialize_string_value(value)}"'
                if error is not None:
                    # The string was not closed in the source: drop the
                    # closing quote from its serialization.
                    repr = repr[:-1]
                tokens.append(StringToken(line, column, value, repr))
            if error is not None:
                tokens.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            pos = css.find('*/', pos + 2)
            if pos == -1:
                # Unterminated comment: it runs to the end of the input,
                # so tokenizing stops here.
                if not skip_comments:
                    tokens.append(Comment(line, column, css[token_start_pos + 2:]))
                break
            if not skip_comments:
                tokens.append(Comment(line, column, css[token_start_pos + 2:pos]))
            pos += 2
        elif css.startswith('<!--', pos):
            tokens.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            tokens.append(LiteralToken(line, column, '||'))
            pos += 2
        elif c in '~|^$*':
            # These characters form a two-character literal when directly
            # followed by '=' ('~=', '|=', '^=', '$=', '*=').
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                tokens.append(LiteralToken(line, column, c + '='))
            else:
                tokens.append(LiteralToken(line, column, c))
        else:
            # Any other character is a single-character literal.
            tokens.append(LiteralToken(line, column, c))
            pos += 1
    return root
202
203
def _is_name_start(css, pos):
    """Tell whether ``css[pos]`` is a name-start code point.

    Name-start code points are ASCII letters, the underscore, and every
    non-ASCII character.

    """
    # https://www.w3.org/TR/css-syntax-3/#name-start-code-point
    char = css[pos]
    if char > '\x7f':  # Non-ASCII
        return True
    return char == '_' or 'a' <= char <= 'z' or 'A' <= char <= 'Z'
211
212
def _is_ident_start(css, pos):
    """Tell whether a CSS identifier starts at the given position.

    An identifier starts with a name-start code point, with a valid escape,
    or with a hyphen followed by a name-start code point, another hyphen or
    a valid escape. A backslash is a valid escape unless a newline follows.

    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    first = css[pos]
    if first == '\\':
        return not css.startswith('\\\n', pos)
    if first != '-':
        return _is_name_start(css, pos)

    # Leading hyphen: the decision depends on what comes right after it.
    after = pos + 1
    if after < len(css) and (css[after] == '-' or _is_name_start(css, after)):
        return True
    return css.startswith('\\', after) and not css.startswith('\\\n', after)
228
229
def _consume_ident(css, pos):
    """Read an identifier and return (unescaped_value, new_pos).

    Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.

    """
    # http://dev.w3.org/csswg/css-syntax/#consume-a-name
    name_chars = ('abcdefghijklmnopqrstuvwxyz-_0123456789'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    end = len(css)
    pieces = []
    run_start = pos  # Start of the current run of unescaped characters.
    while pos < end:
        char = css[pos]
        if char == '\\':
            if css.startswith('\\\n', pos):
                # Backslash-newline is not a valid escape: the name ends.
                break
            # Valid escape: flush the pending run, then add the decoded
            # character.
            pieces.append(css[run_start:pos])
            unescaped, pos = _consume_escape(css, pos + 1)
            pieces.append(unescaped)
            run_start = pos
        elif char in name_chars or char > '\x7f':
            pos += 1
        else:
            break
    pieces.append(css[run_start:pos])
    return ''.join(pieces), pos
255
256
def _consume_quoted_string(css, pos):
    """Read a quoted string and return (unescaped_value, new_pos, error).

    The given pos is the position of the opening quote. The value is
    :obj:`None` for a bad string (unescaped newline); in that case the new
    position is the position of the newline. The error is :obj:`None` or a
    ``(kind, message)`` tuple.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-string-token
    quote = css[pos]
    assert quote in ('"', "'")
    pos += 1
    end = len(css)
    pieces = []
    run_start = pos  # Start of the current run of unescaped characters.
    while True:
        if pos >= end:
            # The input ended before the closing quote.
            pieces.append(css[run_start:pos])
            return ''.join(pieces), pos, ('eof-in-string', 'EOF in string')
        char = css[pos]
        if char == '\n':  # Unescaped newline
            return None, pos, ('bad-string', 'Bad string token')
        if char == quote:
            pieces.append(css[run_start:pos])
            return ''.join(pieces), pos + 1, None
        if char != '\\':
            pos += 1
            continue
        # Backslash: flush the pending run, then handle what follows.
        pieces.append(css[run_start:pos])
        pos += 1
        if pos < end:
            if css[pos] == '\n':  # Ignore escaped newlines
                pos += 1
            else:
                unescaped, pos = _consume_escape(css, pos)
                pieces.append(unescaped)
        # else: Escaped EOF, do nothing
        run_start = pos
292
293
def _consume_escape(css, pos):
    r"""Return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (1 to 6 hex digits and one optional whitespace
    character after them) gives the code point with that number. U+FFFD is
    given instead when the number is zero, is a surrogate, or is above the
    highest code point. Any other character stands for itself. At the end of
    the input, U+FFFD is returned and nothing is consumed.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        # chr() would happily build a lone surrogate, which is not a valid
        # character and cannot be encoded afterwards.
        if (codepoint == 0 or codepoint > sys.maxunicode or
                0xD800 <= codepoint <= 0xDFFF):
            return '\uFFFD', hex_match.end()
        return chr(codepoint), hex_match.end()
    if pos < len(css):
        return css[pos], pos + 1
    return '\uFFFD', pos
311
312
def _consume_url(css, pos):
    """Consume the rest of a URL token.

    The given pos is assumed to be just after the '(' of 'url('.

    :returns:
        A ``(value, new_pos, error)`` tuple. ``value`` is the unescaped URL,
        or :obj:`None` for a bad URL. ``error`` is :obj:`None` or a
        ``(kind, message)`` tuple whose kind is ``'eof-in-url'``,
        ``'eof-in-string'`` or ``'bad-url'``.

    """
    error = None
    length = len(css)
    # https://drafts.csswg.org/css-syntax/#consume-a-url-token
    # Skip whitespace
    while css.startswith((' ', '\n', '\t'), pos):
        pos += 1
    if pos >= length:  # EOF
        return '', pos, ('eof-in-url', 'EOF in URL')
    c = css[pos]
    if c in ('"', "'"):
        value, pos, error = _consume_quoted_string(css, pos)
    elif c == ')':
        return '', pos + 1, error
    else:
        chunks = []
        start_pos = pos  # Start of the current run of unescaped characters.
        while True:
            if pos >= length:  # EOF
                chunks.append(css[start_pos:pos])
                return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
            c = css[pos]
            if c == ')':
                chunks.append(css[start_pos:pos])
                pos += 1
                return ''.join(chunks), pos, error
            elif c in ' \n\t':
                # Whitespace ends the value; only ')' may follow it.
                chunks.append(css[start_pos:pos])
                value = ''.join(chunks)
                pos += 1
                break
            elif c == '\\' and not css.startswith('\\\n', pos):
                # Valid escape
                chunks.append(css[start_pos:pos])
                c, pos = _consume_escape(css, pos + 1)
                chunks.append(c)
                start_pos = pos
            elif (c in
                  '"\'('
                  # https://drafts.csswg.org/css-syntax/#non-printable-character
                  '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
                  '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
                  '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
                value = None  # Parse error
                pos += 1
                break
            else:
                pos += 1

    if value is not None:
        # Only whitespace is allowed between the value and the ')'.
        while css.startswith((' ', '\n', '\t'), pos):
            pos += 1
        if pos < length:
            if css[pos] == ')':
                return value, pos + 1, error
            # Anything else makes this a bad URL, handled below.
        else:
            if error is None:
                error = ('eof-in-url', 'EOF in URL')
            return value, pos, error

    # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
    while pos < length:
        c = css[pos]
        if c == ')':
            pos += 1
            break
        if c == '\\' and not css.startswith('\\\n', pos):
            # Valid escape: skip the backslash together with the character
            # it escapes, so that '\)' does not end the bad URL while the
            # ')' of '\\)' does. Never move past the end of the input.
            pos = min(pos + 2, length)
        else:
            pos += 1
    return None, pos, ('bad-url', 'bad URL token')
388
389
def _consume_unicode_range(css, pos):
    """Read the body of a unicode-range token.

    The given pos is assumed to be just after the '+' of 'U+' or 'u+'.

    :returns:
        A ``(start, end, new_pos)`` tuple, where ``start`` and ``end`` are
        the bounds of the range as integers.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
    hex_digits = '0123456789abcdefABCDEF'
    total = len(css)

    # Up to six hexadecimal digits...
    first_begin = pos
    limit = min(first_begin + 6, total)
    while pos < limit and css[pos] in hex_digits:
        pos += 1
    low_text = css[first_begin:pos]

    # ... completed by question marks, six characters at most in total.
    wildcard_begin = pos
    while pos < limit and css[pos] == '?':
        pos += 1
    wildcard_count = pos - wildcard_begin
    if wildcard_count:
        # Each '?' stands for any digit: 0 in the lowest bound, F in the
        # highest.
        low = int(low_text + '0' * wildcard_count, 16)
        high = int(low_text + 'F' * wildcard_count, 16)
        return low, high, pos

    has_explicit_end = (
        pos + 1 < total and css[pos] == '-' and css[pos + 1] in hex_digits)
    if not has_explicit_end:
        # Single code point: both bounds are the same.
        bound = int(low_text, 16)
        return bound, bound, pos

    pos += 1  # Skip the '-'
    second_begin = pos
    limit = min(second_begin + 6, total)
    while pos < limit and css[pos] in hex_digits:
        pos += 1
    return int(low_text, 16), int(css[second_begin:pos], 16), pos