1import re
2import sys
3
4from webencodings import ascii_lower
5
6from .ast import ( # isort: skip
7 AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock,
8 HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock, ParseError,
9 PercentageToken, SquareBracketsBlock, StringToken, UnicodeRangeToken, URLToken,
10 WhitespaceToken)
11from .serializer import serialize_string_value, serialize_url
12
# Matches a CSS number: an optional sign, an optional "digits then dot" prefix
# (group 1), at least one digit, and an optional exponent (group 2).
# The tokenizer treats a match where neither group participated as an integer.
_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')
# Matches the part of a hexadecimal escape that follows the backslash:
# one to six hex digits (group 1), then at most one space, newline or tab
# that is consumed together with the escape.
_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')
15
16
def parse_component_value_list(css, skip_comments=False):
    """Parse a list of component values.

    The input is normalized first: NULL characters become U+FFFD and
    CR LF, CR and FF are all turned into LF. The normalized string is then
    scanned once, left to right. Blocks opened by ``{``, ``[``, ``(`` or a
    function name are nested: their content is collected in the block's own
    list until the matching closing character is found. A block that is
    still open at the end of the input is returned as is, without error.

    Problems found while tokenizing do not raise: they are reported as
    :class:`~tinycss2.ast.ParseError` objects inserted in the result.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        The return values (and recursively its blocks and functions)
        will not contain any :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    css = (css.replace('\0', '\uFFFD')
           # This turns out to be faster than a regexp:
           .replace('\r\n', '\n').replace('\r', '\n').replace('\f', '\n'))
    length = len(css)
    token_start_pos = pos = 0  # Character index in the css source.
    line = 1  # First line is line 1.
    last_newline = -1
    # ``tokens`` is the list currently being filled; it starts as the root
    # list and is swapped for a block's content list while inside that block.
    root = tokens = []
    end_char = None  # Pop the stack when encountering this character.
    stack = []  # Stack of nested blocks: (tokens, end_char) tuples.

    while pos < length:
        # Update the line counter with the newlines found in the previous
        # token, i.e. between its start and the current position.
        newline = css.rfind('\n', token_start_pos, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start_pos, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start_pos = pos
        c = css[pos]

        # The first four branches end with ``continue``; everything after
        # them is only tried when none of these branches matched.
        if c in ' \n\t':
            # A run of whitespace is a single token.
            pos += 1
            while css.startswith((' ', '\n', '\t'), pos):
                pos += 1
            value = css[token_start_pos:pos]
            tokens.append(WhitespaceToken(line, column, value))
            continue
        elif (c in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
                css[pos + 2] in '0123456789abcdefABCDEF?'):
            # Checked before identifiers, as 'U' and 'u' start identifiers.
            start, end, pos = _consume_unicode_range(css, pos + 2)
            tokens.append(UnicodeRangeToken(line, column, start, end))
            continue
        elif css.startswith('-->', pos):  # Check before identifiers
            tokens.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue
        elif _is_ident_start(css, pos):
            value, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                tokens.append(IdentToken(line, column, value))
                continue
            pos += 1  # Skip the '('
            if ascii_lower(value) == 'url':
                # Look past the whitespace: 'url(' followed by a quote is
                # handled as a regular function holding a string, anything
                # else is an unquoted URL token.
                url_pos = pos
                while css.startswith((' ', '\n', '\t'), url_pos):
                    url_pos += 1
                if url_pos >= length or css[url_pos] not in ('"', "'"):
                    value, pos, error = _consume_url(css, pos)
                    # A None value means a bad URL: only the error is added.
                    if value is not None:
                        # NOTE: ``repr`` shadows the builtin in this function.
                        repr = 'url({})'.format(serialize_url(value))
                        if error is not None:
                            # The input ended early: trim the end of the
                            # serialized form (presumably the closing
                            # delimiters added by the serializer -- confirm
                            # against serialize_url).
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                repr = repr[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                repr = repr[:-1]
                        tokens.append(URLToken(line, column, value, repr))
                    if error is not None:
                        tokens.append(ParseError(line, column, *error))
                    continue
            # Regular function: following tokens go into its arguments until
            # the matching ')'.
            arguments = []
            tokens.append(FunctionBlock(line, column, value, arguments))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = arguments
            continue

        match = _NUMBER_RE.match(css, pos)
        if match:
            pos = match.end()
            repr_ = css[token_start_pos:pos]
            value = float(repr_)
            # Integer only when the number has neither a fractional part nor
            # an exponent, i.e. when no regex group matched.
            int_value = int(repr_) if not any(match.groups()) else None
            if pos < length and _is_ident_start(css, pos):
                # A number directly followed by an identifier is a dimension.
                unit, pos = _consume_ident(css, pos)
                tokens.append(DimensionToken(
                    line, column, value, int_value, repr_, unit))
            elif css.startswith('%', pos):
                pos += 1
                tokens.append(PercentageToken(line, column, value, int_value, repr_))
            else:
                tokens.append(NumberToken(line, column, value, int_value, repr_))
        elif c == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                value, pos = _consume_ident(css, pos)
                tokens.append(AtKeywordToken(line, column, value))
            else:
                # A lone '@' is a plain literal.
                tokens.append(LiteralToken(line, column, '@'))
        elif c == '#':
            pos += 1
            # A hash needs at least one name character or a valid escape
            # after the '#'; unlike identifiers, it may start with a digit.
            if pos < length and (
                    css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                                '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                    ord(css[pos]) > 0x7F or  # Non-ASCII
                    # Valid escape:
                    (css[pos] == '\\' and not css.startswith('\\\n', pos))):
                # Record whether the name would also be a valid identifier.
                is_identifier = _is_ident_start(css, pos)
                value, pos = _consume_ident(css, pos)
                tokens.append(HashToken(line, column, value, is_identifier))
            else:
                tokens.append(LiteralToken(line, column, '#'))
        elif c == '{':
            # Opening a block: remember the current list and closing
            # character, then collect into the content of the new block.
            content = []
            tokens.append(CurlyBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = '}'
            tokens = content
            pos += 1
        elif c == '[':
            content = []
            tokens.append(SquareBracketsBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ']'
            tokens = content
            pos += 1
        elif c == '(':
            content = []
            tokens.append(ParenthesesBlock(line, column, content))
            stack.append((tokens, end_char))
            end_char = ')'
            tokens = content
            pos += 1
        elif c == end_char:  # Matching }, ] or )
            # The top-level end_char is None (never equal to a character),
            # so we never get here if the stack is empty.
            tokens, end_char = stack.pop()
            pos += 1
        elif c in '}])':
            # A closing character that does not close the current block.
            tokens.append(ParseError(line, column, c, 'Unmatched ' + c))
            pos += 1
        elif c in ('"', "'"):
            value, pos, error = _consume_quoted_string(css, pos)
            # A None value means a bad string: only the error is added.
            if value is not None:
                repr = '"{}"'.format(serialize_string_value(value))
                if error is not None:
                    # Unterminated string: drop the closing quote added above.
                    repr = repr[:-1]
                tokens.append(StringToken(line, column, value, repr))
            if error is not None:
                tokens.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            pos = css.find('*/', pos + 2)
            if pos == -1:
                # Unterminated comment: it extends to the end of the input.
                if not skip_comments:
                    tokens.append(Comment(line, column, css[token_start_pos + 2:]))
                break
            if not skip_comments:
                tokens.append(Comment(line, column, css[token_start_pos + 2:pos]))
            pos += 2
        elif css.startswith('<!--', pos):
            tokens.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            tokens.append(LiteralToken(line, column, '||'))
            pos += 2
        elif c in '~|^$*':
            # These characters form a two-character literal with a '='.
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                tokens.append(LiteralToken(line, column, c + '='))
            else:
                tokens.append(LiteralToken(line, column, c))
        else:
            # Any other character is a single-character literal.
            tokens.append(LiteralToken(line, column, c))
            pos += 1
    return root
198
199
def _is_name_start(css, pos):
    """Tell whether the character at ``pos`` is a name-start code point.

    Name-start code points are ASCII letters, the underscore and every
    non-ASCII character.

    """
    # https://www.w3.org/TR/css-syntax-3/#name-start-code-point
    char = css[pos]
    if 'a' <= char <= 'z' or 'A' <= char <= 'Z':
        return True
    return char == '_' or ord(char) > 0x7F
207
208
def _is_ident_start(css, pos):
    """Return True if an identifier would start at ``pos`` in ``css``.

    An identifier starts with a name-start code point, with a valid escape,
    or with a hyphen followed by a name-start code point, a second hyphen
    or a valid escape.

    """
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True
    first = css[pos]
    if first == '\\':
        # A backslash is a valid escape unless a newline follows it.
        return not css.startswith('\\\n', pos)
    if first != '-':
        return False
    after_hyphen = pos + 1
    if after_hyphen < len(css) and (
            _is_name_start(css, after_hyphen) or css[after_hyphen] == '-'):
        return True
    return (
        css.startswith('\\', after_hyphen) and
        not css.startswith('\\\n', after_hyphen))
224
225
def _consume_ident(css, pos):
    """Return (unescaped_value, new_pos).

    Consumes name characters (ASCII letters, digits, hyphen, underscore,
    non-ASCII characters) and valid escapes starting at ``pos``, and stops
    at the first character that is neither.

    Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.

    """
    # http://dev.w3.org/csswg/css-syntax/#consume-a-name
    name_chars = ('abcdefghijklmnopqrstuvwxyz-_0123456789'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    parts = []
    end = len(css)
    run_start = pos  # Start of the current run of unescaped characters.
    while pos < end:
        char = css[pos]
        if char == '\\' and not css.startswith('\\\n', pos):
            # Valid escape: flush the pending run, then add the escaped
            # character.
            parts.append(css[run_start:pos])
            unescaped, pos = _consume_escape(css, pos + 1)
            parts.append(unescaped)
            run_start = pos
            continue
        if char not in name_chars and ord(char) <= 0x7F:
            break
        pos += 1
    parts.append(css[run_start:pos])
    return ''.join(parts), pos
251
252
def _consume_quoted_string(css, pos):
    """Return (unescaped_value, new_pos, error).

    ``pos`` must be the position of the opening quote. ``error`` is None
    for a properly closed string, otherwise a ``(key, message)`` tuple:

    * ``eof-in-string``: the input ended before the closing quote; the
      value read so far is still returned.
    * ``bad-string``: an unescaped newline was found; the value is None
      and ``new_pos`` is the position of that newline.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-string-token
    quote = css[pos]
    assert quote in ('"', "'")
    pos += 1
    end = len(css)
    parts = []
    run_start = pos  # Start of the current run of unescaped characters.
    closed = False
    while pos < end:
        char = css[pos]
        if char == '\n':  # Unescaped newline
            return None, pos, ('bad-string', 'Bad string token')
        if char == quote:
            parts.append(css[run_start:pos])
            pos += 1
            closed = True
            break
        if char != '\\':
            pos += 1
            continue
        # Backslash: flush the pending run, then handle what follows.
        parts.append(css[run_start:pos])
        pos += 1
        if pos < end:
            if css[pos] == '\n':  # Ignore escaped newlines
                pos += 1
            else:
                unescaped, pos = _consume_escape(css, pos)
                parts.append(unescaped)
        # Otherwise the backslash is the last character: nothing to add.
        run_start = pos
    if closed:
        return ''.join(parts), pos, None
    parts.append(css[run_start:pos])
    return ''.join(parts), pos, ('eof-in-string', 'EOF in string')
288
289
def _consume_escape(css, pos):
    r"""Return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (one to six hex digits, optionally followed by one
    whitespace character) gives the character with that code point. U+FFFD
    is returned instead when the code point is zero, is a surrogate
    (U+D800 to U+DFFF) or is greater than the maximum code point. Any other
    character following the backslash is returned as is. At the end of the
    input, U+FFFD is returned and the position is left unchanged.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        # chr() accepts surrogate code points, but the resulting lone
        # surrogate cannot be encoded and is forbidden by the specification.
        if (codepoint == 0 or 0xD800 <= codepoint <= 0xDFFF or
                codepoint > sys.maxunicode):
            char = '\uFFFD'
        else:
            char = chr(codepoint)
        return char, hex_match.end()
    if pos < len(css):
        return css[pos], pos + 1
    return '\uFFFD', pos
307
308
def _consume_url(css, pos):
    """Return (unescaped_url, new_pos, error).

    The given pos is assumed to be just after the '(' of 'url('.

    ``error`` is None for a well-formed URL, otherwise a ``(key, message)``
    tuple:

    * ``eof-in-url``: the input ended before the closing parenthesis; the
      value read so far is still returned.
    * ``eof-in-string``: same, inside a quoted URL.
    * ``bad-url``: the URL is invalid; the value is None and ``new_pos`` is
      just after the parenthesis ending the bad URL, or at the end of the
      input.

    """
    error = None
    length = len(css)
    # https://drafts.csswg.org/css-syntax/#consume-a-url-token
    # Skip whitespace
    while css.startswith((' ', '\n', '\t'), pos):
        pos += 1
    if pos >= length:  # EOF
        return '', pos, ('eof-in-url', 'EOF in URL')
    c = css[pos]
    if c in ('"', "'"):
        value, pos, error = _consume_quoted_string(css, pos)
    elif c == ')':
        return '', pos + 1, error
    else:
        chunks = []
        start_pos = pos
        # This loop is only left with ``break`` (``value`` is then set) or
        # with ``return``.
        while 1:
            if pos >= length:  # EOF
                chunks.append(css[start_pos:pos])
                return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
            c = css[pos]
            if c == ')':
                chunks.append(css[start_pos:pos])
                pos += 1
                return ''.join(chunks), pos, error
            elif c in ' \n\t':
                # Only whitespace and ')' may follow, checked below.
                chunks.append(css[start_pos:pos])
                value = ''.join(chunks)
                pos += 1
                break
            elif c == '\\':
                if css.startswith('\\\n', pos):
                    # A backslash before a newline is not a valid escape,
                    # it makes the whole URL invalid.
                    value = None  # Parse error
                    pos += 1
                    break
                # Valid escape
                chunks.append(css[start_pos:pos])
                c, pos = _consume_escape(css, pos + 1)
                chunks.append(c)
                start_pos = pos
            elif (c in
                  '"\'('
                  # https://drafts.csswg.org/css-syntax/#non-printable-character
                  '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
                  '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
                  '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
                value = None  # Parse error
                pos += 1
                break
            else:
                pos += 1

    if value is not None:
        while css.startswith((' ', '\n', '\t'), pos):
            pos += 1
        if pos < length:
            if css[pos] == ')':
                return value, pos + 1, error
            # Anything else after the whitespace makes the URL invalid.
        else:
            if error is None:
                error = ('eof-in-url', 'EOF in URL')
            return value, pos, error

    # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
    while pos < length:
        c = css[pos]
        if c == '\\':
            # Skip the backslash together with the next character, so that
            # an escaped ')' does not end the bad URL and an escaped
            # backslash is not mistaken for the start of another escape.
            # Remaining hex digits of an escape are skipped as ordinary
            # characters by the next iterations.
            pos = min(pos + 2, length)
        elif c == ')':
            pos += 1
            break
        else:
            pos += 1
    return None, pos, ('bad-url', 'bad URL token')
384
385
def _consume_unicode_range(css, pos):
    """Return (start, end, new_pos).

    The given pos is assumed to be just after the '+' of 'U+' or 'u+'.
    ``start`` and ``end`` are the bounds of the range, as integers.

    Three forms are read: a single value of up to six hex digits, a value
    whose trailing digits are question marks (replaced by 0 for the start
    and by F for the end), or two values separated by a hyphen.

    """
    # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
    hex_digits = '0123456789abcdefABCDEF'
    total = len(css)

    def scan(index, accepted, limit):
        """Advance ``index`` over ``accepted`` characters, up to ``limit``."""
        while index < limit and css[index] in accepted:
            index += 1
        return index

    # Hex digits and question marks share a single budget of six characters.
    limit = min(pos + 6, total)
    digits_end = scan(pos, hex_digits, limit)
    prefix = css[pos:digits_end]
    pos = scan(digits_end, '?', limit)
    wildcards = pos - digits_end

    if wildcards:
        low = prefix + '0' * wildcards
        high = prefix + 'F' * wildcards
    elif (css.startswith('-', pos) and pos + 1 < total and
            css[pos + 1] in hex_digits):
        # Explicit end of range, with its own budget of six hex digits.
        range_start = pos + 1
        pos = scan(range_start, hex_digits, min(range_start + 6, total))
        low = prefix
        high = css[range_start:pos]
    else:
        low = high = prefix
    return int(low, 16), int(high, 16), pos