Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tinycss2/tokenizer.py: 5%
297 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1import re
2import sys
4from webencodings import ascii_lower
6from .ast import (
7 AtKeywordToken, Comment, CurlyBracketsBlock, DimensionToken, FunctionBlock,
8 HashToken, IdentToken, LiteralToken, NumberToken, ParenthesesBlock,
9 ParseError, PercentageToken, SquareBracketsBlock, StringToken,
10 UnicodeRangeToken, URLToken, WhitespaceToken)
11from .serializer import serialize_string_value, serialize_url
# CSS number: optional sign, optional "digits then dot" prefix (group 1),
# one or more digits, optional exponent (group 2). The tokenizer treats a
# match with neither group set as an integer.
13_NUMBER_RE = re.compile(r'[-+]?([0-9]*\.)?[0-9]+([eE][+-]?[0-9]+)?')
# Hexadecimal escape body: one to six hex digits (group 1) followed by at
# most one whitespace character, which is consumed as part of the escape.
14_HEX_ESCAPE_RE = re.compile(r'([0-9A-Fa-f]{1,6})[ \n\t]?')
def parse_component_value_list(css, skip_comments=False):
    """Tokenize a CSS string into a nested list of component values.

    :type css: :obj:`str`
    :param css: A CSS string.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments.
        When true, neither the returned list nor any of its nested blocks
        and functions contains a :class:`~tinycss2.ast.Comment` object.
    :returns: A list of :term:`component values`.

    """
    # Preprocessing: NULL becomes U+FFFD and every newline flavour
    # (CRLF, CR, form feed) is turned into a single LF.
    css = css.replace('\0', '\uFFFD')
    for newline_variant in ('\r\n', '\r', '\f'):
        css = css.replace(newline_variant, '\n')

    length = len(css)
    pos = 0  # Character index in the css source.
    token_start = 0  # Index at which the previous token started.
    line = 1  # First line is line 1.
    last_newline = -1  # Index of the last newline before the token.
    result = []  # Top-level list returned to the caller.
    current = result  # List receiving the tokens being produced.
    closer = None  # Character that closes the innermost open block.
    parents = []  # Enclosing (token list, closer) pairs, innermost last.
    # Opening bracket -> (block class, matching closing bracket).
    simple_blocks = {
        '{': (CurlyBracketsBlock, '}'),
        '[': (SquareBracketsBlock, ']'),
        '(': (ParenthesesBlock, ')'),
    }

    while pos < length:
        # Count the newlines contained in the token consumed by the
        # previous iteration before locating the new token.
        newline = css.rfind('\n', token_start, pos)
        if newline != -1:
            line += 1 + css.count('\n', token_start, newline)
            last_newline = newline
        # First character in a line is in column 1.
        column = pos - last_newline
        token_start = pos
        char = css[pos]

        if char in ' \n\t':
            pos += 1
            while pos < length and css[pos] in ' \n\t':
                pos += 1
            current.append(
                WhitespaceToken(line, column, css[token_start:pos]))
            continue

        if (char in 'Uu' and pos + 2 < length and css[pos + 1] == '+' and
                css[pos + 2] in '0123456789abcdefABCDEF?'):
            range_start, range_end, pos = _consume_unicode_range(
                css, pos + 2)
            current.append(
                UnicodeRangeToken(line, column, range_start, range_end))
            continue

        if css.startswith('-->', pos):  # Must be tried before identifiers
            current.append(LiteralToken(line, column, '-->'))
            pos += 3
            continue

        if _is_ident_start(css, pos):
            name, pos = _consume_ident(css, pos)
            if not css.startswith('(', pos):  # Not a function
                current.append(IdentToken(line, column, name))
                continue
            pos += 1  # Skip the '('
            if ascii_lower(name) == 'url':
                # Look past whitespace: only an argument that does not
                # start with a quote is read as a URL token; a quoted one
                # falls through to the generic function handling below.
                lookahead = pos
                while css.startswith((' ', '\n', '\t'), lookahead):
                    lookahead += 1
                if lookahead >= length or css[lookahead] not in ('"', "'"):
                    url, pos, error = _consume_url(css, pos)
                    if url is not None:
                        representation = 'url({})'.format(
                            serialize_url(url))
                        if error is not None:
                            # Trim the closing characters that the source
                            # did not actually contain.
                            error_key = error[0]
                            if error_key == 'eof-in-string':
                                representation = representation[:-2]
                            else:
                                assert error_key == 'eof-in-url'
                                representation = representation[:-1]
                        current.append(
                            URLToken(line, column, url, representation))
                    if error is not None:
                        current.append(ParseError(line, column, *error))
                    continue
            arguments = []
            current.append(FunctionBlock(line, column, name, arguments))
            parents.append((current, closer))
            closer = ')'
            current = arguments
            continue

        number_match = _NUMBER_RE.match(css, pos)
        if number_match:
            pos = number_match.end()
            number_repr = css[token_start:pos]
            value = float(number_repr)
            # An integer value exists only when no regex group matched.
            if any(number_match.groups()):
                int_value = None
            else:
                int_value = int(number_repr)
            if pos < length and _is_ident_start(css, pos):
                unit, pos = _consume_ident(css, pos)
                current.append(DimensionToken(
                    line, column, value, int_value, number_repr, unit))
            elif css.startswith('%', pos):
                pos += 1
                current.append(PercentageToken(
                    line, column, value, int_value, number_repr))
            else:
                current.append(NumberToken(
                    line, column, value, int_value, number_repr))
        elif char == '@':
            pos += 1
            if pos < length and _is_ident_start(css, pos):
                keyword, pos = _consume_ident(css, pos)
                current.append(AtKeywordToken(line, column, keyword))
            else:
                current.append(LiteralToken(line, column, '@'))
        elif char == '#':
            pos += 1
            starts_name = pos < length and (
                css[pos] in '0123456789abcdefghijklmnopqrstuvwxyz'
                            '-_ABCDEFGHIJKLMNOPQRSTUVWXYZ' or
                ord(css[pos]) > 0x7F or  # Non-ASCII
                # Valid escape:
                (css[pos] == '\\' and not css.startswith('\\\n', pos)))
            if starts_name:
                is_identifier = _is_ident_start(css, pos)
                hash_value, pos = _consume_ident(css, pos)
                current.append(
                    HashToken(line, column, hash_value, is_identifier))
            else:
                current.append(LiteralToken(line, column, '#'))
        elif char in simple_blocks:
            # Open a {}, [] or () block and descend into its content.
            block_class, block_closer = simple_blocks[char]
            content = []
            current.append(block_class(line, column, content))
            parents.append((current, closer))
            closer = block_closer
            current = content
            pos += 1
        elif char == closer:  # Matching }, ] or )
            # The top-level closer is None (never equal to a character),
            # so we never get here if the stack is empty.
            current, closer = parents.pop()
            pos += 1
        elif char in '}])':
            current.append(
                ParseError(line, column, char, 'Unmatched ' + char))
            pos += 1
        elif char in ('"', "'"):
            string, pos, error = _consume_quoted_string(css, pos)
            if string is not None:
                representation = '"{}"'.format(
                    serialize_string_value(string))
                if error is not None:
                    # Drop the closing quote missing from the source.
                    representation = representation[:-1]
                current.append(
                    StringToken(line, column, string, representation))
            if error is not None:
                current.append(ParseError(line, column, *error))
        elif css.startswith('/*', pos):  # Comment
            comment_end = css.find('*/', pos + 2)
            if comment_end == -1:
                # Unterminated comment: it runs to the end of the input.
                if not skip_comments:
                    current.append(
                        Comment(line, column, css[token_start + 2:]))
                break
            if not skip_comments:
                current.append(Comment(
                    line, column, css[token_start + 2:comment_end]))
            pos = comment_end + 2
        elif css.startswith('<!--', pos):
            current.append(LiteralToken(line, column, '<!--'))
            pos += 4
        elif css.startswith('||', pos):
            current.append(LiteralToken(line, column, '||'))
            pos += 2
        elif char in '~|^$*':
            # These characters combine with a following '=' into one
            # two-character literal.
            pos += 1
            if css.startswith('=', pos):
                pos += 1
                current.append(LiteralToken(line, column, char + '='))
            else:
                current.append(LiteralToken(line, column, char))
        else:
            current.append(LiteralToken(line, column, char))
            pos += 1
    return result
204def _is_name_start(css, pos):
205 """Return true if the given character is a name-start code point."""
206 # https://www.w3.org/TR/css-syntax-3/#name-start-code-point
207 c = css[pos]
208 return (
209 c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' or
210 ord(c) > 0x7F)
def _is_ident_start(css, pos):
    """Tell whether a CSS identifier would start at ``css[pos]``."""
    # https://drafts.csswg.org/css-syntax/#would-start-an-identifier
    if _is_name_start(css, pos):
        return True
    first = css[pos]
    if first == '\\':
        # A backslash starts an identifier unless it escapes a newline.
        return not css.startswith('\\\n', pos)
    if first != '-':
        return False
    # After a hyphen, accept a name-start code point, a second hyphen
    # or a valid escape.
    after = pos + 1
    if after < len(css) and (
            _is_name_start(css, after) or css[after] == '-'):
        return True
    return css.startswith('\\', after) and not css.startswith('\\\n', after)
231def _consume_ident(css, pos):
232 """Return (unescaped_value, new_pos).
234 Assumes pos starts at a valid identifier. See :func:`_is_ident_start`.
236 """
237 # http://dev.w3.org/csswg/css-syntax/#consume-a-name
238 chunks = []
239 length = len(css)
240 start_pos = pos
241 while pos < length:
242 c = css[pos]
243 if c in ('abcdefghijklmnopqrstuvwxyz-_0123456789'
244 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') or ord(c) > 0x7F:
245 pos += 1
246 elif c == '\\' and not css.startswith('\\\n', pos):
247 # Valid escape
248 chunks.append(css[start_pos:pos])
249 c, pos = _consume_escape(css, pos + 1)
250 chunks.append(c)
251 start_pos = pos
252 else:
253 break
254 chunks.append(css[start_pos:pos])
255 return ''.join(chunks), pos
258def _consume_quoted_string(css, pos):
259 """Return (unescaped_value, new_pos)."""
260 # https://drafts.csswg.org/css-syntax/#consume-a-string-token
261 error = None
262 quote = css[pos]
263 assert quote in ('"', "'")
264 pos += 1
265 chunks = []
266 length = len(css)
267 start_pos = pos
268 while pos < length:
269 c = css[pos]
270 if c == quote:
271 chunks.append(css[start_pos:pos])
272 pos += 1
273 break
274 elif c == '\\':
275 chunks.append(css[start_pos:pos])
276 pos += 1
277 if pos < length:
278 if css[pos] == '\n': # Ignore escaped newlines
279 pos += 1
280 else:
281 c, pos = _consume_escape(css, pos)
282 chunks.append(c)
283 # else: Escaped EOF, do nothing
284 start_pos = pos
285 elif c == '\n': # Unescaped newline
286 return None, pos, ('bad-string', 'Bad string token')
287 else:
288 pos += 1
289 else:
290 error = ('eof-in-string', 'EOF in string')
291 chunks.append(css[start_pos:pos])
292 return ''.join(chunks), pos, error
def _consume_escape(css, pos):
    r"""Return (unescaped_char, new_pos).

    Assumes a valid escape: pos is just after '\' and not followed by '\n'.

    A hexadecimal escape (one to six hex digits, plus at most one trailing
    whitespace character that is consumed with it) yields the character
    with that code point. U+FFFD is returned instead when the code point is
    zero, is a surrogate or is greater than the largest Unicode code point.
    Any other character stands for itself, and an escape at the very end of
    the input yields U+FFFD without advancing.

    """
    # https://drafts.csswg.org/css-syntax/#consume-an-escaped-character
    hex_match = _HEX_ESCAPE_RE.match(css, pos)
    if hex_match:
        codepoint = int(hex_match.group(1), 16)
        # chr() would happily build a lone surrogate, which is not a valid
        # character and cannot be encoded later on; the specification
        # requires the replacement character for it.
        if (codepoint == 0 or 0xD800 <= codepoint <= 0xDFFF or
                codepoint > sys.maxunicode):
            return '\uFFFD', hex_match.end()
        return chr(codepoint), hex_match.end()
    if pos < len(css):
        return css[pos], pos + 1
    return '\uFFFD', pos
314def _consume_url(css, pos):
315 """Return (unescaped_url, new_pos)
317 The given pos is assumed to be just after the '(' of 'url('.
319 """
320 error = None
321 length = len(css)
322 # https://drafts.csswg.org/css-syntax/#consume-a-url-token
323 # Skip whitespace
324 while css.startswith((' ', '\n', '\t'), pos):
325 pos += 1
326 if pos >= length: # EOF
327 return '', pos, ('eof-in-url', 'EOF in URL')
328 c = css[pos]
329 if c in ('"', "'"):
330 value, pos, error = _consume_quoted_string(css, pos)
331 elif c == ')':
332 return '', pos + 1, error
333 else:
334 chunks = []
335 start_pos = pos
336 while 1:
337 if pos >= length: # EOF
338 chunks.append(css[start_pos:pos])
339 return ''.join(chunks), pos, ('eof-in-url', 'EOF in URL')
340 c = css[pos]
341 if c == ')':
342 chunks.append(css[start_pos:pos])
343 pos += 1
344 return ''.join(chunks), pos, error
345 elif c in ' \n\t':
346 chunks.append(css[start_pos:pos])
347 value = ''.join(chunks)
348 pos += 1
349 break
350 elif c == '\\' and not css.startswith('\\\n', pos):
351 # Valid escape
352 chunks.append(css[start_pos:pos])
353 c, pos = _consume_escape(css, pos + 1)
354 chunks.append(c)
355 start_pos = pos
356 elif (c in
357 '"\'('
358 # https://drafts.csswg.org/css-syntax/#non-printable-character
359 '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0e'
360 '\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19'
361 '\x1a\x1b\x1c\x1d\x1e\x1f\x7f'):
362 value = None # Parse error
363 pos += 1
364 break
365 else:
366 pos += 1
368 if value is not None:
369 while css.startswith((' ', '\n', '\t'), pos):
370 pos += 1
371 if pos < length:
372 if css[pos] == ')':
373 return value, pos + 1, error
374 else:
375 if error is None:
376 error = ('eof-in-url', 'EOF in URL')
377 return value, pos, error
379 # https://drafts.csswg.org/css-syntax/#consume-the-remnants-of-a-bad-url0
380 while pos < length:
381 if css.startswith('\\)', pos):
382 pos += 2
383 elif css[pos] == ')':
384 pos += 1
385 break
386 else:
387 pos += 1
388 return None, pos, ('bad-url', 'bad URL token')
391def _consume_unicode_range(css, pos):
392 """Return (range, new_pos)
394 The given pos is assume to be just after the '+' of 'U+' or 'u+'.
396 """
397 # https://drafts.csswg.org/css-syntax/#consume-a-unicode-range-token
398 length = len(css)
399 start_pos = pos
400 max_pos = min(pos + 6, length)
401 while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':
402 pos += 1
403 start = css[start_pos:pos]
405 start_pos = pos
406 # Same max_pos as before: total of hex digits and question marks <= 6
407 while pos < max_pos and css[pos] == '?':
408 pos += 1
409 question_marks = pos - start_pos
411 if question_marks:
412 end = start + 'F' * question_marks
413 start = start + '0' * question_marks
414 elif (pos + 1 < length and css[pos] == '-' and
415 css[pos + 1] in '0123456789abcdefABCDEF'):
416 pos += 1
417 start_pos = pos
418 max_pos = min(pos + 6, length)
419 while pos < max_pos and css[pos] in '0123456789abcdefABCDEF':
420 pos += 1
421 end = css[start_pos:pos]
422 else:
423 end = start
424 return int(start, 16), int(end, 16), pos