# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tinycss2/bytes.py: 18%
# 22 statements
# coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
from webencodings import UTF8, decode, lookup

from .parser import parse_stylesheet
def decode_stylesheet_bytes(css_bytes, protocol_encoding=None,
                            environment_encoding=None):
    """Determine the character encoding of a CSS stylesheet and decode it.

    This is based on the presence of a :abbr:`BOM (Byte Order Mark)`,
    a ``@charset`` rule, and encoding meta-information.

    The candidates are tried in this order, and the first usable one wins:

    1. ``protocol_encoding``, when it is a label that ``lookup`` recognizes;
    2. the label of a leading ``@charset "...";`` rule, when recognized;
    3. ``environment_encoding``, when given;
    4. UTF-8.

    :type css_bytes: :obj:`bytes`
    :param css_bytes: A CSS byte string.
    :type protocol_encoding: :obj:`str`
    :param protocol_encoding:
        The encoding label, if any, defined by HTTP or equivalent protocol.
        (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
    :type environment_encoding: :class:`webencodings.Encoding`
    :param environment_encoding:
        The `environment encoding
        <https://www.w3.org/TR/css-syntax/#environment-encoding>`_, if any.
    :returns:
        A 2-tuple of a decoded Unicode string and the
        :class:`webencodings.Encoding` object that was used.

    """
    # https://drafts.csswg.org/css-syntax/#the-input-byte-stream
    # NOTE(review): no BOM check is done in this function; BOM detection is
    # presumably performed by ``webencodings.decode`` itself, which is why
    # every encoding chosen below is only called a "fallback" -- confirm
    # against the webencodings documentation.
    if protocol_encoding:
        fallback = lookup(protocol_encoding)
        # A label that ``lookup`` does not resolve (falsy result) is skipped
        # and the next candidate is tried.
        if fallback:
            return decode(css_bytes, fallback)
    if css_bytes.startswith(b'@charset "'):
        # 10 is len(b'@charset "')
        # 100 is arbitrary so that no encoding label is more than 100-10 bytes.
        end_quote = css_bytes.find(b'"', 10, 100)
        # The rule only counts when the closing quote is immediately
        # followed by a semicolon, i.e. the bytes are exactly
        # ``@charset "<label>";``.
        if end_quote != -1 and css_bytes.startswith(b'";', end_quote):
            # latin1 maps every byte to a code point, so decoding the label
            # bytes cannot fail whatever they contain.
            fallback = lookup(css_bytes[10:end_quote].decode('latin1'))
            if fallback:
                # The ASCII ``@charset "`` prefix was readable byte-for-byte,
                # so the stream cannot really be UTF-16; the CSS Syntax
                # specification says to use UTF-8 in that case.
                if fallback.name in ('utf-16be', 'utf-16le'):
                    return decode(css_bytes, UTF8)
                return decode(css_bytes, fallback)
    if environment_encoding:
        return decode(css_bytes, environment_encoding)
    # Ultimate fallback when nothing else applied.
    return decode(css_bytes, UTF8)
def parse_stylesheet_bytes(css_bytes, protocol_encoding=None,
                           environment_encoding=None,
                           skip_comments=False, skip_whitespace=False):
    """Parse :diagram:`stylesheet` from bytes,
    determining the character encoding as web browsers do.

    This is used when reading a file or fetching a URL.
    The character encoding is determined from the initial bytes
    (a :abbr:`BOM (Byte Order Mark)` or a ``@charset`` rule)
    as well as the parameters. The ultimate fallback is UTF-8.

    :type css_bytes: :obj:`bytes`
    :param css_bytes: A CSS byte string.
    :type protocol_encoding: :obj:`str`
    :param protocol_encoding:
        The encoding label, if any, defined by HTTP or equivalent protocol.
        (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
    :type environment_encoding: :class:`webencodings.Encoding`
    :param environment_encoding:
        The `environment encoding`_, if any.
    :type skip_comments: :obj:`bool`
    :param skip_comments:
        Ignore CSS comments at the top-level of the stylesheet.
        If the input is a string, ignore all comments.
    :type skip_whitespace: :obj:`bool`
    :param skip_whitespace:
        Ignore whitespace at the top-level of the stylesheet.
        Whitespace is still preserved
        in the :attr:`~tinycss2.ast.QualifiedRule.prelude`
        and the :attr:`~tinycss2.ast.QualifiedRule.content` of rules.
    :returns:
        A ``(rules, encoding)`` tuple.

        * ``rules`` is a list of
          :class:`~tinycss2.ast.QualifiedRule`,
          :class:`~tinycss2.ast.AtRule`,
          :class:`~tinycss2.ast.Comment` (if ``skip_comments`` is false),
          :class:`~tinycss2.ast.WhitespaceToken`
          (if ``skip_whitespace`` is false),
          and :class:`~tinycss2.ast.ParseError` objects.
        * ``encoding`` is the :class:`webencodings.Encoding` object
          that was used.
          If ``rules`` contains an ``@import`` rule, this is
          the `environment encoding`_ for the imported stylesheet.

    .. _environment encoding:
            https://www.w3.org/TR/css-syntax/#environment-encoding

    .. code-block:: python

        response = urlopen('http://example.net/foo.css')
        rules, encoding = parse_stylesheet_bytes(
            css_bytes=response.read(),
            protocol_encoding=response.info().get_content_charset(),
        )
        for rule in rules:
            ...

    """
    # Decode first, then hand the resulting text to the regular parser; the
    # encoding actually used is returned alongside the parsed rules.
    css_unicode, encoding = decode_stylesheet_bytes(
        css_bytes, protocol_encoding, environment_encoding)
    stylesheet = parse_stylesheet(css_unicode, skip_comments, skip_whitespace)
    return stylesheet, encoding