1"""
2 babel.messages.jslexer
3 ~~~~~~~~~~~~~~~~~~~~~~
4
5 A simple JavaScript 1.5 lexer which is used for the JavaScript
6 extractor.
7
8 :copyright: (c) 2013-2024 by the Babel Team.
9 :license: BSD, see LICENSE for more details.
10"""
11from __future__ import annotations
12
13import re
14from collections.abc import Generator
15from typing import NamedTuple
16
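# Operators are sorted longest-first so that the alternation compiled into
# the 'operator' rule below matches e.g. '>>>=' before '>>>' or '>>'.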
operators: list[str] = sorted([
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':',
], key=len, reverse=True)

escapes: dict[str, str] = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

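# ``division_re`` and ``regex_re`` are deliberately kept out of the rule
# table below: ``tokenize`` falls back to them only when no regular rule
# matches, picking between the two based on the ``may_divide`` flag that
# ``indicates_division`` maintains. ``line_join_re`` matches a backslash
# followed by a line break (a JavaScript line continuation inside a string
# literal), which ``unquote_string`` strips.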
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_]', re.UNICODE)  # may not end with a dot
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
hex_escape_re = re.compile(r'[a-fA-F0-9]{1,2}')


class Token(NamedTuple):
    type: str
    value: str
    lineno: int


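# The rules below are tried in order, so 'dotted_name' must stay ahead of
# 'name'; otherwise 'foo.bar' would lex as a name, an operator and a name.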
_rules: list[tuple[str | None, re.Pattern[str]]] = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('dotted_name', dotted_name_re),
    ('name', name_re),
    ('number', re.compile(r'''(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''', re.VERBOSE)),
    ('jsx_tag', re.compile(r'(?:</?[^>\s]+|/>)', re.I)),  # May be mangled in `get_rules`
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL)),
]


def get_rules(jsx: bool, dotted: bool, template_string: bool) -> list[tuple[str | None, re.Pattern[str]]]:
    """
    Get a tokenization rule list given the passed syntax options.

    Internal to this module.
    """
    rules = []
    for token_type, rule in _rules:
        if not jsx and token_type and 'jsx' in token_type:
            continue
        if not template_string and token_type == 'template_string':
            continue
        if token_type == 'dotted_name':
            if not dotted:
                continue
            token_type = 'name'
        rules.append((token_type, rule))
    return rules


def indicates_division(token: Token) -> bool:
    """Decide whether the token that was just processed may be followed by
    a division operator rather than a regular expression literal: division
    is only possible after a closing bracket, a postfix ``++``/``--``, or a
    name, number, string or regexp token.
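
    Two doctest-style examples, using the ``Token`` tuple defined above:

        >>> indicates_division(Token('operator', ')', 1))
        True
        >>> indicates_division(Token('operator', '(', 1))
        False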
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')


def unquote_string(string: str) -> str:
    """Unquote a string with JavaScript rules.  The string has to be wrapped
    in matching delimiters: ``'``, ``"``, or the backtick/grave accent for
    template strings.
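
    A couple of quick doctest-style examples:

        >>> unquote_string('"hello"')
        'hello'
        >>> unquote_string('`template`')
        'template'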
    """
    assert string and string[0] == string[-1] and string[0] in '"\'`', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    result: list[str] = []
    add = result.append
    pos = 0

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes. try to consume up to four hexadecimal
        # characters and interpret them as a unicode code point. If
        # that is not possible, put all the consumed characters back
        # into the string as-is.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(chr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # hex escapes. conversion from 1-2 hex digits to a char cannot fail
        elif next_char in 'xX':
            escaped = hex_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                add(chr(int(escaped_value, 16)))
                pos = escape_pos + 2 + len(escaped_value)
                continue
            else:
                add(next_char)

        # bogus escape. Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return ''.join(result)


def tokenize(source: str, jsx: bool = True, dotted: bool = True, template_string: bool = True, lineno: int = 1) -> Generator[Token, None, None]:
    """
    Tokenize JavaScript/JSX source.  Returns a generator of tokens.

    :param source: The JavaScript source to tokenize.
    :param jsx: Enable (limited) JSX parsing.
    :param dotted: Read dotted names as a single name token.
    :param template_string: Support ES6 template strings.
    :param lineno: Starting line number (optional).
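
    For example (whitespace produces no token at all):

        >>> [(tok.type, tok.value) for tok in tokenize("gettext('Hello')")]
        [('name', 'gettext'), ('operator', '('), ('string', "'Hello'"), ('operator', ')')]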
    """
    may_divide = False
    pos = 0
    end = len(source)
    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:  # noqa: B007
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed non-whitespace token using `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                # oops, invalid syntax: skip one character and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()
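

if __name__ == '__main__':
    # A small usage sketch rather than part of the API: lex a call to a
    # hypothetical ``gettext`` function and unquote its string argument.
    demo_tokens = list(tokenize("gettext('Hello, World!') // greet"))
    for demo_token in demo_tokens:
        print(demo_token)
    demo_strings = [
        unquote_string(tok.value) for tok in demo_tokens if tok.type == 'string'
    ]
    assert demo_strings == ['Hello, World!']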