1"""
2babel.messages.jslexer
3~~~~~~~~~~~~~~~~~~~~~~
4
5A simple JavaScript 1.5 lexer which is used for the JavaScript
6extractor.
7
8:copyright: (c) 2013-2025 by the Babel Team.
9:license: BSD, see LICENSE for more details.
10"""

from __future__ import annotations

import re
from collections.abc import Generator
from typing import NamedTuple

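# All operators, sorted longest-first so that multi-character operators
# such as `>>>=` are matched before their shorter prefixes in the
# operator regex built below.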
operators: list[str] = sorted([
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':',
], key=len, reverse=True)  # fmt: skip

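# Single-letter escape sequences and the characters they represent.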
escapes: dict[str, str] = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
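# `division_re` and `regex_re` disambiguate a leading slash: which one is
# tried depends on the preceding token (see `indicates_division` below).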
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
hex_escape_re = re.compile(r'[a-fA-F0-9]{1,2}')


class Token(NamedTuple):
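    """A token produced by the lexer: its type, raw source text and line number."""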
    type: str
    value: str
    lineno: int


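# The master rule table. Rules are tried in source order and the first
# match wins; `get_rules` filters and renames entries based on the
# requested syntax options.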
_rules: list[tuple[str | None, re.Pattern[str]]] = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('dotted_name', dotted_name_re),
    ('name', name_re),
    ('number', re.compile(r'''(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''', re.VERBOSE)),
    ('jsx_tag', re.compile(r'(?:</?[^>\s]+|/>)', re.I)),  # May be mangled in `get_rules`
    ('operator', re.compile(r'(%s)' % '|'.join(re.escape(op) for op in operators))),
    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL)),
]  # fmt: skip


def get_rules(
    jsx: bool,
    dotted: bool,
    template_string: bool,
) -> list[tuple[str | None, re.Pattern[str]]]:
    """
    Get a tokenization rule list given the passed syntax options.

    Internal to this module.
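
    For instance, with every syntax option disabled, only the base rules
    remain (an illustrative doctest)::

        >>> rules = get_rules(jsx=False, dotted=False, template_string=False)
        >>> [name for name, _ in rules if name]
        ['linecomment', 'multilinecomment', 'name', 'number', 'operator', 'string']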
75 """
    rules = []
    for token_type, rule in _rules:
        if not jsx and token_type and 'jsx' in token_type:
            continue
        if not template_string and token_type == 'template_string':
            continue
        if token_type == 'dotted_name':
            if not dotted:
                continue
            token_type = 'name'
        rules.append((token_type, rule))
    return rules


def indicates_division(token: Token) -> bool:
    """A helper function that tells the tokenizer whether the current
    token may be followed by a division operator.
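
    For example (an illustrative doctest)::

        >>> indicates_division(Token('number', '42', 1))
        True
        >>> indicates_division(Token('operator', '(', 1))
        False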
93 """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')


def unquote_string(string: str) -> str:
    """Unquote a string with JavaScript rules. The string has to be
    enclosed in matching delimiters (``'``, ``"``, or the backtick/grave
    accent for template strings).
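
    For example (an illustrative doctest; the raw string passes a literal
    ``\\n`` escape through to the function)::

        >>> unquote_string("'Hello World'")
        'Hello World'
        >>> unquote_string(r'"hello\\nworld"')
        'hello\\nworld'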
102 """
    assert string and string[0] == string[-1] and string[0] in '"\'`', (
        'string provided is not properly delimited'
    )
    string = line_join_re.sub('\\1', string[1:-1])
    result: list[str] = []
    add = result.append
    pos = 0

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes. try to consume up to four hexadecimal
        # characters and interpret them as a Unicode code point. If
        # there is no such code point, put all the consumed characters
        # into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(chr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # hex escapes. conversion from one or two hex digits to a
        # character is infallible
        elif next_char in 'xX':
            escaped = hex_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                add(chr(int(escaped_value, 16)))
                pos = escape_pos + 2 + len(escaped_value)
                continue
            else:
                add(next_char)

        # bogus escape. Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return ''.join(result)


def tokenize(
    source: str,
    jsx: bool = True,
    dotted: bool = True,
    template_string: bool = True,
    lineno: int = 1,
) -> Generator[Token, None, None]:
174 """
175 Tokenize JavaScript/JSX source. Returns a generator of tokens.
176
177 :param source: The JavaScript source to tokenize.
178 :param jsx: Enable (limited) JSX parsing.
179 :param dotted: Read dotted names as single name token.
180 :param template_string: Support ES6 template strings
181 :param lineno: starting line number (optional)
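
    A small usage sketch (illustrative)::

        >>> for token in tokenize("gettext('Hello')"):
        ...     print(token)
        Token(type='name', value='gettext', lineno=1)
        Token(type='operator', value='(', lineno=1)
        Token(type='string', value="'Hello'", lineno=1)
        Token(type='operator', value=')', lineno=1)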
182 """
    may_divide = False
    pos = 0
    end = len(source)
    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:  # noqa: B007
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide`, which is determined by the last
        # processed non-whitespace token using `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
        if match is None:
            # whoops, invalid syntax: jump one char ahead and try again.
            pos += 1
            continue

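        # emit the token, update the division context and advance past the match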
        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()