Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/babel/messages/jslexer.py: 58%


1""" 

2 babel.messages.jslexer 

3 ~~~~~~~~~~~~~~~~~~~~~~ 

4 

5 A simple JavaScript 1.5 lexer which is used for the JavaScript 

6 extractor. 

7 

8 :copyright: (c) 2013-2024 by the Babel Team. 

9 :license: BSD, see LICENSE for more details. 

10""" 

11from __future__ import annotations 

12 

13import re 

14from collections.abc import Generator 

15from typing import NamedTuple 

16 

17operators: list[str] = sorted([ 

18 '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=', 

19 '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=', 

20 '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')', 

21 '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':', 

22], key=len, reverse=True) 
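# Note (editorial): the longest-first sort above matters because the
# `operator` rule in `_rules` joins these into a single regex alternation,
# and `re` tries alternatives left to right; sorting by length ensures
# '>>>=' is tried before '>>>' and '>>'.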

escapes: dict[str, str] = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
hex_escape_re = re.compile(r'[a-fA-F0-9]{1,2}')


class Token(NamedTuple):
    type: str
    value: str
    lineno: int


_rules: list[tuple[str | None, re.Pattern[str]]] = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('dotted_name', dotted_name_re),
    ('name', name_re),
    ('number', re.compile(r'''(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''', re.VERBOSE)),
    ('jsx_tag', re.compile(r'(?:</?[^>\s]+|/>)', re.I)),  # May be mangled in `get_rules`
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL)),
]


def get_rules(jsx: bool, dotted: bool, template_string: bool) -> list[tuple[str | None, re.Pattern[str]]]:
    """
    Get a tokenization rule list given the passed syntax options.

    Internal to this module.
    """
    rules = []
    for token_type, rule in _rules:
        if not jsx and token_type and 'jsx' in token_type:
            continue
        if not template_string and token_type == 'template_string':
            continue
        if token_type == 'dotted_name':
            if not dotted:
                continue
            token_type = 'name'
        rules.append((token_type, rule))
    return rules
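
# Illustrative sketch (editorial, not part of the original module): with
# ``jsx=False`` the ``jsx_tag`` rule is dropped, with ``template_string=False``
# the ``template_string`` rule is dropped, and with ``dotted=True`` the
# ``dotted_name`` rule is kept but re-labeled ``'name'``:
#
#     >>> [t for t, _ in get_rules(jsx=False, dotted=True, template_string=False)]
#     [None, None, 'linecomment', 'multilinecomment', 'name', 'name', 'number', 'operator', 'string']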


def indicates_division(token: Token) -> bool:
    """A helper the tokenizer uses to decide whether the current token
    may be followed by a division operator.
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')
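
# Example (editorial sketch): after ``)`` a following ``/`` is division,
# while after ``=`` it starts a regular expression literal; this is what
# lets the tokenizer tell ``(a + b) / 2`` apart from ``x = /ab+c/g``:
#
#     >>> indicates_division(Token('operator', ')', 1))
#     True
#     >>> indicates_division(Token('operator', '=', 1))
#     False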


def unquote_string(string: str) -> str:
    """Unquote a string with JavaScript rules. The string has to start and
    end with string delimiters (``'``, ``"``, or a backtick/grave accent
    for template strings).
    """
    assert string and string[0] == string[-1] and string[0] in '"\'`', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    result: list[str] = []
    add = result.append
    pos = 0

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes. try to consume up to four hexadecimal
        # characters and interpret them as a unicode code point. If
        # there is no such code point, put all the consumed
        # characters into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(chr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # hex escapes. conversion from 2-digit hex to char is infallible
        elif next_char in 'xX':
            escaped = hex_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                add(chr(int(escaped_value, 16)))
                pos = escape_pos + 2 + len(escaped_value)
                continue
            else:
                add(next_char)

        # bogus escape. Just remove the backslash.
        else:
            add(next_char)

        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return ''.join(result)
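
# Worked example (editorial addition, not in the original source): unicode
# and hex escapes are decoded, and an unknown escape just loses its backslash:
#
#     >>> unquote_string(r'"\u00e9tat\x21"')
#     'état!'
#     >>> unquote_string(r"'\q'")
#     'q'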


def tokenize(source: str, jsx: bool = True, dotted: bool = True, template_string: bool = True, lineno: int = 1) -> Generator[Token, None, None]:
    """
    Tokenize JavaScript/JSX source. Returns a generator of tokens.

    :param jsx: Enable (limited) JSX parsing.
    :param dotted: Read dotted names as single name token.
    :param template_string: Support ES6 template strings.
    :param lineno: Starting line number (optional).
    """
    may_divide = False
    pos = 0
    end = len(source)
    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:  # noqa: B007
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed non-whitespace token using `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                # whoops. invalid syntax. jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()
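
# Usage sketch (editorial addition, not part of the module): feeding a small
# snippet through the tokenizer. Rules with a ``None`` token type (whitespace,
# HTML comment openers) match but yield nothing:
#
#     >>> for tok in tokenize("msg = gettext('Hello')"):
#     ...     print(tok.type, tok.value)
#     name msg
#     operator =
#     name gettext
#     operator (
#     string 'Hello'
#     operator )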