Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/google/cloud/bigquery/magics/line_arg_parser/lexer.py: 100%

29 statements  

coverage.py v7.2.2, created at 2023-03-26 06:07 +0000

# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from collections import OrderedDict
import itertools
import re

import enum


Token = namedtuple("Token", ("type_", "lexeme", "pos"))
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))
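
# A Token is one classified piece of the input: ``type_`` is the token type
# name, ``lexeme`` the matched text, and ``pos`` its start offset in the input.
# A StateTransition instructs the lexer loop to switch to ``new_state`` and
# resume scanning the input at ``total_offset``.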

# Pattern matching is done with regexes, and the order in which the token patterns are
# defined is important.
#
# Suppose we had the following token definitions:
# * INT - a token matching integers,
# * FLOAT - a token matching floating point numbers,
# * DOT - a token matching a single literal dot character, i.e. "."
#
# The FLOAT token would have to be defined first, since we would want the input "1.23"
# to be tokenized as a single FLOAT token, and *not* three tokens (INT, DOT, INT).
#
# Sometimes, however, different tokens match overly similar patterns, and it is not
# possible to define them in an order that would avoid all ambiguity. One such case is
# the OPT_VAL and PY_NUMBER pair of tokens, as both can match an integer literal,
# say "42".
#
# To avoid such dilemmas, the lexer implements the concept of STATES. States are used
# to split token definitions into subgroups, and in each lexer state only a single
# subgroup is used for tokenizing the input. Lexer states can therefore be thought of
# as token namespaces.
#
# For example, while parsing the value of the "--params" option, we do not want to
# "recognize" it as a single OPT_VAL token, but instead want to parse it as a Python
# dictionary and verify its syntactical correctness. On the other hand, while parsing
# the value of an option other than "--params", we do not really care about its
# structure, and thus do not want to use any of the "Python tokens" for pattern matching.
#
# Token definition order is important, thus an OrderedDict is used. In addition, PEP 468
# guarantees that the order of kwargs is preserved in Python 3.6+.
token_types = OrderedDict(
    state_parse_pos_args=OrderedDict(
        GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--))",  # double dash starts the options list
        DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python identifier
    ),
    state_parse_non_params_options=OrderedDict(
        GOTO_PARSE_PARAMS_OPTION=r"(?P<GOTO_PARSE_PARAMS_OPTION>(?=--params(?:\s|=|--|$)))",  # the --params option
        OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
        OPTION_EQ=r"(?P<OPTION_EQ>=)",
        OPT_VAL=r"(?P<OPT_VAL>\S+?(?=\s|--|$))",
    ),
    state_parse_params_option=OrderedDict(
        PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format(  # single and double quoted strings
            r"'(?:[^'\\]|\\.)*'", r'"(?:[^"\\]|\\.)*"'
        ),
        PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|=|--|$))",
        PARAMS_OPT_EQ=r"(?P<PARAMS_OPT_EQ>=)",
        GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--\w+))",  # found another option spec
        PY_BOOL=r"(?P<PY_BOOL>True|False)",
        DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
        PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(?:[eE][+-]?\d+)?)",
        SQUOTE=r"(?P<SQUOTE>')",
        DQUOTE=r'(?P<DQUOTE>")',
        COLON=r"(?P<COLON>:)",
        COMMA=r"(?P<COMMA>,)",
        LCURL=r"(?P<LCURL>\{)",
        RCURL=r"(?P<RCURL>})",
        LSQUARE=r"(?P<LSQUARE>\[)",
        RSQUARE=r"(?P<RSQUARE>])",
        LPAREN=r"(?P<LPAREN>\()",
        RPAREN=r"(?P<RPAREN>\))",
    ),
    common=OrderedDict(
        WS=r"(?P<WS>\s+)",
        EOL=r"(?P<EOL>$)",
        UNKNOWN=r"(?P<UNKNOWN>\S+)",  # anything that is not whitespace and was not matched above
    ),
)
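
# Illustrative sketch of the ordering rule described above (the FLOAT, INT, and
# DOT names are hypothetical and exist only for this demonstration): regex
# alternation tries branches left to right, so listing FLOAT first lets "1.23"
# survive as a single token instead of decaying into INT, DOT, INT.
def _demo_pattern_order():
    def token_names(pattern):
        return [m.lastgroup for m in pattern.finditer("1.23")]

    float_first = re.compile(r"(?P<FLOAT>\d+\.\d+)|(?P<INT>\d+)|(?P<DOT>\.)")
    int_first = re.compile(r"(?P<INT>\d+)|(?P<FLOAT>\d+\.\d+)|(?P<DOT>\.)")
    # float_first -> ["FLOAT"]; int_first -> ["INT", "DOT", "INT"]
    return token_names(float_first), token_names(int_first)
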

class AutoStrEnum(str, enum.Enum):
    """Base enum class for name=value str enums."""

    def _generate_next_value_(name, start, count, last_values):
        return name
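
# Minimal usage sketch (an illustrative addition, not part of the original
# module): members of an AutoStrEnum subclass are strings equal to their own
# names, which is what lets TokenType members be compared to plain strings.
def _demo_auto_str_enum():
    class Color(AutoStrEnum):  # hypothetical example enum
        RED = enum.auto()
        BLUE = enum.auto()

    assert Color.RED == "RED"  # the str value is the member name itself
    return [color.value for color in Color]  # -> ["RED", "BLUE"]
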

TokenType = AutoStrEnum(  # type: ignore  # pytype: disable=wrong-arg-types
    "TokenType",
    [
        (name, enum.auto())
        for name in itertools.chain.from_iterable(token_types.values())
        if not name.startswith("GOTO_")
    ],
)
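
# The GOTO_* names are filtered out above deliberately: they are zero-width
# lookahead markers that only trigger lexer state transitions (see
# Lexer.__iter__ below) and are never handed to token consumers.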

class LexerState(AutoStrEnum):
    PARSE_POS_ARGS = enum.auto()  # parsing positional arguments
    PARSE_NON_PARAMS_OPTIONS = enum.auto()  # parsing options other than "--params"
    PARSE_PARAMS_OPTION = enum.auto()  # parsing the "--params" option
    STATE_END = enum.auto()


class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line."""
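
    # Each lexer state gets a single "grand" pattern: that state's own token
    # patterns chained with the shared "common" patterns into one alternation,
    # so one finditer() pass can classify every token via ``match.lastgroup``.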

    _GRAND_PATTERNS = {
        LexerState.PARSE_POS_ARGS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_pos_args"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_NON_PARAMS_OPTIONS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_non_params_options"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_PARAMS_OPTION: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_params_option"].values(),
                    token_types["common"].values(),
                )
            )
        ),
    }

    def __init__(self, input_text):
        self._text = input_text

    def __iter__(self):
        # Since re.Scanner does not seem to support manipulating inner scanner
        # states, we need to implement lexer state transitions manually, using
        # special zero-width lookahead token patterns to signal when a state
        # transition should be made.
        # Since we don't have "nested" states, we don't really need a stack, and
        # this simple mechanism is sufficient.
        state = LexerState.PARSE_POS_ARGS
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_stream = self._find_state_tokens(state, offset)

            for maybe_token in token_stream:  # pragma: NO COVER
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break


    def _find_state_tokens(self, state, current_offset):
        """Scan the input for the current state's tokens starting at ``current_offset``.

        Args:
            state (LexerState): The current lexer state.
            current_offset (int): The offset in the input text, i.e. the number
                of characters scanned so far.

        Yields:
            The next ``Token`` or ``StateTransition`` instance.
        """
        pattern = self._GRAND_PATTERNS[state]
        scanner = pattern.finditer(self._text, current_offset)

        for match in scanner:  # pragma: NO COVER
            token_type = match.lastgroup

            if token_type.startswith("GOTO_"):
                yield StateTransition(
                    new_state=getattr(LexerState, token_type[5:]),  # w/o the "GOTO_" prefix
                    total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())
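
# Minimal usage sketch (an illustrative addition, not part of the original
# module): tokenizing a sample magic line. Token types come back as the string
# names of the matched regex groups, and whitespace tokens are filtered out by
# Lexer.__iter__.
def _demo_lexer():
    tokens = list(Lexer("dest_var --max_results=10"))
    # Expected: [("DEST_VAR", "dest_var"), ("OPTION_SPEC", "--max_results"),
    #            ("OPTION_EQ", "="), ("OPT_VAL", "10"), ("EOL", "")]
    return [(token.type_, token.lexeme) for token in tokens]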