# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from collections import OrderedDict
import itertools
import re

import enum


# A lexical token: its type, the matched text (lexeme), and its start position in the input.
Token = namedtuple("Token", ("type_", "lexeme", "pos"))
# A lexer state change: the state to switch to and the input offset at which to resume scanning.
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))

# Pattern matching is done with regexes, and the order in which the token patterns are
# defined is important.
#
# Suppose we had the following token definitions:
# * INT - a token matching integers,
# * FLOAT - a token matching floating point numbers,
# * DOT - a token matching a single literal dot character, i.e. "."
#
# The FLOAT token would have to be defined first, since we would want the input "1.23"
# to be tokenized as a single FLOAT token, and *not* as three tokens (INT, DOT, INT).
#
# Sometimes, however, different tokens match patterns that are too similar, and it is
# not possible to define them in an order that avoids all ambiguity. One such case is
# the pair of OPT_VAL and PY_NUMBER tokens, as both can match an integer literal, say "42".
#
# To avoid such dilemmas, the lexer implements the concept of STATES. States are used
# to split token definitions into subgroups, and in each lexer state only a single
# subgroup is used for tokenizing the input. Lexer states can therefore be thought of
# as token namespaces.
#
# For example, while parsing the value of the "--params" option, we do not want to
# "recognize" it as a single OPT_VAL token, but instead want to parse it as a Python
# dictionary and verify its syntactical correctness. On the other hand, while parsing
# the value of an option other than "--params", we do not really care about its
# structure, and thus do not want to use any of the "Python tokens" for pattern matching.
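#
# For illustration (a hypothetical input, not taken from real usage), a line such as
#
#     destination_var --max_results 10 --params {"num": 17}
#
# is scanned in three states: PARSE_POS_ARGS yields DEST_VAR for "destination_var",
# PARSE_NON_PARAMS_OPTIONS yields OPTION_SPEC and OPT_VAL for "--max_results 10", and
# PARSE_PARAMS_OPTION yields PARAMS_OPT_SPEC followed by the individual "Python tokens"
# (LCURL, PY_STRING, COLON, PY_NUMBER, RCURL) for the dictionary literal.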
51#
52# Token definition order is important, thus an OrderedDict is used. In addition, PEP 468
53# guarantees us that the order of kwargs is preserved in Python 3.6+.
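#
# Each token pattern is wrapped in a named regex group so that ``match.lastgroup`` on
# the per-state "grand patterns" (assembled in ``Lexer._GRAND_PATTERNS`` below) directly
# identifies which token type, or GOTO_* state transition, was matched.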
token_types = OrderedDict(
    state_parse_pos_args=OrderedDict(
        GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--))",  # double dash - starting the options list
        DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python ID
    ),
    state_parse_non_params_options=OrderedDict(
        GOTO_PARSE_PARAMS_OPTION=r"(?P<GOTO_PARSE_PARAMS_OPTION>(?=--params(?:\s|=|--|$)))",  # the --params option
        OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
        OPTION_EQ=r"(?P<OPTION_EQ>=)",
        OPT_VAL=r"(?P<OPT_VAL>\S+?(?=\s|--|$))",
    ),
    state_parse_params_option=OrderedDict(
        PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format(  # single and double quoted strings
            r"'(?:[^'\\]|\\.)*'", r'"(?:[^"\\]|\\.)*"'
        ),
        PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|=|--|$))",
        PARAMS_OPT_EQ=r"(?P<PARAMS_OPT_EQ>=)",
        GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--\w+))",  # found another option spec
        PY_BOOL=r"(?P<PY_BOOL>True|False)",
        DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
        PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(?:[eE][+-]?\d+)?)",
        SQUOTE=r"(?P<SQUOTE>')",
        DQUOTE=r'(?P<DQUOTE>")',
        COLON=r"(?P<COLON>:)",
        COMMA=r"(?P<COMMA>,)",
        LCURL=r"(?P<LCURL>\{)",
        RCURL=r"(?P<RCURL>})",
        LSQUARE=r"(?P<LSQUARE>\[)",
        RSQUARE=r"(?P<RSQUARE>])",
        LPAREN=r"(?P<LPAREN>\()",
        RPAREN=r"(?P<RPAREN>\))",
    ),
    common=OrderedDict(
        WS=r"(?P<WS>\s+)",
        EOL=r"(?P<EOL>$)",
        UNKNOWN=r"(?P<UNKNOWN>\S+)",  # anything not a whitespace or matched by something else
    ),
)


class AutoStrEnum(str, enum.Enum):
    """Base enum class for name=value str enums."""

    def _generate_next_value_(name, start, count, last_values):
        return name


TokenType = AutoStrEnum(  # type: ignore # pytype: disable=wrong-arg-types
    "TokenType",
    [
        (name, enum.auto())
        for name in itertools.chain.from_iterable(token_types.values())
        if not name.startswith("GOTO_")
    ],
)


class LexerState(AutoStrEnum):
    PARSE_POS_ARGS = enum.auto()  # parsing positional arguments
    PARSE_NON_PARAMS_OPTIONS = enum.auto()  # parsing options other than "--params"
    PARSE_PARAMS_OPTION = enum.auto()  # parsing the "--params" option
    STATE_END = enum.auto()


class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line."""

    _GRAND_PATTERNS = {
        LexerState.PARSE_POS_ARGS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_pos_args"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_NON_PARAMS_OPTIONS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_non_params_options"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_PARAMS_OPTION: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_params_option"].values(),
                    token_types["common"].values(),
                )
            )
        ),
    }

    def __init__(self, input_text):
        self._text = input_text

    def __iter__(self):
        # Since re.scanner does not seem to support manipulating inner scanner states,
        # we need to implement lexer state transitions manually using special
        # non-capturing lookahead token patterns to signal when a state transition
        # should be made.
        # Since we don't have "nested" states, we don't really need a stack and
        # this simple mechanism is sufficient.
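        # For example, when the zero-width GOTO_PARSE_PARAMS_OPTION lookahead matches,
        # a StateTransition is reported instead of a Token, and scanning resumes in
        # LexerState.PARSE_PARAMS_OPTION at that same input offset.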
        state = LexerState.PARSE_POS_ARGS
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_stream = self._find_state_tokens(state, offset)

            for maybe_token in token_stream:  # pragma: NO COVER
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break

    def _find_state_tokens(self, state, current_offset):
        """Scan the input for the current state's tokens starting at ``current_offset``.

        Args:
            state (LexerState): The current lexer state.
            current_offset (int): The offset in the input text, i.e. the number
                of characters already scanned.

        Yields:
            The next ``Token`` or ``StateTransition`` instance.
        """
        pattern = self._GRAND_PATTERNS[state]
        scanner = pattern.finditer(self._text, current_offset)

        for match in scanner:  # pragma: NO COVER
            token_type = match.lastgroup

            if token_type.startswith("GOTO_"):
                yield StateTransition(
                    new_state=getattr(LexerState, token_type[5:]),  # w/o "GOTO_" prefix
                    total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())
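

if __name__ == "__main__":  # pragma: NO COVER
    # A minimal usage sketch, not part of the library surface: the sample line below is
    # hypothetical and simply shows the token stream the lexer produces for it.
    sample_line = 'destination_var --max_results 10 --params {"num": 17}'
    for token in Lexer(sample_line):
        print(token)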