# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from collections import OrderedDict
import itertools
import re

import enum


# A lexical token: its type, the matched text (lexeme), and its start position in the input.
Token = namedtuple("Token", ("type_", "lexeme", "pos"))
# A lexer state change: the state to switch to and the input offset at which to resume scanning.
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))

# Pattern matching is done with regexes, and the order in which the token patterns are
# defined is important.
#
# Suppose we had the following token definitions:
# * INT - a token matching integers,
# * FLOAT - a token matching floating point numbers,
# * DOT - a token matching a single literal dot character, i.e. "."
#
# The FLOAT token would have to be defined first, since we would want the input "1.23"
# to be tokenized as a single FLOAT token, and *not* as three tokens (INT, DOT, INT).
#
# Sometimes, however, different tokens match patterns that are too similar, and it is
# not possible to define them in an order that avoids all ambiguity. One such case is
# the pair of OPT_VAL and PY_NUMBER tokens, as both can match an integer literal, say "42".
#
# To avoid such dilemmas, the lexer implements the concept of STATES. States are used
# to split token definitions into subgroups, and in each lexer state only a single
# subgroup is used for tokenizing the input. Lexer states can therefore be thought of
# as token namespaces.
#
# For example, while parsing the value of the "--params" option, we do not want to
# "recognize" it as a single OPT_VAL token, but instead want to parse it as a Python
# dictionary and verify its syntactical correctness. On the other hand, while parsing
# the value of an option other than "--params", we do not really care about its
# structure, and thus do not want to use any of the "Python tokens" for pattern matching.
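#
# For illustration (a hypothetical input, not taken from real usage), a line such as
#
#     destination_var --max_results 10 --params {"num": 17}
#
# is scanned in three states: PARSE_POS_ARGS yields DEST_VAR for "destination_var",
# PARSE_NON_PARAMS_OPTIONS yields OPTION_SPEC and OPT_VAL for "--max_results 10", and
# PARSE_PARAMS_OPTION yields PARAMS_OPT_SPEC followed by the individual "Python tokens"
# (LCURL, PY_STRING, COLON, PY_NUMBER, RCURL) for the dictionary literal.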
51#
52# Token definition order is important, thus an OrderedDict is used. In addition, PEP 468
53# guarantees us that the order of kwargs is preserved in Python 3.6+.
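#
# Each token pattern is wrapped in a named regex group so that ``match.lastgroup`` on
# the per-state "grand patterns" (assembled in ``Lexer._GRAND_PATTERNS`` below) directly
# identifies which token type, or GOTO_* state transition, was matched.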
token_types = OrderedDict(
    state_parse_pos_args=OrderedDict(
        GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--))",  # double dash - starting the options list
        DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python ID
    ),
    state_parse_non_params_options=OrderedDict(
        GOTO_PARSE_PARAMS_OPTION=r"(?P<GOTO_PARSE_PARAMS_OPTION>(?=--params(?:\s|=|--|$)))",  # the --params option
        OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
        OPTION_EQ=r"(?P<OPTION_EQ>=)",
        OPT_VAL=r"(?P<OPT_VAL>\S+?(?=\s|--|$))",
    ),
    state_parse_params_option=OrderedDict(
        PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format(  # single and double quoted strings
            r"'(?:[^'\\]|\\.)*'", r'"(?:[^"\\]|\\.)*"'
        ),
        PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|=|--|$))",
        PARAMS_OPT_EQ=r"(?P<PARAMS_OPT_EQ>=)",
        GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--\w+))",  # found another option spec
        PY_BOOL=r"(?P<PY_BOOL>True|False)",
        DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
        PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(?:[eE][+-]?\d+)?)",
        SQUOTE=r"(?P<SQUOTE>')",
        DQUOTE=r'(?P<DQUOTE>")',
        COLON=r"(?P<COLON>:)",
        COMMA=r"(?P<COMMA>,)",
        LCURL=r"(?P<LCURL>\{)",
        RCURL=r"(?P<RCURL>})",
        LSQUARE=r"(?P<LSQUARE>\[)",
        RSQUARE=r"(?P<RSQUARE>])",
        LPAREN=r"(?P<LPAREN>\()",
        RPAREN=r"(?P<RPAREN>\))",
    ),
    common=OrderedDict(
        WS=r"(?P<WS>\s+)",
        EOL=r"(?P<EOL>$)",
        UNKNOWN=r"(?P<UNKNOWN>\S+)",  # anything not a whitespace or matched by something else
    ),
)


class AutoStrEnum(str, enum.Enum):
    """Base enum class for name=value str enums."""

    def _generate_next_value_(name, start, count, last_values):
        return name


TokenType = AutoStrEnum(  # type: ignore # pytype: disable=wrong-arg-types
    "TokenType",
    [
        (name, enum.auto())
        for name in itertools.chain.from_iterable(token_types.values())
        if not name.startswith("GOTO_")
    ],
)


class LexerState(AutoStrEnum):
    PARSE_POS_ARGS = enum.auto()  # parsing positional arguments
    PARSE_NON_PARAMS_OPTIONS = enum.auto()  # parsing options other than "--params"
    PARSE_PARAMS_OPTION = enum.auto()  # parsing the "--params" option
    STATE_END = enum.auto()


class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line."""

    _GRAND_PATTERNS = {
        LexerState.PARSE_POS_ARGS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_pos_args"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_NON_PARAMS_OPTIONS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_non_params_options"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_PARAMS_OPTION: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_params_option"].values(),
                    token_types["common"].values(),
                )
            )
        ),
    }

    def __init__(self, input_text):
        self._text = input_text

    def __iter__(self):
        # Since re.scanner does not seem to support manipulating inner scanner states,
        # we need to implement lexer state transitions manually using special
        # non-capturing lookahead token patterns to signal when a state transition
        # should be made.
        # Since we don't have "nested" states, we don't really need a stack and
        # this simple mechanism is sufficient.
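        # For example, when the zero-width GOTO_PARSE_PARAMS_OPTION lookahead matches,
        # a StateTransition is reported instead of a Token, and scanning resumes in
        # LexerState.PARSE_PARAMS_OPTION at that same input offset.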
        state = LexerState.PARSE_POS_ARGS
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_stream = self._find_state_tokens(state, offset)

            for maybe_token in token_stream:  # pragma: NO COVER
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break

    def _find_state_tokens(self, state, current_offset):
        """Scan the input for the current state's tokens starting at ``current_offset``.

        Args:
            state (LexerState): The current lexer state.
            current_offset (int): The offset in the input text, i.e. the number
                of characters already scanned.

        Yields:
            The next ``Token`` or ``StateTransition`` instance.
        """
        pattern = self._GRAND_PATTERNS[state]
        scanner = pattern.finditer(self._text, current_offset)

        for match in scanner:  # pragma: NO COVER
            token_type = match.lastgroup

            if token_type.startswith("GOTO_"):
                yield StateTransition(
                    new_state=getattr(LexerState, token_type[5:]),  # w/o "GOTO_" prefix
                    total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())
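

if __name__ == "__main__":  # pragma: NO COVER
    # A minimal usage sketch, not part of the library surface: the sample line below is
    # hypothetical and simply shows the token stream the lexer produces for it.
    sample_line = 'destination_var --max_results 10 --params {"num": 17}'
    for token in Lexer(sample_line):
        print(token)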