/src/behaviortreecpp/src/script_tokenizer.cpp
Line | Count | Source |
1 | | /* Copyright (C) 2022-2025 Davide Faconti - All Rights Reserved |
2 | | * |
3 | | * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), |
4 | | * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, |
5 | | * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: |
6 | | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
7 | | * |
8 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
9 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
10 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
11 | | */ |
12 | | |
13 | | #include "behaviortree_cpp/scripting/any_types.hpp" |
14 | | |
15 | | #include <cctype> |
16 | | |
17 | | namespace BT::Scripting |
18 | | { |
19 | | |
20 | | namespace |
21 | | { |
22 | | |
// Tells whether `c` is allowed as the first character of an identifier:
// an alphabetic character (as classified by std::isalpha), '_' or '@'.
bool isIdentStart(char c)
{
  if(c == '_' || c == '@')
  {
    return true;
  }
  // <cctype> classifiers require a value representable as unsigned char.
  const auto as_uchar = static_cast<unsigned char>(c);
  return std::isalpha(as_uchar) != 0;
}
27 | | |
// Tells whether `c` may appear after the first character of an identifier:
// an alphanumeric character (as classified by std::isalnum) or '_'.
// Unlike isIdentStart, '@' is not accepted here.
bool isIdentChar(char c)
{
  if(c == '_')
  {
    return true;
  }
  // <cctype> classifiers require a value representable as unsigned char.
  const auto as_uchar = static_cast<unsigned char>(c);
  return std::isalnum(as_uchar) != 0;
}
32 | | |
// Tells whether `c` is a decimal digit, as classified by std::isdigit.
bool isDigit(char c)
{
  // <cctype> classifiers require a value representable as unsigned char.
  const auto as_uchar = static_cast<unsigned char>(c);
  const int classified = std::isdigit(as_uchar);
  return classified != 0;
}
37 | | |
// Tells whether `c` is a hexadecimal digit, as classified by std::isxdigit.
bool isHexDigit(char c)
{
  // <cctype> classifiers require a value representable as unsigned char.
  const auto as_uchar = static_cast<unsigned char>(c);
  const int classified = std::isxdigit(as_uchar);
  return classified != 0;
}
42 | | |
43 | | // Consume trailing garbage after a malformed number token. |
44 | | void consumeTrailingGarbage(const std::string& source, size_t len, size_t& i) |
45 | 411k | { |
46 | 902k | while(i < len && (isIdentChar(source[i]) || source[i] == '.')) |
47 | 491k | { |
48 | 491k | ++i; |
49 | 491k | } |
50 | 411k | } |
51 | | |
// Outcome of scanning one numeric literal.
struct NumberResult
{
  // Set when a fractional part or an exponent marker was consumed.
  bool is_real{ false };
  // Set when the literal is malformed.
  bool has_error{ false };
};
57 | | |
58 | | NumberResult scanHexNumber(const std::string& source, size_t len, size_t& i) |
59 | 435k | { |
60 | 435k | NumberResult result; |
61 | 435k | i += 2; // skip "0x"/"0X" |
62 | 435k | if(i >= len || !isHexDigit(source[i])) |
63 | 431k | { |
64 | 431k | result.has_error = true; |
65 | 431k | } |
66 | 3.19k | else |
67 | 3.19k | { |
68 | 335k | while(i < len && isHexDigit(source[i])) |
69 | 332k | { |
70 | 332k | ++i; |
71 | 332k | } |
72 | 3.19k | } |
73 | | // Hex numbers don't support dot or exponent |
74 | 435k | if(i < len && (source[i] == '.' || isIdentChar(source[i]))) |
75 | 409k | { |
76 | 409k | result.has_error = true; |
77 | 409k | consumeTrailingGarbage(source, len, i); |
78 | 409k | } |
79 | 435k | return result; |
80 | 435k | } |
81 | | |
82 | | NumberResult scanDecimalNumber(const std::string& source, size_t len, size_t& i) |
83 | 2.54M | { |
84 | 2.54M | NumberResult result; |
85 | | |
86 | | // Integer part |
87 | 18.0M | while(i < len && isDigit(source[i])) |
88 | 15.4M | { |
89 | 15.4M | ++i; |
90 | 15.4M | } |
91 | | // Fractional part |
92 | 2.54M | if(i < len && source[i] == '.') |
93 | 14.4k | { |
94 | | // Distinguish from ".." (concat operator) |
95 | 14.4k | if(i + 1 < len && source[i + 1] == '.') |
96 | 7.15k | { |
97 | | // Stop here: "65.." is Integer("65") + DotDot |
98 | 7.15k | } |
99 | 7.33k | else if(i + 1 < len && isDigit(source[i + 1])) |
100 | 6.07k | { |
101 | 6.07k | result.is_real = true; |
102 | 6.07k | ++i; // consume '.' |
103 | 13.0M | while(i < len && isDigit(source[i])) |
104 | 13.0M | { |
105 | 13.0M | ++i; |
106 | 13.0M | } |
107 | 6.07k | } |
108 | 1.26k | else |
109 | 1.26k | { |
110 | | // "65." or "65.x" -- incomplete real |
111 | 1.26k | result.has_error = true; |
112 | 1.26k | ++i; // consume the dot |
113 | 1.26k | consumeTrailingGarbage(source, len, i); |
114 | 1.26k | } |
115 | 14.4k | } |
116 | | // Exponent (only for decimal numbers) |
117 | 2.54M | if(!result.has_error && i < len && (source[i] == 'e' || source[i] == 'E')) |
118 | 21.0k | { |
119 | 21.0k | result.is_real = true; |
120 | 21.0k | ++i; // consume 'e'/'E' |
121 | 21.0k | if(i < len && (source[i] == '+' || source[i] == '-')) |
122 | 9.00k | { |
123 | 9.00k | ++i; // consume sign |
124 | 9.00k | } |
125 | 21.0k | if(i >= len || !isDigit(source[i])) |
126 | 4.70k | { |
127 | 4.70k | result.has_error = true; |
128 | 4.70k | } |
129 | 16.3k | else |
130 | 16.3k | { |
131 | 257k | while(i < len && isDigit(source[i])) |
132 | 241k | { |
133 | 241k | ++i; |
134 | 241k | } |
135 | 16.3k | } |
136 | 21.0k | } |
137 | | // Trailing alpha (e.g. "3foo", "65.43foo") |
138 | 2.54M | if(!result.has_error && i < len && isIdentStart(source[i])) |
139 | 112k | { |
140 | 112k | result.has_error = true; |
141 | 4.03M | while(i < len && isIdentChar(source[i])) |
142 | 3.92M | { |
143 | 3.92M | ++i; |
144 | 3.92M | } |
145 | 112k | } |
146 | 2.54M | return result; |
147 | 2.54M | } |
148 | | |
149 | | TokenType matchTwoCharOp(char c, char next) |
150 | 21.7M | { |
151 | 21.7M | if(c == '.' && next == '.') |
152 | 17.6k | return TokenType::DotDot; |
153 | 21.7M | if(c == '&' && next == '&') |
154 | 2.63k | return TokenType::AmpAmp; |
155 | 21.7M | if(c == '|' && next == '|') |
156 | 4.06k | return TokenType::PipePipe; |
157 | 21.6M | if(c == '=' && next == '=') |
158 | 31.6k | return TokenType::EqualEqual; |
159 | 21.6M | if(c == '!' && next == '=') |
160 | 159k | return TokenType::BangEqual; |
161 | 21.5M | if(c == '<' && next == '=') |
162 | 13.0k | return TokenType::LessEqual; |
163 | 21.4M | if(c == '>' && next == '=') |
164 | 12.9k | return TokenType::GreaterEqual; |
165 | 21.4M | if(c == ':' && next == '=') |
166 | 58.8k | return TokenType::ColonEqual; |
167 | 21.4M | if(c == '+' && next == '=') |
168 | 16.3k | return TokenType::PlusEqual; |
169 | 21.4M | if(c == '-' && next == '=') |
170 | 2.78k | return TokenType::MinusEqual; |
171 | 21.4M | if(c == '*' && next == '=') |
172 | 1.41k | return TokenType::StarEqual; |
173 | 21.4M | if(c == '/' && next == '=') |
174 | 8.06k | return TokenType::SlashEqual; |
175 | 21.3M | return TokenType::Error; |
176 | 21.4M | } |
177 | | |
178 | | TokenType matchSingleCharOp(char c) |
179 | 21.3M | { |
180 | 21.3M | switch(c) |
181 | 21.3M | { |
182 | 22.6k | case '+': |
183 | 22.6k | return TokenType::Plus; |
184 | 49.3k | case '-': |
185 | 49.3k | return TokenType::Minus; |
186 | 2.81k | case '*': |
187 | 2.81k | return TokenType::Star; |
188 | 11.9k | case '/': |
189 | 11.9k | return TokenType::Slash; |
190 | 3.69k | case '&': |
191 | 3.69k | return TokenType::Ampersand; |
192 | 6.28k | case '|': |
193 | 6.28k | return TokenType::Pipe; |
194 | 1.49k | case '^': |
195 | 1.49k | return TokenType::Caret; |
196 | 110k | case '~': |
197 | 110k | return TokenType::Tilde; |
198 | 67.9k | case '!': |
199 | 67.9k | return TokenType::Bang; |
200 | 211k | case '<': |
201 | 211k | return TokenType::Less; |
202 | 1.48M | case '>': |
203 | 1.48M | return TokenType::Greater; |
204 | 27.1k | case '=': |
205 | 27.1k | return TokenType::Equal; |
206 | 9.98k | case '?': |
207 | 9.98k | return TokenType::Question; |
208 | 27.0k | case ':': |
209 | 27.0k | return TokenType::Colon; |
210 | 170k | case '(': |
211 | 170k | return TokenType::LeftParen; |
212 | 5.46k | case ')': |
213 | 5.46k | return TokenType::RightParen; |
214 | 482k | case ';': |
215 | 482k | return TokenType::Semicolon; |
216 | 18.6M | default: |
217 | 18.6M | return TokenType::Error; |
218 | 21.3M | } |
219 | 21.3M | } |
220 | | |
221 | | } // namespace |
222 | | |
223 | | std::vector<Token> tokenize(const std::string& source) |
224 | 13.3k | { |
225 | 13.3k | std::vector<Token> tokens; |
226 | 13.3k | const size_t len = source.size(); |
227 | 13.3k | size_t i = 0; |
228 | | |
229 | 33.0M | while(i < len) |
230 | 32.9M | { |
231 | 32.9M | const char c = source[i]; |
232 | | |
233 | | // Skip whitespace (space, tab, newline, carriage return) |
234 | 32.9M | if(c == ' ' || c == '\t' || c == '\n' || c == '\r') |
235 | 111k | { |
236 | 111k | ++i; |
237 | 111k | continue; |
238 | 111k | } |
239 | | |
240 | 32.8M | const size_t start = i; |
241 | | |
242 | | // Single-quoted string literal |
243 | 32.8M | if(c == '\'') |
244 | 129k | { |
245 | 129k | ++i; |
246 | 59.8M | while(i < len && source[i] != '\'') |
247 | 59.7M | { |
248 | 59.7M | ++i; |
249 | 59.7M | } |
250 | 129k | if(i < len) |
251 | 128k | { |
252 | | // extract content without quotes |
253 | 128k | std::string_view text(&source[start + 1], i - start - 1); |
254 | 128k | tokens.push_back({ TokenType::String, text, start }); |
255 | 128k | ++i; // skip closing quote |
256 | 128k | } |
257 | 361 | else |
258 | 361 | { |
259 | 361 | std::string_view text(&source[start], i - start); |
260 | 361 | tokens.push_back({ TokenType::Error, text, start }); |
261 | 361 | } |
262 | 129k | continue; |
263 | 129k | } |
264 | | |
265 | | // Number literal (integer or real) |
266 | 32.7M | if(isDigit(c)) |
267 | 2.97M | { |
268 | 2.97M | NumberResult nr; |
269 | 2.97M | const bool is_hex = |
270 | 2.97M | c == '0' && i + 1 < len && (source[i + 1] == 'x' || source[i + 1] == 'X'); |
271 | 2.97M | if(is_hex) |
272 | 435k | { |
273 | 435k | nr = scanHexNumber(source, len, i); |
274 | 435k | } |
275 | 2.54M | else |
276 | 2.54M | { |
277 | 2.54M | nr = scanDecimalNumber(source, len, i); |
278 | 2.54M | } |
279 | | |
280 | 2.97M | std::string_view text(&source[start], i - start); |
281 | 2.97M | if(nr.has_error) |
282 | 550k | { |
283 | 550k | tokens.push_back({ TokenType::Error, text, start }); |
284 | 550k | } |
285 | 2.42M | else if(nr.is_real) |
286 | 17.5k | { |
287 | 17.5k | tokens.push_back({ TokenType::Real, text, start }); |
288 | 17.5k | } |
289 | 2.40M | else |
290 | 2.40M | { |
291 | 2.40M | tokens.push_back({ TokenType::Integer, text, start }); |
292 | 2.40M | } |
293 | 2.97M | continue; |
294 | 2.97M | } |
295 | | |
296 | | // Identifier or keyword (true/false) |
297 | 29.7M | if(isIdentStart(c)) |
298 | 8.04M | { |
299 | 8.04M | ++i; // consume start character (may not be isIdentChar, e.g. '@') |
300 | 70.4M | while(i < len && isIdentChar(source[i])) |
301 | 62.3M | { |
302 | 62.3M | ++i; |
303 | 62.3M | } |
304 | 8.04M | if(std::string_view text(&source[start], i - start); text == "true" || text == "fal" |
305 | 8.04M | "se") |
306 | 1.55k | { |
307 | 1.55k | tokens.push_back({ TokenType::Boolean, text, start }); |
308 | 1.55k | } |
309 | 8.04M | else |
310 | 8.04M | { |
311 | 8.04M | tokens.push_back({ TokenType::Identifier, text, start }); |
312 | 8.04M | } |
313 | 8.04M | continue; |
314 | 8.04M | } |
315 | | |
316 | | // Two-character operators (check before single-char) |
317 | 21.7M | if(i + 1 < len) |
318 | 21.7M | { |
319 | 21.7M | TokenType two_char_type = matchTwoCharOp(c, source[i + 1]); |
320 | 21.7M | if(two_char_type != TokenType::Error) |
321 | 328k | { |
322 | 328k | std::string_view text(&source[start], 2); |
323 | 328k | tokens.push_back({ two_char_type, text, start }); |
324 | 328k | i += 2; |
325 | 328k | continue; |
326 | 328k | } |
327 | 21.7M | } |
328 | | |
329 | | // Single-character operators and delimiters |
330 | 21.3M | std::string_view text(&source[start], 1); |
331 | 21.3M | tokens.push_back({ matchSingleCharOp(c), text, start }); |
332 | 21.3M | ++i; |
333 | 21.3M | } |
334 | | |
335 | | // Sentinel |
336 | 13.3k | tokens.push_back({ TokenType::EndOfInput, {}, i }); |
337 | 13.3k | return tokens; |
338 | 13.3k | } |
339 | | |
340 | | } // namespace BT::Scripting |