/src/libpg_query/src/postgres/src_backend_parser_parser.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*-------------------------------------------------------------------- |
2 | | * Symbols referenced in this file: |
3 | | * - raw_parser |
4 | | * - base_yylex |
5 | | * - check_uescapechar |
6 | | * - str_udeescape |
7 | | * - hexval |
8 | | * - check_unicode_value |
9 | | * - raw_parser |
10 | | *-------------------------------------------------------------------- |
11 | | */ |
12 | | |
13 | | /*------------------------------------------------------------------------- |
14 | | * |
15 | | * parser.c |
16 | | * Main entry point/driver for PostgreSQL grammar |
17 | | * |
18 | | * Note that the grammar is not allowed to perform any table access |
19 | | * (since we need to be able to do basic parsing even while inside an |
20 | | * aborted transaction). Therefore, the data structures returned by |
21 | | * the grammar are "raw" parsetrees that still need to be analyzed by |
22 | | * analyze.c and related files. |
23 | | * |
24 | | * |
25 | | * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group |
26 | | * Portions Copyright (c) 1994, Regents of the University of California |
27 | | * |
28 | | * IDENTIFICATION |
29 | | * src/backend/parser/parser.c |
30 | | * |
31 | | *------------------------------------------------------------------------- |
32 | | */ |
33 | | |
34 | | #include "postgres.h" |
35 | | |
36 | | #include "mb/pg_wchar.h" |
37 | | #include "parser/gramparse.h" |
38 | | #include "parser/parser.h" |
39 | | #include "parser/scansup.h" |
40 | | |
41 | | static bool check_uescapechar(unsigned char escape); |
42 | | static char *str_udeescape(const char *str, char escape, |
43 | | int position, core_yyscan_t yyscanner); |
44 | | |
45 | | |
46 | | /* |
47 | | * raw_parser |
48 | | * Given a query in string form, do lexical and grammatical analysis. |
49 | | * |
50 | | * Returns a list of raw (un-analyzed) parse trees. The contents of the |
51 | | * list have the form required by the specified RawParseMode. |
52 | | */ |
53 | | List * |
54 | | raw_parser(const char *str, RawParseMode mode) |
55 | 245 | { |
56 | 245 | core_yyscan_t yyscanner; |
57 | 245 | base_yy_extra_type yyextra; |
58 | 245 | int yyresult; |
59 | | |
60 | | /* initialize the flex scanner */ |
61 | 245 | yyscanner = scanner_init(str, &yyextra.core_yy_extra, |
62 | 245 | &ScanKeywords, ScanKeywordTokens); |
63 | | |
64 | | /* base_yylex() only needs us to initialize the lookahead token, if any */ |
65 | 245 | if (mode == RAW_PARSE_DEFAULT) |
66 | 245 | yyextra.have_lookahead = false; |
67 | 0 | else |
68 | 0 | { |
69 | | /* this array is indexed by RawParseMode enum */ |
70 | 0 | static const int mode_token[] = { |
71 | 0 | 0, /* RAW_PARSE_DEFAULT */ |
72 | 0 | MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */ |
73 | 0 | MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */ |
74 | 0 | MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */ |
75 | 0 | MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */ |
76 | | MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */ |
77 | 0 | }; |
78 | |
|
79 | 0 | yyextra.have_lookahead = true; |
80 | 0 | yyextra.lookahead_token = mode_token[mode]; |
81 | 0 | yyextra.lookahead_yylloc = 0; |
82 | 0 | yyextra.lookahead_end = NULL; |
83 | 0 | } |
84 | | |
85 | | /* initialize the bison parser */ |
86 | 245 | parser_init(&yyextra); |
87 | | |
88 | | /* Parse! */ |
89 | 245 | yyresult = base_yyparse(yyscanner); |
90 | | |
91 | | /* Clean up (release memory) */ |
92 | 245 | scanner_finish(yyscanner); |
93 | | |
94 | 245 | if (yyresult) /* error */ |
95 | 0 | return NIL; |
96 | | |
97 | 245 | return yyextra.parsetree; |
98 | 245 | } |
99 | | |
100 | | |
101 | | /* |
102 | | * Intermediate filter between parser and core lexer (core_yylex in scan.l). |
103 | | * |
104 | | * This filter is needed because in some cases the standard SQL grammar |
105 | | * requires more than one token lookahead. We reduce these cases to one-token |
106 | | * lookahead by replacing tokens here, in order to keep the grammar LALR(1). |
107 | | * |
108 | | * Using a filter is simpler than trying to recognize multiword tokens |
109 | | * directly in scan.l, because we'd have to allow for comments between the |
110 | | * words. Furthermore it's not clear how to do that without re-introducing |
111 | | * scanner backtrack, which would cost more performance than this filter |
112 | | * layer does. |
113 | | * |
114 | | * We also use this filter to convert UIDENT and USCONST sequences into |
115 | | * plain IDENT and SCONST tokens. While that could be handled by additional |
116 | | * productions in the main grammar, it's more efficient to do it like this. |
117 | | * |
118 | | * The filter also provides a convenient place to translate between |
119 | | * the core_YYSTYPE and YYSTYPE representations (which are really the |
120 | | * same thing anyway, but notationally they're different). |
121 | | */ |
122 | | int |
123 | | base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) |
124 | 22.8M | { |
125 | 22.8M | base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner); |
126 | 22.8M | int cur_token; |
127 | 22.8M | int next_token; |
128 | 22.8M | int cur_token_length; |
129 | 22.8M | YYLTYPE cur_yylloc; |
130 | | |
131 | | /* Get next token --- we might already have it */ |
132 | 22.8M | if (yyextra->have_lookahead) |
133 | 7.68k | { |
134 | 7.68k | cur_token = yyextra->lookahead_token; |
135 | 7.68k | lvalp->core_yystype = yyextra->lookahead_yylval; |
136 | 7.68k | *llocp = yyextra->lookahead_yylloc; |
137 | 7.68k | if (yyextra->lookahead_end) |
138 | 7.68k | *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
139 | 7.68k | yyextra->have_lookahead = false; |
140 | 7.68k | } |
141 | 22.8M | else |
142 | 22.8M | cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner); |
143 | | |
144 | | /* |
145 | | * If this token isn't one that requires lookahead, just return it. If it |
146 | | * does, determine the token length. (We could get that via strlen(), but |
147 | | * since we have such a small set of possibilities, hardwiring seems |
148 | | * feasible and more efficient --- at least for the fixed-length cases.) |
149 | | */ |
150 | 22.8M | switch (cur_token) |
151 | 22.8M | { |
152 | 7.40k | case NOT: |
153 | 7.40k | cur_token_length = 3; |
154 | 7.40k | break; |
155 | 0 | case NULLS_P: |
156 | 0 | cur_token_length = 5; |
157 | 0 | break; |
158 | 211 | case WITH: |
159 | 211 | cur_token_length = 4; |
160 | 211 | break; |
161 | 74 | case UIDENT: |
162 | 79 | case USCONST: |
163 | 79 | cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp); |
164 | 79 | break; |
165 | 91 | case SQL_COMMENT: |
166 | 94 | case C_COMMENT: |
167 | 94 | return base_yylex(lvalp, llocp, yyscanner); |
168 | 22.8M | default: |
169 | 22.8M | return cur_token; |
170 | 22.8M | } |
171 | | |
172 | | /* |
173 | | * Identify end+1 of current token. core_yylex() has temporarily stored a |
174 | | * '\0' here, and will undo that when we call it again. We need to redo |
175 | | * it to fully revert the lookahead call for error reporting purposes. |
176 | | */ |
177 | 7.69k | yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + |
178 | 7.69k | *llocp + cur_token_length; |
179 | 7.69k | Assert(*(yyextra->lookahead_end) == '\0'); |
180 | | |
181 | | /* |
182 | | * Save and restore *llocp around the call. It might look like we could |
183 | | * avoid this by just passing &lookahead_yylloc to core_yylex(), but that |
184 | | * does not work because flex actually holds onto the last-passed pointer |
185 | | * internally, and will use that for error reporting. We need any error |
186 | | * reports to point to the current token, not the next one. |
187 | | */ |
188 | 7.69k | cur_yylloc = *llocp; |
189 | | |
190 | | /* Get next token, saving outputs into lookahead variables */ |
191 | 7.69k | next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner); |
192 | 7.69k | yyextra->lookahead_token = next_token; |
193 | 7.69k | yyextra->lookahead_yylloc = *llocp; |
194 | | |
195 | 7.69k | *llocp = cur_yylloc; |
196 | | |
197 | | /* Now revert the un-truncation of the current token */ |
198 | 7.69k | yyextra->lookahead_hold_char = *(yyextra->lookahead_end); |
199 | 7.69k | *(yyextra->lookahead_end) = '\0'; |
200 | | |
201 | 7.69k | yyextra->have_lookahead = true; |
202 | | |
203 | | /* Replace cur_token if needed, based on lookahead */ |
204 | 7.69k | switch (cur_token) |
205 | 7.69k | { |
206 | 7.40k | case NOT: |
207 | | /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ |
208 | 7.40k | switch (next_token) |
209 | 7.40k | { |
210 | 0 | case BETWEEN: |
211 | 35 | case IN_P: |
212 | 35 | case LIKE: |
213 | 290 | case ILIKE: |
214 | 290 | case SIMILAR: |
215 | 290 | cur_token = NOT_LA; |
216 | 290 | break; |
217 | 7.40k | } |
218 | 7.40k | break; |
219 | | |
220 | 7.40k | case NULLS_P: |
221 | | /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ |
222 | 0 | switch (next_token) |
223 | 0 | { |
224 | 0 | case FIRST_P: |
225 | 0 | case LAST_P: |
226 | 0 | cur_token = NULLS_LA; |
227 | 0 | break; |
228 | 0 | } |
229 | 0 | break; |
230 | | |
231 | 211 | case WITH: |
232 | | /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ |
233 | 211 | switch (next_token) |
234 | 211 | { |
235 | 0 | case TIME: |
236 | 0 | case ORDINALITY: |
237 | 0 | cur_token = WITH_LA; |
238 | 0 | break; |
239 | 211 | } |
240 | 211 | break; |
241 | | |
242 | 211 | case UIDENT: |
243 | 79 | case USCONST: |
244 | | /* Look ahead for UESCAPE */ |
245 | 79 | if (next_token == UESCAPE) |
246 | 0 | { |
247 | | /* Yup, so get third token, which had better be SCONST */ |
248 | 0 | const char *escstr; |
249 | | |
250 | | /* Again save and restore *llocp */ |
251 | 0 | cur_yylloc = *llocp; |
252 | | |
253 | | /* Un-truncate current token so errors point to third token */ |
254 | 0 | *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
255 | | |
256 | | /* Get third token */ |
257 | 0 | next_token = core_yylex(&(yyextra->lookahead_yylval), |
258 | 0 | llocp, yyscanner); |
259 | | |
260 | | /* If we throw error here, it will point to third token */ |
261 | 0 | if (next_token != SCONST) |
262 | 0 | scanner_yyerror("UESCAPE must be followed by a simple string literal", |
263 | 0 | yyscanner); |
264 | | |
265 | 0 | escstr = yyextra->lookahead_yylval.str; |
266 | 0 | if (strlen(escstr) != 1 || !check_uescapechar(escstr[0])) |
267 | 0 | scanner_yyerror("invalid Unicode escape character", |
268 | 0 | yyscanner); |
269 | | |
270 | | /* Now restore *llocp; errors will point to first token */ |
271 | 0 | *llocp = cur_yylloc; |
272 | | |
273 | | /* Apply Unicode conversion */ |
274 | 0 | lvalp->core_yystype.str = |
275 | 0 | str_udeescape(lvalp->core_yystype.str, |
276 | 0 | escstr[0], |
277 | 0 | *llocp, |
278 | 0 | yyscanner); |
279 | | |
280 | | /* |
281 | | * We don't need to revert the un-truncation of UESCAPE. What |
282 | | * we do want to do is clear have_lookahead, thereby consuming |
283 | | * all three tokens. |
284 | | */ |
285 | 0 | yyextra->have_lookahead = false; |
286 | 0 | } |
287 | 79 | else |
288 | 79 | { |
289 | | /* No UESCAPE, so convert using default escape character */ |
290 | 79 | lvalp->core_yystype.str = |
291 | 79 | str_udeescape(lvalp->core_yystype.str, |
292 | 79 | '\\', |
293 | 79 | *llocp, |
294 | 79 | yyscanner); |
295 | 79 | } |
296 | | |
297 | 79 | if (cur_token == UIDENT) |
298 | 73 | { |
299 | | /* It's an identifier, so truncate as appropriate */ |
300 | 73 | truncate_identifier(lvalp->core_yystype.str, |
301 | 73 | strlen(lvalp->core_yystype.str), |
302 | 73 | true); |
303 | 73 | cur_token = IDENT; |
304 | 73 | } |
305 | 6 | else if (cur_token == USCONST) |
306 | 5 | { |
307 | 5 | cur_token = SCONST; |
308 | 5 | } |
309 | 79 | break; |
310 | 7.69k | } |
311 | | |
312 | 7.69k | return cur_token; |
313 | 7.69k | } |
314 | | |
315 | | /* convert hex digit (caller should have verified that) to value */ |
316 | | static unsigned int |
317 | | hexval(unsigned char c) |
318 | 270 | { |
319 | 270 | if (c >= '0' && c <= '9') |
320 | 215 | return c - '0'; |
321 | 55 | if (c >= 'a' && c <= 'f') |
322 | 55 | return c - 'a' + 0xA; |
323 | 0 | if (c >= 'A' && c <= 'F') |
324 | 0 | return c - 'A' + 0xA; |
325 | 0 | elog(ERROR, "invalid hexadecimal digit"); |
326 | 0 | return 0; /* not reached */ |
327 | 0 | } |
328 | | |
329 | | /* is Unicode code point acceptable? */ |
330 | | static void |
331 | | check_unicode_value(pg_wchar c) |
332 | 65 | { |
333 | 65 | if (!is_valid_unicode_codepoint(c)) |
334 | 65 | ereport(ERROR, |
335 | 65 | (errcode(ERRCODE_SYNTAX_ERROR), |
336 | 65 | errmsg("invalid Unicode escape value"))); |
337 | 65 | } |
338 | | |
/*
 * Decide whether "escape" may serve as the escape character given in a
 * UESCAPE clause.
 *
 * Hexadecimal digits, the plus sign, single and double quotes, and anything
 * scanner_isspace() treats as whitespace are rejected; every other character
 * is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	switch (escape)
	{
		case '+':
		case '\'':
		case '"':
			return false;
		default:
			break;
	}

	return !(isxdigit(escape) || scanner_isspace(escape));
}
352 | | |
353 | | /* |
354 | | * Process Unicode escapes in "str", producing a palloc'd plain string |
355 | | * |
356 | | * escape: the escape character to use |
357 | | * position: start position of U&'' or U&"" string token |
358 | | * yyscanner: context information needed for error reports |
359 | | */ |
360 | | static char * |
361 | | str_udeescape(const char *str, char escape, |
362 | | int position, core_yyscan_t yyscanner) |
363 | 79 | { |
364 | 79 | const char *in; |
365 | 79 | char *new, |
366 | 79 | *out; |
367 | 79 | size_t new_len; |
368 | 79 | pg_wchar pair_first = 0; |
369 | 79 | ScannerCallbackState scbstate; |
370 | | |
371 | | /* |
372 | | * Guesstimate that result will be no longer than input, but allow enough |
373 | | * padding for Unicode conversion. |
374 | | */ |
375 | 79 | new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1; |
376 | 79 | new = palloc(new_len); |
377 | | |
378 | 79 | in = str; |
379 | 79 | out = new; |
380 | 3.68M | while (*in) |
381 | 3.68M | { |
382 | | /* Enlarge string if needed */ |
383 | 3.68M | size_t out_dist = out - new; |
384 | | |
385 | 3.68M | if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1)) |
386 | 0 | { |
387 | 0 | new_len *= 2; |
388 | 0 | new = repalloc(new, new_len); |
389 | 0 | out = new + out_dist; |
390 | 0 | } |
391 | | |
392 | 3.68M | if (in[0] == escape) |
393 | 2.65k | { |
394 | | /* |
395 | | * Any errors reported while processing this escape sequence will |
396 | | * have an error cursor pointing at the escape. |
397 | | */ |
398 | 2.65k | setup_scanner_errposition_callback(&scbstate, yyscanner, |
399 | 2.65k | in - str + position + 3); /* 3 for U&" */ |
400 | 2.65k | if (in[1] == escape) |
401 | 2.58k | { |
402 | 2.58k | if (pair_first) |
403 | 0 | goto invalid_pair; |
404 | 2.58k | *out++ = escape; |
405 | 2.58k | in += 2; |
406 | 2.58k | } |
407 | 66 | else if (isxdigit((unsigned char) in[1]) && |
408 | 66 | isxdigit((unsigned char) in[2]) && |
409 | 66 | isxdigit((unsigned char) in[3]) && |
410 | 66 | isxdigit((unsigned char) in[4])) |
411 | 60 | { |
412 | 60 | pg_wchar unicode; |
413 | | |
414 | 60 | unicode = (hexval(in[1]) << 12) + |
415 | 60 | (hexval(in[2]) << 8) + |
416 | 60 | (hexval(in[3]) << 4) + |
417 | 60 | hexval(in[4]); |
418 | 60 | check_unicode_value(unicode); |
419 | 60 | if (pair_first) |
420 | 0 | { |
421 | 0 | if (is_utf16_surrogate_second(unicode)) |
422 | 0 | { |
423 | 0 | unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
424 | 0 | pair_first = 0; |
425 | 0 | } |
426 | 0 | else |
427 | 0 | goto invalid_pair; |
428 | 0 | } |
429 | 60 | else if (is_utf16_surrogate_second(unicode)) |
430 | 0 | goto invalid_pair; |
431 | | |
432 | 60 | if (is_utf16_surrogate_first(unicode)) |
433 | 0 | pair_first = unicode; |
434 | 60 | else |
435 | 60 | { |
436 | 60 | pg_unicode_to_server(unicode, (unsigned char *) out); |
437 | 60 | out += strlen(out); |
438 | 60 | } |
439 | 60 | in += 5; |
440 | 60 | } |
441 | 6 | else if (in[1] == '+' && |
442 | 6 | isxdigit((unsigned char) in[2]) && |
443 | 6 | isxdigit((unsigned char) in[3]) && |
444 | 6 | isxdigit((unsigned char) in[4]) && |
445 | 6 | isxdigit((unsigned char) in[5]) && |
446 | 6 | isxdigit((unsigned char) in[6]) && |
447 | 6 | isxdigit((unsigned char) in[7])) |
448 | 5 | { |
449 | 5 | pg_wchar unicode; |
450 | | |
451 | 5 | unicode = (hexval(in[2]) << 20) + |
452 | 5 | (hexval(in[3]) << 16) + |
453 | 5 | (hexval(in[4]) << 12) + |
454 | 5 | (hexval(in[5]) << 8) + |
455 | 5 | (hexval(in[6]) << 4) + |
456 | 5 | hexval(in[7]); |
457 | 5 | check_unicode_value(unicode); |
458 | 5 | if (pair_first) |
459 | 0 | { |
460 | 0 | if (is_utf16_surrogate_second(unicode)) |
461 | 0 | { |
462 | 0 | unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
463 | 0 | pair_first = 0; |
464 | 0 | } |
465 | 0 | else |
466 | 0 | goto invalid_pair; |
467 | 0 | } |
468 | 5 | else if (is_utf16_surrogate_second(unicode)) |
469 | 0 | goto invalid_pair; |
470 | | |
471 | 5 | if (is_utf16_surrogate_first(unicode)) |
472 | 0 | pair_first = unicode; |
473 | 5 | else |
474 | 5 | { |
475 | 5 | pg_unicode_to_server(unicode, (unsigned char *) out); |
476 | 5 | out += strlen(out); |
477 | 5 | } |
478 | 5 | in += 8; |
479 | 5 | } |
480 | 1 | else |
481 | 1 | ereport(ERROR, |
482 | 2.65k | (errcode(ERRCODE_SYNTAX_ERROR), |
483 | 2.65k | errmsg("invalid Unicode escape"), |
484 | 2.65k | errhint("Unicode escapes must be \\XXXX or \\+XXXXXX."))); |
485 | | |
486 | 2.65k | cancel_scanner_errposition_callback(&scbstate); |
487 | 2.65k | } |
488 | 3.68M | else |
489 | 3.68M | { |
490 | 3.68M | if (pair_first) |
491 | 0 | goto invalid_pair; |
492 | | |
493 | 3.68M | *out++ = *in++; |
494 | 3.68M | } |
495 | 3.68M | } |
496 | | |
497 | | /* unfinished surrogate pair? */ |
498 | 79 | if (pair_first) |
499 | 0 | goto invalid_pair; |
500 | | |
501 | 79 | *out = '\0'; |
502 | 79 | return new; |
503 | | |
504 | | /* |
505 | | * We might get here with the error callback active, or not. Call |
506 | | * scanner_errposition to make sure an error cursor appears; if the |
507 | | * callback is active, this is duplicative but harmless. |
508 | | */ |
509 | 0 | invalid_pair: |
510 | 0 | ereport(ERROR, |
511 | 0 | (errcode(ERRCODE_SYNTAX_ERROR), |
512 | 0 | errmsg("invalid Unicode surrogate pair"), |
513 | 0 | scanner_errposition(in - str + position + 3, /* 3 for U&" */ |
514 | 0 | yyscanner))); |
515 | 0 | return NULL; /* keep compiler quiet */ |
516 | 0 | } |