/src/postgres/src/backend/parser/parser.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * parser.c |
4 | | * Main entry point/driver for PostgreSQL grammar |
5 | | * |
6 | | * Note that the grammar is not allowed to perform any table access |
7 | | * (since we need to be able to do basic parsing even while inside an |
8 | | * aborted transaction). Therefore, the data structures returned by |
9 | | * the grammar are "raw" parsetrees that still need to be analyzed by |
10 | | * analyze.c and related files. |
11 | | * |
12 | | * |
13 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
14 | | * Portions Copyright (c) 1994, Regents of the University of California |
15 | | * |
16 | | * IDENTIFICATION |
17 | | * src/backend/parser/parser.c |
18 | | * |
19 | | *------------------------------------------------------------------------- |
20 | | */ |
21 | | |
22 | | #include "postgres.h" |
23 | | |
24 | | #include "gramparse.h" |
25 | | #include "mb/pg_wchar.h" |
26 | | #include "parser/parser.h" |
27 | | #include "parser/scansup.h" |
28 | | |
29 | | static bool check_uescapechar(unsigned char escape); |
30 | | static char *str_udeescape(const char *str, char escape, |
31 | | int position, core_yyscan_t yyscanner); |
32 | | |
33 | | |
34 | | /* |
35 | | * raw_parser |
36 | | * Given a query in string form, do lexical and grammatical analysis. |
37 | | * |
38 | | * Returns a list of raw (un-analyzed) parse trees. The contents of the |
39 | | * list have the form required by the specified RawParseMode. |
40 | | */ |
41 | | List * |
42 | | raw_parser(const char *str, RawParseMode mode) |
43 | 4.39k | { |
44 | 4.39k | core_yyscan_t yyscanner; |
45 | 4.39k | base_yy_extra_type yyextra; |
46 | 4.39k | int yyresult; |
47 | | |
48 | | /* initialize the flex scanner */ |
49 | 4.39k | yyscanner = scanner_init(str, &yyextra.core_yy_extra, |
50 | 4.39k | &ScanKeywords, ScanKeywordTokens); |
51 | | |
52 | | /* base_yylex() only needs us to initialize the lookahead token, if any */ |
53 | 4.39k | if (mode == RAW_PARSE_DEFAULT) |
54 | 0 | yyextra.have_lookahead = false; |
55 | 4.39k | else |
56 | 4.39k | { |
57 | | /* this array is indexed by RawParseMode enum */ |
58 | 4.39k | static const int mode_token[] = { |
59 | 4.39k | [RAW_PARSE_DEFAULT] = 0, |
60 | 4.39k | [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME, |
61 | 4.39k | [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR, |
62 | 4.39k | [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1, |
63 | 4.39k | [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2, |
64 | 4.39k | [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3, |
65 | 4.39k | }; |
66 | | |
67 | 4.39k | yyextra.have_lookahead = true; |
68 | 4.39k | yyextra.lookahead_token = mode_token[mode]; |
69 | 4.39k | yyextra.lookahead_yylloc = 0; |
70 | 4.39k | yyextra.lookahead_end = NULL; |
71 | 4.39k | } |
72 | | |
73 | | /* initialize the bison parser */ |
74 | 4.39k | parser_init(&yyextra); |
75 | | |
76 | | /* Parse! */ |
77 | 4.39k | yyresult = base_yyparse(yyscanner); |
78 | | |
79 | | /* Clean up (release memory) */ |
80 | 4.39k | scanner_finish(yyscanner); |
81 | | |
82 | 4.39k | if (yyresult) /* error */ |
83 | 0 | return NIL; |
84 | | |
85 | 4.39k | return yyextra.parsetree; |
86 | 4.39k | } |
87 | | |
88 | | |
89 | | /* |
90 | | * Intermediate filter between parser and core lexer (core_yylex in scan.l). |
91 | | * |
92 | | * This filter is needed because in some cases the standard SQL grammar |
93 | | * requires more than one token lookahead. We reduce these cases to one-token |
94 | | * lookahead by replacing tokens here, in order to keep the grammar LALR(1). |
95 | | * |
96 | | * Using a filter is simpler than trying to recognize multiword tokens |
97 | | * directly in scan.l, because we'd have to allow for comments between the |
98 | | * words. Furthermore it's not clear how to do that without re-introducing |
99 | | * scanner backtrack, which would cost more performance than this filter |
100 | | * layer does. |
101 | | * |
102 | | * We also use this filter to convert UIDENT and USCONST sequences into |
103 | | * plain IDENT and SCONST tokens. While that could be handled by additional |
104 | | * productions in the main grammar, it's more efficient to do it like this. |
105 | | * |
106 | | * The filter also provides a convenient place to translate between |
107 | | * the core_YYSTYPE and YYSTYPE representations (which are really the |
108 | | * same thing anyway, but notationally they're different). |
109 | | */ |
110 | | int |
111 | | base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) |
112 | 10.5M | { |
113 | 10.5M | base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner); |
114 | 10.5M | int cur_token; |
115 | 10.5M | int next_token; |
116 | 10.5M | int cur_token_length; |
117 | 10.5M | YYLTYPE cur_yylloc; |
118 | | |
119 | | /* Get next token --- we might already have it */ |
120 | 10.5M | if (yyextra->have_lookahead) |
121 | 6.60k | { |
122 | 6.60k | cur_token = yyextra->lookahead_token; |
123 | 6.60k | lvalp->core_yystype = yyextra->lookahead_yylval; |
124 | 6.60k | *llocp = yyextra->lookahead_yylloc; |
125 | 6.60k | if (yyextra->lookahead_end) |
126 | 2.20k | *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
127 | 6.60k | yyextra->have_lookahead = false; |
128 | 6.60k | } |
129 | 10.5M | else |
130 | 10.5M | cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner); |
131 | | |
132 | | /* |
133 | | * If this token isn't one that requires lookahead, just return it. If it |
134 | | * does, determine the token length. (We could get that via strlen(), but |
135 | | * since we have such a small set of possibilities, hardwiring seems |
136 | | * feasible and more efficient --- at least for the fixed-length cases.) |
137 | | */ |
138 | 10.5M | switch (cur_token) |
139 | 10.5M | { |
140 | 73 | case FORMAT: |
141 | 73 | cur_token_length = 6; |
142 | 73 | break; |
143 | 1.46k | case NOT: |
144 | 1.46k | cur_token_length = 3; |
145 | 1.46k | break; |
146 | 205 | case NULLS_P: |
147 | 205 | cur_token_length = 5; |
148 | 205 | break; |
149 | 0 | case WITH: |
150 | 0 | cur_token_length = 4; |
151 | 0 | break; |
152 | 438 | case UIDENT: |
153 | 911 | case USCONST: |
154 | 911 | cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp); |
155 | 911 | break; |
156 | 0 | case WITHOUT: |
157 | 0 | cur_token_length = 7; |
158 | 0 | break; |
159 | 10.5M | default: |
160 | 10.5M | return cur_token; |
161 | 10.5M | } |
162 | | |
163 | | /* |
164 | | * Identify end+1 of current token. core_yylex() has temporarily stored a |
165 | | * '\0' here, and will undo that when we call it again. We need to redo |
166 | | * it to fully revert the lookahead call for error reporting purposes. |
167 | | */ |
168 | 2.65k | yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + |
169 | 2.65k | *llocp + cur_token_length; |
170 | 2.65k | Assert(*(yyextra->lookahead_end) == '\0'); |
171 | | |
172 | | /* |
173 | | * Save and restore *llocp around the call. It might look like we could |
174 | | * avoid this by just passing &lookahead_yylloc to core_yylex(), but that |
175 | | * does not work because flex actually holds onto the last-passed pointer |
176 | | * internally, and will use that for error reporting. We need any error |
177 | | * reports to point to the current token, not the next one. |
178 | | */ |
179 | 2.65k | cur_yylloc = *llocp; |
180 | | |
181 | | /* Get next token, saving outputs into lookahead variables */ |
182 | 2.65k | next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner); |
183 | 2.65k | yyextra->lookahead_token = next_token; |
184 | 2.65k | yyextra->lookahead_yylloc = *llocp; |
185 | | |
186 | 2.65k | *llocp = cur_yylloc; |
187 | | |
188 | | /* Now revert the un-truncation of the current token */ |
189 | 2.65k | yyextra->lookahead_hold_char = *(yyextra->lookahead_end); |
190 | 2.65k | *(yyextra->lookahead_end) = '\0'; |
191 | | |
192 | 2.65k | yyextra->have_lookahead = true; |
193 | | |
194 | | /* Replace cur_token if needed, based on lookahead */ |
195 | 2.65k | switch (cur_token) |
196 | 2.65k | { |
197 | 73 | case FORMAT: |
198 | | /* Replace FORMAT by FORMAT_LA if it's followed by JSON */ |
199 | 73 | switch (next_token) |
200 | 73 | { |
201 | 1 | case JSON: |
202 | 1 | cur_token = FORMAT_LA; |
203 | 1 | break; |
204 | 73 | } |
205 | 73 | break; |
206 | | |
207 | 1.46k | case NOT: |
208 | | /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ |
209 | 1.46k | switch (next_token) |
210 | 1.46k | { |
211 | 0 | case BETWEEN: |
212 | 209 | case IN_P: |
213 | 209 | case LIKE: |
214 | 209 | case ILIKE: |
215 | 209 | case SIMILAR: |
216 | 209 | cur_token = NOT_LA; |
217 | 209 | break; |
218 | 1.46k | } |
219 | 1.46k | break; |
220 | | |
221 | 1.46k | case NULLS_P: |
222 | | /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ |
223 | 205 | switch (next_token) |
224 | 205 | { |
225 | 0 | case FIRST_P: |
226 | 0 | case LAST_P: |
227 | 0 | cur_token = NULLS_LA; |
228 | 0 | break; |
229 | 205 | } |
230 | 205 | break; |
231 | | |
232 | 205 | case WITH: |
233 | | /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ |
234 | 0 | switch (next_token) |
235 | 0 | { |
236 | 0 | case TIME: |
237 | 0 | case ORDINALITY: |
238 | 0 | cur_token = WITH_LA; |
239 | 0 | break; |
240 | 0 | } |
241 | 0 | break; |
242 | | |
243 | 0 | case WITHOUT: |
244 | | /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */ |
245 | 0 | switch (next_token) |
246 | 0 | { |
247 | 0 | case TIME: |
248 | 0 | cur_token = WITHOUT_LA; |
249 | 0 | break; |
250 | 0 | } |
251 | 0 | break; |
252 | | |
253 | 435 | case UIDENT: |
254 | 905 | case USCONST: |
255 | | /* Look ahead for UESCAPE */ |
256 | 905 | if (next_token == UESCAPE) |
257 | 0 | { |
258 | | /* Yup, so get third token, which had better be SCONST */ |
259 | 0 | const char *escstr; |
260 | | |
261 | | /* Again save and restore *llocp */ |
262 | 0 | cur_yylloc = *llocp; |
263 | | |
264 | | /* Un-truncate current token so errors point to third token */ |
265 | 0 | *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
266 | | |
267 | | /* Get third token */ |
268 | 0 | next_token = core_yylex(&(yyextra->lookahead_yylval), |
269 | 0 | llocp, yyscanner); |
270 | | |
271 | | /* If we throw error here, it will point to third token */ |
272 | 0 | if (next_token != SCONST) |
273 | 0 | scanner_yyerror("UESCAPE must be followed by a simple string literal", |
274 | 0 | yyscanner); |
275 | |
|
276 | 0 | escstr = yyextra->lookahead_yylval.str; |
277 | 0 | if (strlen(escstr) != 1 || !check_uescapechar(escstr[0])) |
278 | 0 | scanner_yyerror("invalid Unicode escape character", |
279 | 0 | yyscanner); |
280 | | |
281 | | /* Now restore *llocp; errors will point to first token */ |
282 | 0 | *llocp = cur_yylloc; |
283 | | |
284 | | /* Apply Unicode conversion */ |
285 | 0 | lvalp->core_yystype.str = |
286 | 0 | str_udeescape(lvalp->core_yystype.str, |
287 | 0 | escstr[0], |
288 | 0 | *llocp, |
289 | 0 | yyscanner); |
290 | | |
291 | | /* |
292 | | * We don't need to revert the un-truncation of UESCAPE. What |
293 | | * we do want to do is clear have_lookahead, thereby consuming |
294 | | * all three tokens. |
295 | | */ |
296 | 0 | yyextra->have_lookahead = false; |
297 | 0 | } |
298 | 905 | else |
299 | 905 | { |
300 | | /* No UESCAPE, so convert using default escape character */ |
301 | 905 | lvalp->core_yystype.str = |
302 | 905 | str_udeescape(lvalp->core_yystype.str, |
303 | 905 | '\\', |
304 | 905 | *llocp, |
305 | 905 | yyscanner); |
306 | 905 | } |
307 | | |
308 | 905 | if (cur_token == UIDENT) |
309 | 276 | { |
310 | | /* It's an identifier, so truncate as appropriate */ |
311 | 276 | truncate_identifier(lvalp->core_yystype.str, |
312 | 276 | strlen(lvalp->core_yystype.str), |
313 | 276 | true); |
314 | 276 | cur_token = IDENT; |
315 | 276 | } |
316 | 629 | else if (cur_token == USCONST) |
317 | 261 | { |
318 | 261 | cur_token = SCONST; |
319 | 261 | } |
320 | 905 | break; |
321 | 2.65k | } |
322 | | |
323 | 2.27k | return cur_token; |
324 | 2.65k | } |
325 | | |
326 | | /* convert hex digit (caller should have verified that) to value */ |
327 | | static unsigned int |
328 | | hexval(unsigned char c) |
329 | 11.6k | { |
330 | 11.6k | if (c >= '0' && c <= '9') |
331 | 9.17k | return c - '0'; |
332 | 2.43k | if (c >= 'a' && c <= 'f') |
333 | 1.16k | return c - 'a' + 0xA; |
334 | 1.26k | if (c >= 'A' && c <= 'F') |
335 | 1.26k | return c - 'A' + 0xA; |
336 | 0 | elog(ERROR, "invalid hexadecimal digit"); |
337 | 0 | return 0; /* not reached */ |
338 | 0 | } |
339 | | |
340 | | /* is Unicode code point acceptable? */ |
341 | | static void |
342 | | check_unicode_value(pg_wchar c) |
343 | 2.37k | { |
344 | 2.37k | if (!is_valid_unicode_codepoint(c)) |
345 | 2.37k | ereport(ERROR, |
346 | 2.37k | (errcode(ERRCODE_SYNTAX_ERROR), |
347 | 2.37k | errmsg("invalid Unicode escape value"))); |
348 | 2.37k | } |
349 | | |
/*
 * Decide whether "escape" may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Returns false for hexadecimal digits, '+', single and double quotes, and
 * anything scanner_isspace() treats as whitespace; true for every other
 * character.
 */
static bool
check_uescapechar(unsigned char escape)
{
	return !(isxdigit(escape) ||
			 escape == '+' ||
			 escape == '\'' ||
			 escape == '"' ||
			 scanner_isspace(escape));
}
363 | | |
364 | | /* |
365 | | * Process Unicode escapes in "str", producing a palloc'd plain string |
366 | | * |
367 | | * escape: the escape character to use |
368 | | * position: start position of U&'' or U&"" string token |
369 | | * yyscanner: context information needed for error reports |
370 | | */ |
371 | | static char * |
372 | | str_udeescape(const char *str, char escape, |
373 | | int position, core_yyscan_t yyscanner) |
374 | 905 | { |
375 | 905 | const char *in; |
376 | 905 | char *new, |
377 | 905 | *out; |
378 | 905 | size_t new_len; |
379 | 905 | pg_wchar pair_first = 0; |
380 | 905 | ScannerCallbackState scbstate; |
381 | | |
382 | | /* |
383 | | * Guesstimate that result will be no longer than input, but allow enough |
384 | | * padding for Unicode conversion. |
385 | | */ |
386 | 905 | new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1; |
387 | 905 | new = palloc(new_len); |
388 | | |
389 | 905 | in = str; |
390 | 905 | out = new; |
391 | 3.83M | while (*in) |
392 | 3.83M | { |
393 | | /* Enlarge string if needed */ |
394 | 3.83M | size_t out_dist = out - new; |
395 | | |
396 | 3.83M | if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1)) |
397 | 0 | { |
398 | 0 | new_len *= 2; |
399 | 0 | new = repalloc(new, new_len); |
400 | 0 | out = new + out_dist; |
401 | 0 | } |
402 | | |
403 | 3.83M | if (in[0] == escape) |
404 | 2.70k | { |
405 | | /* |
406 | | * Any errors reported while processing this escape sequence will |
407 | | * have an error cursor pointing at the escape. |
408 | | */ |
409 | 2.70k | setup_scanner_errposition_callback(&scbstate, yyscanner, |
410 | 2.70k | in - str + position + 3); /* 3 for U&" */ |
411 | 2.70k | if (in[1] == escape) |
412 | 266 | { |
413 | 266 | if (pair_first) |
414 | 10 | goto invalid_pair; |
415 | 256 | *out++ = escape; |
416 | 256 | in += 2; |
417 | 256 | } |
418 | 2.43k | else if (isxdigit((unsigned char) in[1]) && |
419 | 2.43k | isxdigit((unsigned char) in[2]) && |
420 | 2.43k | isxdigit((unsigned char) in[3]) && |
421 | 2.43k | isxdigit((unsigned char) in[4])) |
422 | 1.33k | { |
423 | 1.33k | pg_wchar unicode; |
424 | | |
425 | 1.33k | unicode = (hexval(in[1]) << 12) + |
426 | 1.33k | (hexval(in[2]) << 8) + |
427 | 1.33k | (hexval(in[3]) << 4) + |
428 | 1.33k | hexval(in[4]); |
429 | 1.33k | check_unicode_value(unicode); |
430 | 1.33k | if (pair_first) |
431 | 41 | { |
432 | 41 | if (is_utf16_surrogate_second(unicode)) |
433 | 14 | { |
434 | 14 | unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
435 | 14 | pair_first = 0; |
436 | 14 | } |
437 | 27 | else |
438 | 27 | goto invalid_pair; |
439 | 41 | } |
440 | 1.29k | else if (is_utf16_surrogate_second(unicode)) |
441 | 13 | goto invalid_pair; |
442 | | |
443 | 1.29k | if (is_utf16_surrogate_first(unicode)) |
444 | 128 | pair_first = unicode; |
445 | 1.16k | else |
446 | 1.16k | { |
447 | 1.16k | pg_unicode_to_server(unicode, (unsigned char *) out); |
448 | 1.16k | out += strlen(out); |
449 | 1.16k | } |
450 | 1.29k | in += 5; |
451 | 1.29k | } |
452 | 1.10k | else if (in[1] == '+' && |
453 | 1.10k | isxdigit((unsigned char) in[2]) && |
454 | 1.10k | isxdigit((unsigned char) in[3]) && |
455 | 1.10k | isxdigit((unsigned char) in[4]) && |
456 | 1.10k | isxdigit((unsigned char) in[5]) && |
457 | 1.10k | isxdigit((unsigned char) in[6]) && |
458 | 1.10k | isxdigit((unsigned char) in[7])) |
459 | 1.04k | { |
460 | 1.04k | pg_wchar unicode; |
461 | | |
462 | 1.04k | unicode = (hexval(in[2]) << 20) + |
463 | 1.04k | (hexval(in[3]) << 16) + |
464 | 1.04k | (hexval(in[4]) << 12) + |
465 | 1.04k | (hexval(in[5]) << 8) + |
466 | 1.04k | (hexval(in[6]) << 4) + |
467 | 1.04k | hexval(in[7]); |
468 | 1.04k | check_unicode_value(unicode); |
469 | 1.04k | if (pair_first) |
470 | 54 | { |
471 | 54 | if (is_utf16_surrogate_second(unicode)) |
472 | 13 | { |
473 | 13 | unicode = surrogate_pair_to_codepoint(pair_first, unicode); |
474 | 13 | pair_first = 0; |
475 | 13 | } |
476 | 41 | else |
477 | 41 | goto invalid_pair; |
478 | 54 | } |
479 | 993 | else if (is_utf16_surrogate_second(unicode)) |
480 | 11 | goto invalid_pair; |
481 | | |
482 | 995 | if (is_utf16_surrogate_first(unicode)) |
483 | 18 | pair_first = unicode; |
484 | 977 | else |
485 | 977 | { |
486 | 977 | pg_unicode_to_server(unicode, (unsigned char *) out); |
487 | 977 | out += strlen(out); |
488 | 977 | } |
489 | 995 | in += 8; |
490 | 995 | } |
491 | 58 | else |
492 | 58 | ereport(ERROR, |
493 | 2.60k | (errcode(ERRCODE_SYNTAX_ERROR), |
494 | 2.60k | errmsg("invalid Unicode escape"), |
495 | 2.60k | errhint("Unicode escapes must be \\XXXX or \\+XXXXXX."))); |
496 | | |
497 | 2.60k | cancel_scanner_errposition_callback(&scbstate); |
498 | 2.60k | } |
499 | 3.83M | else |
500 | 3.83M | { |
501 | 3.83M | if (pair_first) |
502 | 12 | goto invalid_pair; |
503 | | |
504 | 3.83M | *out++ = *in++; |
505 | 3.83M | } |
506 | 3.83M | } |
507 | | |
508 | | /* unfinished surrogate pair? */ |
509 | 791 | if (pair_first) |
510 | 28 | goto invalid_pair; |
511 | | |
512 | 763 | *out = '\0'; |
513 | 763 | return new; |
514 | | |
515 | | /* |
516 | | * We might get here with the error callback active, or not. Call |
517 | | * scanner_errposition to make sure an error cursor appears; if the |
518 | | * callback is active, this is duplicative but harmless. |
519 | | */ |
520 | 142 | invalid_pair: |
521 | 142 | ereport(ERROR, |
522 | 142 | (errcode(ERRCODE_SYNTAX_ERROR), |
523 | 142 | errmsg("invalid Unicode surrogate pair"), |
524 | 142 | scanner_errposition(in - str + position + 3, /* 3 for U&" */ |
525 | 142 | yyscanner))); |
526 | 142 | return NULL; /* keep compiler quiet */ |
527 | 142 | } |