Coverage Report

Created: 2025-07-03 06:49

/src/postgres/src/backend/parser/parser.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * parser.c
4
 *    Main entry point/driver for PostgreSQL grammar
5
 *
6
 * Note that the grammar is not allowed to perform any table access
7
 * (since we need to be able to do basic parsing even while inside an
8
 * aborted transaction).  Therefore, the data structures returned by
9
 * the grammar are "raw" parsetrees that still need to be analyzed by
10
 * analyze.c and related files.
11
 *
12
 *
13
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
14
 * Portions Copyright (c) 1994, Regents of the University of California
15
 *
16
 * IDENTIFICATION
17
 *    src/backend/parser/parser.c
18
 *
19
 *-------------------------------------------------------------------------
20
 */
21
22
#include "postgres.h"
23
24
#include "gramparse.h"
25
#include "mb/pg_wchar.h"
26
#include "parser/parser.h"
27
#include "parser/scansup.h"
28
29
/* Forward declarations for file-local helpers that are defined further down. */
static bool check_uescapechar(unsigned char escape);
30
static char *str_udeescape(const char *str, char escape,
31
               int position, core_yyscan_t yyscanner);
32
33
34
/*
35
 * raw_parser
36
 *    Given a query in string form, do lexical and grammatical analysis.
37
 *
38
 * Returns a list of raw (un-analyzed) parse trees.  The contents of the
39
 * list have the form required by the specified RawParseMode.
40
 */
41
List *
42
raw_parser(const char *str, RawParseMode mode)
43
4.39k
{
44
4.39k
  core_yyscan_t yyscanner;
45
4.39k
  base_yy_extra_type yyextra;
46
4.39k
  int     yyresult;
47
48
  /* initialize the flex scanner */
49
4.39k
  yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50
4.39k
               &ScanKeywords, ScanKeywordTokens);
51
52
  /* base_yylex() only needs us to initialize the lookahead token, if any */
53
4.39k
  if (mode == RAW_PARSE_DEFAULT)
54
0
    yyextra.have_lookahead = false;
55
4.39k
  else
56
4.39k
  {
57
    /* this array is indexed by RawParseMode enum */
58
4.39k
    static const int mode_token[] = {
59
4.39k
      [RAW_PARSE_DEFAULT] = 0,
60
4.39k
      [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
61
4.39k
      [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
62
4.39k
      [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
63
4.39k
      [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
64
4.39k
      [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
65
4.39k
    };
66
67
4.39k
    yyextra.have_lookahead = true;
68
4.39k
    yyextra.lookahead_token = mode_token[mode];
69
4.39k
    yyextra.lookahead_yylloc = 0;
70
4.39k
    yyextra.lookahead_end = NULL;
71
4.39k
  }
72
73
  /* initialize the bison parser */
74
4.39k
  parser_init(&yyextra);
75
76
  /* Parse! */
77
4.39k
  yyresult = base_yyparse(yyscanner);
78
79
  /* Clean up (release memory) */
80
4.39k
  scanner_finish(yyscanner);
81
82
4.39k
  if (yyresult)        /* error */
83
0
    return NIL;
84
85
4.39k
  return yyextra.parsetree;
86
4.39k
}
87
88
89
/*
90
 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91
 *
92
 * This filter is needed because in some cases the standard SQL grammar
93
 * requires more than one token lookahead.  We reduce these cases to one-token
94
 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95
 *
96
 * Using a filter is simpler than trying to recognize multiword tokens
97
 * directly in scan.l, because we'd have to allow for comments between the
98
 * words.  Furthermore it's not clear how to do that without re-introducing
99
 * scanner backtrack, which would cost more performance than this filter
100
 * layer does.
101
 *
102
 * We also use this filter to convert UIDENT and USCONST sequences into
103
 * plain IDENT and SCONST tokens.  While that could be handled by additional
104
 * productions in the main grammar, it's more efficient to do it like this.
105
 *
106
 * The filter also provides a convenient place to translate between
107
 * the core_YYSTYPE and YYSTYPE representations (which are really the
108
 * same thing anyway, but notationally they're different).
109
 */
110
int
111
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112
10.5M
{
113
10.5M
  base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114
10.5M
  int     cur_token;
115
10.5M
  int     next_token;
116
10.5M
  int     cur_token_length;
117
10.5M
  YYLTYPE    cur_yylloc;
118
119
  /* Get next token --- we might already have it */
120
10.5M
  if (yyextra->have_lookahead)
121
6.60k
  {
122
6.60k
    cur_token = yyextra->lookahead_token;
123
6.60k
    lvalp->core_yystype = yyextra->lookahead_yylval;
124
6.60k
    *llocp = yyextra->lookahead_yylloc;
125
6.60k
    if (yyextra->lookahead_end)
126
2.20k
      *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127
6.60k
    yyextra->have_lookahead = false;
128
6.60k
  }
129
10.5M
  else
130
10.5M
    cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131
132
  /*
133
   * If this token isn't one that requires lookahead, just return it.  If it
134
   * does, determine the token length.  (We could get that via strlen(), but
135
   * since we have such a small set of possibilities, hardwiring seems
136
   * feasible and more efficient --- at least for the fixed-length cases.)
137
   */
138
10.5M
  switch (cur_token)
139
10.5M
  {
140
73
    case FORMAT:
141
73
      cur_token_length = 6;
142
73
      break;
143
1.46k
    case NOT:
144
1.46k
      cur_token_length = 3;
145
1.46k
      break;
146
205
    case NULLS_P:
147
205
      cur_token_length = 5;
148
205
      break;
149
0
    case WITH:
150
0
      cur_token_length = 4;
151
0
      break;
152
438
    case UIDENT:
153
911
    case USCONST:
154
911
      cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155
911
      break;
156
0
    case WITHOUT:
157
0
      cur_token_length = 7;
158
0
      break;
159
10.5M
    default:
160
10.5M
      return cur_token;
161
10.5M
  }
162
163
  /*
164
   * Identify end+1 of current token.  core_yylex() has temporarily stored a
165
   * '\0' here, and will undo that when we call it again.  We need to redo
166
   * it to fully revert the lookahead call for error reporting purposes.
167
   */
168
2.65k
  yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169
2.65k
    *llocp + cur_token_length;
170
2.65k
  Assert(*(yyextra->lookahead_end) == '\0');
171
172
  /*
173
   * Save and restore *llocp around the call.  It might look like we could
174
   * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175
   * does not work because flex actually holds onto the last-passed pointer
176
   * internally, and will use that for error reporting.  We need any error
177
   * reports to point to the current token, not the next one.
178
   */
179
2.65k
  cur_yylloc = *llocp;
180
181
  /* Get next token, saving outputs into lookahead variables */
182
2.65k
  next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183
2.65k
  yyextra->lookahead_token = next_token;
184
2.65k
  yyextra->lookahead_yylloc = *llocp;
185
186
2.65k
  *llocp = cur_yylloc;
187
188
  /* Now revert the un-truncation of the current token */
189
2.65k
  yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190
2.65k
  *(yyextra->lookahead_end) = '\0';
191
192
2.65k
  yyextra->have_lookahead = true;
193
194
  /* Replace cur_token if needed, based on lookahead */
195
2.65k
  switch (cur_token)
196
2.65k
  {
197
73
    case FORMAT:
198
      /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199
73
      switch (next_token)
200
73
      {
201
1
        case JSON:
202
1
          cur_token = FORMAT_LA;
203
1
          break;
204
73
      }
205
73
      break;
206
207
1.46k
    case NOT:
208
      /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209
1.46k
      switch (next_token)
210
1.46k
      {
211
0
        case BETWEEN:
212
209
        case IN_P:
213
209
        case LIKE:
214
209
        case ILIKE:
215
209
        case SIMILAR:
216
209
          cur_token = NOT_LA;
217
209
          break;
218
1.46k
      }
219
1.46k
      break;
220
221
1.46k
    case NULLS_P:
222
      /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
223
205
      switch (next_token)
224
205
      {
225
0
        case FIRST_P:
226
0
        case LAST_P:
227
0
          cur_token = NULLS_LA;
228
0
          break;
229
205
      }
230
205
      break;
231
232
205
    case WITH:
233
      /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234
0
      switch (next_token)
235
0
      {
236
0
        case TIME:
237
0
        case ORDINALITY:
238
0
          cur_token = WITH_LA;
239
0
          break;
240
0
      }
241
0
      break;
242
243
0
    case WITHOUT:
244
      /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245
0
      switch (next_token)
246
0
      {
247
0
        case TIME:
248
0
          cur_token = WITHOUT_LA;
249
0
          break;
250
0
      }
251
0
      break;
252
253
435
    case UIDENT:
254
905
    case USCONST:
255
      /* Look ahead for UESCAPE */
256
905
      if (next_token == UESCAPE)
257
0
      {
258
        /* Yup, so get third token, which had better be SCONST */
259
0
        const char *escstr;
260
261
        /* Again save and restore *llocp */
262
0
        cur_yylloc = *llocp;
263
264
        /* Un-truncate current token so errors point to third token */
265
0
        *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266
267
        /* Get third token */
268
0
        next_token = core_yylex(&(yyextra->lookahead_yylval),
269
0
                    llocp, yyscanner);
270
271
        /* If we throw error here, it will point to third token */
272
0
        if (next_token != SCONST)
273
0
          scanner_yyerror("UESCAPE must be followed by a simple string literal",
274
0
                  yyscanner);
275
276
0
        escstr = yyextra->lookahead_yylval.str;
277
0
        if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
278
0
          scanner_yyerror("invalid Unicode escape character",
279
0
                  yyscanner);
280
281
        /* Now restore *llocp; errors will point to first token */
282
0
        *llocp = cur_yylloc;
283
284
        /* Apply Unicode conversion */
285
0
        lvalp->core_yystype.str =
286
0
          str_udeescape(lvalp->core_yystype.str,
287
0
                  escstr[0],
288
0
                  *llocp,
289
0
                  yyscanner);
290
291
        /*
292
         * We don't need to revert the un-truncation of UESCAPE.  What
293
         * we do want to do is clear have_lookahead, thereby consuming
294
         * all three tokens.
295
         */
296
0
        yyextra->have_lookahead = false;
297
0
      }
298
905
      else
299
905
      {
300
        /* No UESCAPE, so convert using default escape character */
301
905
        lvalp->core_yystype.str =
302
905
          str_udeescape(lvalp->core_yystype.str,
303
905
                  '\\',
304
905
                  *llocp,
305
905
                  yyscanner);
306
905
      }
307
308
905
      if (cur_token == UIDENT)
309
276
      {
310
        /* It's an identifier, so truncate as appropriate */
311
276
        truncate_identifier(lvalp->core_yystype.str,
312
276
                  strlen(lvalp->core_yystype.str),
313
276
                  true);
314
276
        cur_token = IDENT;
315
276
      }
316
629
      else if (cur_token == USCONST)
317
261
      {
318
261
        cur_token = SCONST;
319
261
      }
320
905
      break;
321
2.65k
  }
322
323
2.27k
  return cur_token;
324
2.65k
}
325
326
/* convert hex digit (caller should have verified that) to value */
327
static unsigned int
328
hexval(unsigned char c)
329
11.6k
{
330
11.6k
  if (c >= '0' && c <= '9')
331
9.17k
    return c - '0';
332
2.43k
  if (c >= 'a' && c <= 'f')
333
1.16k
    return c - 'a' + 0xA;
334
1.26k
  if (c >= 'A' && c <= 'F')
335
1.26k
    return c - 'A' + 0xA;
336
0
  elog(ERROR, "invalid hexadecimal digit");
337
0
  return 0;         /* not reached */
338
0
}
339
340
/* is Unicode code point acceptable? */
341
static void
342
check_unicode_value(pg_wchar c)
343
2.37k
{
344
2.37k
  if (!is_valid_unicode_codepoint(c))
345
2.37k
    ereport(ERROR,
346
2.37k
        (errcode(ERRCODE_SYNTAX_ERROR),
347
2.37k
         errmsg("invalid Unicode escape value")));
348
2.37k
}
349
350
/*
 * check_uescapechar
 *    Is "escape" usable as the Unicode escape character (UESCAPE syntax)?
 *
 * Hex digits, '+', single and double quotes, and whatever
 * scanner_isspace() accepts are rejected; any other character is allowed.
 */
static bool
check_uescapechar(unsigned char escape)
{
    if (isxdigit(escape))
        return false;

    if (escape == '+' || escape == '\'' || escape == '"')
        return false;

    return !scanner_isspace(escape);
}
363
364
/*
365
 * Process Unicode escapes in "str", producing a palloc'd plain string
366
 *
367
 * escape: the escape character to use
368
 * position: start position of U&'' or U&"" string token
369
 * yyscanner: context information needed for error reports
370
 */
371
static char *
372
str_udeescape(const char *str, char escape,
373
        int position, core_yyscan_t yyscanner)
374
905
{
375
905
  const char *in;
376
905
  char     *new,
377
905
         *out;
378
905
  size_t    new_len;
379
905
  pg_wchar  pair_first = 0;
380
905
  ScannerCallbackState scbstate;
381
382
  /*
383
   * Guesstimate that result will be no longer than input, but allow enough
384
   * padding for Unicode conversion.
385
   */
386
905
  new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
387
905
  new = palloc(new_len);
388
389
905
  in = str;
390
905
  out = new;
391
3.83M
  while (*in)
392
3.83M
  {
393
    /* Enlarge string if needed */
394
3.83M
    size_t    out_dist = out - new;
395
396
3.83M
    if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397
0
    {
398
0
      new_len *= 2;
399
0
      new = repalloc(new, new_len);
400
0
      out = new + out_dist;
401
0
    }
402
403
3.83M
    if (in[0] == escape)
404
2.70k
    {
405
      /*
406
       * Any errors reported while processing this escape sequence will
407
       * have an error cursor pointing at the escape.
408
       */
409
2.70k
      setup_scanner_errposition_callback(&scbstate, yyscanner,
410
2.70k
                         in - str + position + 3);  /* 3 for U&" */
411
2.70k
      if (in[1] == escape)
412
266
      {
413
266
        if (pair_first)
414
10
          goto invalid_pair;
415
256
        *out++ = escape;
416
256
        in += 2;
417
256
      }
418
2.43k
      else if (isxdigit((unsigned char) in[1]) &&
419
2.43k
           isxdigit((unsigned char) in[2]) &&
420
2.43k
           isxdigit((unsigned char) in[3]) &&
421
2.43k
           isxdigit((unsigned char) in[4]))
422
1.33k
      {
423
1.33k
        pg_wchar  unicode;
424
425
1.33k
        unicode = (hexval(in[1]) << 12) +
426
1.33k
          (hexval(in[2]) << 8) +
427
1.33k
          (hexval(in[3]) << 4) +
428
1.33k
          hexval(in[4]);
429
1.33k
        check_unicode_value(unicode);
430
1.33k
        if (pair_first)
431
41
        {
432
41
          if (is_utf16_surrogate_second(unicode))
433
14
          {
434
14
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
435
14
            pair_first = 0;
436
14
          }
437
27
          else
438
27
            goto invalid_pair;
439
41
        }
440
1.29k
        else if (is_utf16_surrogate_second(unicode))
441
13
          goto invalid_pair;
442
443
1.29k
        if (is_utf16_surrogate_first(unicode))
444
128
          pair_first = unicode;
445
1.16k
        else
446
1.16k
        {
447
1.16k
          pg_unicode_to_server(unicode, (unsigned char *) out);
448
1.16k
          out += strlen(out);
449
1.16k
        }
450
1.29k
        in += 5;
451
1.29k
      }
452
1.10k
      else if (in[1] == '+' &&
453
1.10k
           isxdigit((unsigned char) in[2]) &&
454
1.10k
           isxdigit((unsigned char) in[3]) &&
455
1.10k
           isxdigit((unsigned char) in[4]) &&
456
1.10k
           isxdigit((unsigned char) in[5]) &&
457
1.10k
           isxdigit((unsigned char) in[6]) &&
458
1.10k
           isxdigit((unsigned char) in[7]))
459
1.04k
      {
460
1.04k
        pg_wchar  unicode;
461
462
1.04k
        unicode = (hexval(in[2]) << 20) +
463
1.04k
          (hexval(in[3]) << 16) +
464
1.04k
          (hexval(in[4]) << 12) +
465
1.04k
          (hexval(in[5]) << 8) +
466
1.04k
          (hexval(in[6]) << 4) +
467
1.04k
          hexval(in[7]);
468
1.04k
        check_unicode_value(unicode);
469
1.04k
        if (pair_first)
470
54
        {
471
54
          if (is_utf16_surrogate_second(unicode))
472
13
          {
473
13
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474
13
            pair_first = 0;
475
13
          }
476
41
          else
477
41
            goto invalid_pair;
478
54
        }
479
993
        else if (is_utf16_surrogate_second(unicode))
480
11
          goto invalid_pair;
481
482
995
        if (is_utf16_surrogate_first(unicode))
483
18
          pair_first = unicode;
484
977
        else
485
977
        {
486
977
          pg_unicode_to_server(unicode, (unsigned char *) out);
487
977
          out += strlen(out);
488
977
        }
489
995
        in += 8;
490
995
      }
491
58
      else
492
58
        ereport(ERROR,
493
2.60k
            (errcode(ERRCODE_SYNTAX_ERROR),
494
2.60k
             errmsg("invalid Unicode escape"),
495
2.60k
             errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496
497
2.60k
      cancel_scanner_errposition_callback(&scbstate);
498
2.60k
    }
499
3.83M
    else
500
3.83M
    {
501
3.83M
      if (pair_first)
502
12
        goto invalid_pair;
503
504
3.83M
      *out++ = *in++;
505
3.83M
    }
506
3.83M
  }
507
508
  /* unfinished surrogate pair? */
509
791
  if (pair_first)
510
28
    goto invalid_pair;
511
512
763
  *out = '\0';
513
763
  return new;
514
515
  /*
516
   * We might get here with the error callback active, or not.  Call
517
   * scanner_errposition to make sure an error cursor appears; if the
518
   * callback is active, this is duplicative but harmless.
519
   */
520
142
invalid_pair:
521
142
  ereport(ERROR,
522
142
      (errcode(ERRCODE_SYNTAX_ERROR),
523
142
       errmsg("invalid Unicode surrogate pair"),
524
142
       scanner_errposition(in - str + position + 3, /* 3 for U&" */
525
142
                 yyscanner)));
526
142
  return NULL;       /* keep compiler quiet */
527
142
}