Coverage Report

Created: 2023-11-19 06:08

/src/libpg_query/src/postgres/src_backend_parser_parser.c
Line
Count
Source (jump to first uncovered line)
1
/*--------------------------------------------------------------------
2
 * Symbols referenced in this file:
3
 * - raw_parser
4
 * - base_yylex
5
 * - check_uescapechar
6
 * - str_udeescape
7
 * - hexval
8
 * - check_unicode_value
9
 * - raw_parser
10
 *--------------------------------------------------------------------
11
 */
12
13
/*-------------------------------------------------------------------------
14
 *
15
 * parser.c
16
 *    Main entry point/driver for PostgreSQL grammar
17
 *
18
 * Note that the grammar is not allowed to perform any table access
19
 * (since we need to be able to do basic parsing even while inside an
20
 * aborted transaction).  Therefore, the data structures returned by
21
 * the grammar are "raw" parsetrees that still need to be analyzed by
22
 * analyze.c and related files.
23
 *
24
 *
25
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
26
 * Portions Copyright (c) 1994, Regents of the University of California
27
 *
28
 * IDENTIFICATION
29
 *    src/backend/parser/parser.c
30
 *
31
 *-------------------------------------------------------------------------
32
 */
33
34
#include "postgres.h"
35
36
#include "mb/pg_wchar.h"
37
#include "parser/gramparse.h"
38
#include "parser/parser.h"
39
#include "parser/scansup.h"
40
41
static bool check_uescapechar(unsigned char escape);
42
static char *str_udeescape(const char *str, char escape,
43
               int position, core_yyscan_t yyscanner);
44
45
46
/*
47
 * raw_parser
48
 *    Given a query in string form, do lexical and grammatical analysis.
49
 *
50
 * Returns a list of raw (un-analyzed) parse trees.  The contents of the
51
 * list have the form required by the specified RawParseMode.
 *
 * str:  the query text; it is handed to scanner_init().
 * mode: RAW_PARSE_DEFAULT starts with no lookahead token; any other mode
 *       primes the lookahead with that mode's entry in mode_token[].
 *
 * scanner_finish() is called before the parse result is examined, so it
 * runs on both return paths below.  A nonzero result from base_yyparse()
 * yields NIL; otherwise yyextra.parsetree is returned.
52
 */
53
List *
54
raw_parser(const char *str, RawParseMode mode)
55
245
{
56
245
  core_yyscan_t yyscanner;
  /* yyextra is a local of this call; its address is passed to scanner_init() and parser_init() */
57
245
  base_yy_extra_type yyextra;
58
245
  int     yyresult;
59
60
  /* initialize the flex scanner */
61
245
  yyscanner = scanner_init(str, &yyextra.core_yy_extra,
62
245
               &ScanKeywords, ScanKeywordTokens);
63
64
  /* base_yylex() only needs us to initialize the lookahead token, if any */
65
245
  if (mode == RAW_PARSE_DEFAULT)
66
245
    yyextra.have_lookahead = false;
67
0
  else
68
0
  {
69
    /* this array is indexed by RawParseMode enum */
70
0
    static const int mode_token[] = {
71
0
      0,          /* RAW_PARSE_DEFAULT */
72
0
      MODE_TYPE_NAME,   /* RAW_PARSE_TYPE_NAME */
73
0
      MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
74
0
      MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
75
0
      MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
76
      MODE_PLPGSQL_ASSIGN3  /* RAW_PARSE_PLPGSQL_ASSIGN3 */
77
0
    };
78
79
0
    yyextra.have_lookahead = true;
    /* NOTE(review): mode is used as the array index with no range check in this function -- confirm callers only pass valid RawParseMode values */
80
0
    yyextra.lookahead_token = mode_token[mode];
    /* the injected mode token is given location 0 and no end pointer; base_yylex() only restores a held character when lookahead_end is non-NULL */
81
0
    yyextra.lookahead_yylloc = 0;
82
0
    yyextra.lookahead_end = NULL;
83
0
  }
84
85
  /* initialize the bison parser */
86
245
  parser_init(&yyextra);
87
88
  /* Parse! */
89
245
  yyresult = base_yyparse(yyscanner);
90
91
  /* Clean up (release memory) */
92
245
  scanner_finish(yyscanner);
93
94
245
  if (yyresult)        /* error */
95
0
    return NIL;
96
97
245
  return yyextra.parsetree;
98
245
}
99
100
101
/*
102
 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
103
 *
104
 * This filter is needed because in some cases the standard SQL grammar
105
 * requires more than one token lookahead.  We reduce these cases to one-token
106
 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
107
 *
108
 * Using a filter is simpler than trying to recognize multiword tokens
109
 * directly in scan.l, because we'd have to allow for comments between the
110
 * words.  Furthermore it's not clear how to do that without re-introducing
111
 * scanner backtrack, which would cost more performance than this filter
112
 * layer does.
113
 *
114
 * We also use this filter to convert UIDENT and USCONST sequences into
115
 * plain IDENT and SCONST tokens.  While that could be handled by additional
116
 * productions in the main grammar, it's more efficient to do it like this.
117
 *
118
 * The filter also provides a convenient place to translate between
119
 * the core_YYSTYPE and YYSTYPE representations (which are really the
120
 * same thing anyway, but notationally they're different).
 *
 * lvalp and llocp receive the semantic value and location of the token
 * that is returned; the return value is the token code, after any
 * replacement described above.  SQL_COMMENT and C_COMMENT tokens are not
 * returned: they are skipped by recursing to fetch the following token.
121
 */
122
int
123
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
124
22.8M
{
125
22.8M
  base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
126
22.8M
  int     cur_token;
127
22.8M
  int     next_token;
128
22.8M
  int     cur_token_length;
129
22.8M
  YYLTYPE    cur_yylloc;
130
131
  /* Get next token --- we might already have it */
132
22.8M
  if (yyextra->have_lookahead)
133
7.68k
  {
134
7.68k
    cur_token = yyextra->lookahead_token;
135
7.68k
    lvalp->core_yystype = yyextra->lookahead_yylval;
136
7.68k
    *llocp = yyextra->lookahead_yylloc;
    /* lookahead_end is NULL for the mode token injected by raw_parser(); in that case there is no held character to put back */
137
7.68k
    if (yyextra->lookahead_end)
138
7.68k
      *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
139
7.68k
    yyextra->have_lookahead = false;
140
7.68k
  }
141
22.8M
  else
142
22.8M
    cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
143
144
  /*
145
   * If this token isn't one that requires lookahead, just return it.  If it
146
   * does, determine the token length.  (We could get that via strlen(), but
147
   * since we have such a small set of possibilities, hardwiring seems
148
   * feasible and more efficient --- at least for the fixed-length cases.)
149
   */
150
22.8M
  switch (cur_token)
151
22.8M
  {
152
7.40k
    case NOT:
153
7.40k
      cur_token_length = 3;
154
7.40k
      break;
155
0
    case NULLS_P:
156
0
      cur_token_length = 5;
157
0
      break;
158
211
    case WITH:
159
211
      cur_token_length = 4;
160
211
      break;
    /* variable-length tokens: measure from the token start up to the terminator that currently ends it in scanbuf (checked by the Assert below) */
161
74
    case UIDENT:
162
79
    case USCONST:
163
79
      cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
164
79
      break;
    /* comment tokens are dropped here: recurse so the caller gets the token that follows the comment */
165
91
    case SQL_COMMENT:
166
94
    case C_COMMENT:
167
94
      return base_yylex(lvalp, llocp, yyscanner);
168
22.8M
    default:
169
22.8M
      return cur_token;
170
22.8M
  }
171
172
  /*
173
   * Identify end+1 of current token.  core_yylex() has temporarily stored a
174
   * '\0' here, and will undo that when we call it again.  We need to redo
175
   * it to fully revert the lookahead call for error reporting purposes.
176
   */
177
7.69k
  yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
178
7.69k
    *llocp + cur_token_length;
179
7.69k
  Assert(*(yyextra->lookahead_end) == '\0');
180
181
  /*
182
   * Save and restore *llocp around the call.  It might look like we could
183
   * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
184
   * does not work because flex actually holds onto the last-passed pointer
185
   * internally, and will use that for error reporting.  We need any error
186
   * reports to point to the current token, not the next one.
187
   */
188
7.69k
  cur_yylloc = *llocp;
189
190
  /* Get next token, saving outputs into lookahead variables */
  /*
   * NOTE(review): the lookahead token is taken as-is from core_yylex().  If
   * it is SQL_COMMENT or C_COMMENT, none of the replacement cases below
   * match it (and after UESCAPE it is not SCONST), unlike a comment seen as
   * the current token, which is skipped above.  Confirm whether a comment
   * between e.g. NOT and IN is meant to suppress the NOT_LA replacement.
   */
191
7.69k
  next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
192
7.69k
  yyextra->lookahead_token = next_token;
193
7.69k
  yyextra->lookahead_yylloc = *llocp;
194
195
7.69k
  *llocp = cur_yylloc;
196
197
  /* Now revert the un-truncation of the current token */
198
7.69k
  yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
199
7.69k
  *(yyextra->lookahead_end) = '\0';
200
  /* the token just read is now buffered: the next call takes it from the lookahead fields instead of calling core_yylex() */
201
7.69k
  yyextra->have_lookahead = true;
202
203
  /* Replace cur_token if needed, based on lookahead */
204
7.69k
  switch (cur_token)
205
7.69k
  {
206
7.40k
    case NOT:
207
      /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
208
7.40k
      switch (next_token)
209
7.40k
      {
210
0
        case BETWEEN:
211
35
        case IN_P:
212
35
        case LIKE:
213
290
        case ILIKE:
214
290
        case SIMILAR:
215
290
          cur_token = NOT_LA;
216
290
          break;
        /* no default: any other following token leaves NOT unchanged */
217
7.40k
      }
218
7.40k
      break;
219
220
7.40k
    case NULLS_P:
221
      /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
222
0
      switch (next_token)
223
0
      {
224
0
        case FIRST_P:
225
0
        case LAST_P:
226
0
          cur_token = NULLS_LA;
227
0
          break;
228
0
      }
229
0
      break;
230
231
211
    case WITH:
232
      /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
233
211
      switch (next_token)
234
211
      {
235
0
        case TIME:
236
0
        case ORDINALITY:
237
0
          cur_token = WITH_LA;
238
0
          break;
239
211
      }
240
211
      break;
241
    /* UIDENT/USCONST: de-escape the token text, then pass it on as a plain IDENT/SCONST */
242
211
    case UIDENT:
243
79
    case USCONST:
244
      /* Look ahead for UESCAPE */
245
79
      if (next_token == UESCAPE)
246
0
      {
247
        /* Yup, so get third token, which had better be SCONST */
248
0
        const char *escstr;
249
250
        /* Again save and restore *llocp */
251
0
        cur_yylloc = *llocp;
252
253
        /* Un-truncate current token so errors point to third token */
254
0
        *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
255
256
        /* Get third token */
257
0
        next_token = core_yylex(&(yyextra->lookahead_yylval),
258
0
                    llocp, yyscanner);
259
260
        /* If we throw error here, it will point to third token */
        /* NOTE(review): escstr is read unconditionally below, so scanner_yyerror() is presumably non-returning -- confirm */
261
0
        if (next_token != SCONST)
262
0
          scanner_yyerror("UESCAPE must be followed by a simple string literal",
263
0
                  yyscanner);
264
265
0
        escstr = yyextra->lookahead_yylval.str;
        /* the escape must be exactly one character, and one that check_uescapechar() accepts */
266
0
        if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
267
0
          scanner_yyerror("invalid Unicode escape character",
268
0
                  yyscanner);
269
270
        /* Now restore *llocp; errors will point to first token */
271
0
        *llocp = cur_yylloc;
272
273
        /* Apply Unicode conversion */
274
0
        lvalp->core_yystype.str =
275
0
          str_udeescape(lvalp->core_yystype.str,
276
0
                  escstr[0],
277
0
                  *llocp,
278
0
                  yyscanner);
279
280
        /*
281
         * We don't need to revert the un-truncation of UESCAPE.  What
282
         * we do want to do is clear have_lookahead, thereby consuming
283
         * all three tokens.
284
         */
285
0
        yyextra->have_lookahead = false;
286
0
      }
287
79
      else
288
79
      {
289
        /* No UESCAPE, so convert using default escape character */
290
79
        lvalp->core_yystype.str =
291
79
          str_udeescape(lvalp->core_yystype.str,
292
79
                  '\\',
293
79
                  *llocp,
294
79
                  yyscanner);
295
79
      }
296
297
79
      if (cur_token == UIDENT)
298
73
      {
299
        /* It's an identifier, so truncate as appropriate */
300
73
        truncate_identifier(lvalp->core_yystype.str,
301
73
                  strlen(lvalp->core_yystype.str),
302
73
                  true);
303
73
        cur_token = IDENT;
304
73
      }
305
6
      else if (cur_token == USCONST)
306
5
      {
307
5
        cur_token = SCONST;
308
5
      }
309
79
      break;
    /* no default needed: only the five token types handled above get past the first switch */
310
7.69k
  }
311
312
7.69k
  return cur_token;
313
7.69k
}
314
315
/* convert hex digit (caller should have verified that) to value */
/*
 * Maps '0'-'9' to 0-9 and 'a'-'f' or 'A'-'F' to 10-15.  Any other byte is
 * reported with elog(ERROR, ...); the final return after that call is
 * marked as not reached.
 */
316
static unsigned int
317
hexval(unsigned char c)
318
270
{
319
270
  if (c >= '0' && c <= '9')
320
215
    return c - '0';
321
55
  if (c >= 'a' && c <= 'f')
322
55
    return c - 'a' + 0xA;
323
0
  if (c >= 'A' && c <= 'F')
324
0
    return c - 'A' + 0xA;
  /* not a hex digit: the caller broke the precondition stated above */
325
0
  elog(ERROR, "invalid hexadecimal digit");
326
0
  return 0;         /* not reached */
327
0
}
328
329
/* is Unicode code point acceptable? */
/*
 * Returns nothing.  If is_valid_unicode_codepoint() rejects c, an error
 * with code ERRCODE_SYNTAX_ERROR and message "invalid Unicode escape value"
 * is raised through ereport(ERROR, ...); otherwise the function just
 * returns.
 */
330
static void
331
check_unicode_value(pg_wchar c)
332
65
{
333
65
  if (!is_valid_unicode_codepoint(c))
334
65
    ereport(ERROR,
335
65
        (errcode(ERRCODE_SYNTAX_ERROR),
336
65
         errmsg("invalid Unicode escape value")));
337
65
}
338
339
/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
/*
 * Returns false for a hex digit, '+', a single quote, a double quote, or
 * any character scanner_isspace() accepts; returns true for everything
 * else.  Presumably these are excluded because they would collide with the
 * hex digits and '+' of the escape forms or with the quoting of the token
 * itself -- confirm against the SQL syntax rules.
 */
340
static bool
341
check_uescapechar(unsigned char escape)
342
0
{
343
0
  if (isxdigit(escape)
344
0
    || escape == '+'
345
0
    || escape == '\''
346
0
    || escape == '"'
347
0
    || scanner_isspace(escape))
348
0
    return false;
349
0
  else
350
0
    return true;
351
0
}
352
353
/*
354
 * Process Unicode escapes in "str", producing a palloc'd plain string
355
 *
356
 * escape: the escape character to use
357
 * position: start position of U&'' or U&"" string token
358
 * yyscanner: context information needed for error reports
 *
 * Recognized sequences, writing E for the escape character:
 *   EE        one literal E
 *   EXXXX     the code point given by four hex digits
 *   E+XXXXXX  the code point given by six hex digits
 * Anything else after E raises "invalid Unicode escape".  A first-half
 * UTF-16 surrogate is held back in pair_first and must be followed
 * directly by an escape that yields the second half; any other
 * continuation, or the end of the string, raises "invalid Unicode
 * surrogate pair".
 *
 * Returns the converted string, terminated with a NUL byte.
359
 */
360
static char *
361
str_udeescape(const char *str, char escape,
362
        int position, core_yyscan_t yyscanner)
363
79
{
364
79
  const char *in;
365
79
  char     *new,
366
79
         *out;
367
79
  size_t    new_len;
  /* nonzero while a first surrogate is waiting for its second half */
368
79
  pg_wchar  pair_first = 0;
369
79
  ScannerCallbackState scbstate;
370
371
  /*
372
   * Guesstimate that result will be no longer than input, but allow enough
373
   * padding for Unicode conversion.
374
   */
375
79
  new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
376
79
  new = palloc(new_len);
377
378
79
  in = str;
379
79
  out = new;
380
3.68M
  while (*in)
381
3.68M
  {
382
    /* Enlarge string if needed */
383
3.68M
    size_t    out_dist = out - new;
384
    /* grow whenever fewer than MAX_UNICODE_EQUIVALENT_STRING + 1 bytes remain beyond the write position */
385
3.68M
    if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
386
0
    {
387
0
      new_len *= 2;
388
0
      new = repalloc(new, new_len);
      /* the buffer pointer was reassigned, so re-derive the write position from the saved offset */
389
0
      out = new + out_dist;
390
0
    }
391
392
3.68M
    if (in[0] == escape)
393
2.65k
    {
394
      /*
395
       * Any errors reported while processing this escape sequence will
396
       * have an error cursor pointing at the escape.
397
       */
398
2.65k
      setup_scanner_errposition_callback(&scbstate, yyscanner,
399
2.65k
                         in - str + position + 3);  /* 3 for U&" */
      /* doubled escape character: emit one literal escape character */
400
2.65k
      if (in[1] == escape)
401
2.58k
      {
402
2.58k
        if (pair_first)
403
0
          goto invalid_pair;
404
2.58k
        *out++ = escape;
405
2.58k
        in += 2;
406
2.58k
      }
      /* escape + 4 hex digits; the && chain stops at the first non-hex byte, so a short tail is never read past its terminating NUL */
407
66
      else if (isxdigit((unsigned char) in[1]) &&
408
66
           isxdigit((unsigned char) in[2]) &&
409
66
           isxdigit((unsigned char) in[3]) &&
410
66
           isxdigit((unsigned char) in[4]))
411
60
      {
412
60
        pg_wchar  unicode;
413
414
60
        unicode = (hexval(in[1]) << 12) +
415
60
          (hexval(in[2]) << 8) +
416
60
          (hexval(in[3]) << 4) +
417
60
          hexval(in[4]);
418
60
        check_unicode_value(unicode);
        /* a pending first surrogate must be completed by a second surrogate right here */
419
60
        if (pair_first)
420
0
        {
421
0
          if (is_utf16_surrogate_second(unicode))
422
0
          {
423
0
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
424
0
            pair_first = 0;
425
0
          }
426
0
          else
427
0
            goto invalid_pair;
428
0
        }
        /* a second surrogate with no first surrogate pending is an error */
429
60
        else if (is_utf16_surrogate_second(unicode))
430
0
          goto invalid_pair;
431
        /* hold back a first surrogate; convert and emit anything else immediately */
432
60
        if (is_utf16_surrogate_first(unicode))
433
0
          pair_first = unicode;
434
60
        else
435
60
        {
436
60
          pg_unicode_to_server(unicode, (unsigned char *) out);
          /* advance past what was written -- presumably pg_unicode_to_server() NUL-terminates its output; confirm */
437
60
          out += strlen(out);
438
60
        }
        /* consumed: the escape character plus four hex digits */
439
60
        in += 5;
440
60
      }
      /* escape, '+', then 6 hex digits; from here on handled the same way as the 4-digit form */
441
6
      else if (in[1] == '+' &&
442
6
           isxdigit((unsigned char) in[2]) &&
443
6
           isxdigit((unsigned char) in[3]) &&
444
6
           isxdigit((unsigned char) in[4]) &&
445
6
           isxdigit((unsigned char) in[5]) &&
446
6
           isxdigit((unsigned char) in[6]) &&
447
6
           isxdigit((unsigned char) in[7]))
448
5
      {
449
5
        pg_wchar  unicode;
450
451
5
        unicode = (hexval(in[2]) << 20) +
452
5
          (hexval(in[3]) << 16) +
453
5
          (hexval(in[4]) << 12) +
454
5
          (hexval(in[5]) << 8) +
455
5
          (hexval(in[6]) << 4) +
456
5
          hexval(in[7]);
457
5
        check_unicode_value(unicode);
458
5
        if (pair_first)
459
0
        {
460
0
          if (is_utf16_surrogate_second(unicode))
461
0
          {
462
0
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
463
0
            pair_first = 0;
464
0
          }
465
0
          else
466
0
            goto invalid_pair;
467
0
        }
468
5
        else if (is_utf16_surrogate_second(unicode))
469
0
          goto invalid_pair;
470
471
5
        if (is_utf16_surrogate_first(unicode))
472
0
          pair_first = unicode;
473
5
        else
474
5
        {
475
5
          pg_unicode_to_server(unicode, (unsigned char *) out);
476
5
          out += strlen(out);
477
5
        }
        /* consumed: the escape character, '+', and six hex digits */
478
5
        in += 8;
479
5
      }
480
1
      else
481
1
        ereport(ERROR,
482
2.65k
            (errcode(ERRCODE_SYNTAX_ERROR),
483
2.65k
             errmsg("invalid Unicode escape"),
484
2.65k
             errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
485
      /* escape handled without error: remove the error-position callback again */
486
2.65k
      cancel_scanner_errposition_callback(&scbstate);
487
2.65k
    }
488
3.68M
    else
489
3.68M
    {
      /* an ordinary character cannot complete a pending surrogate pair */
490
3.68M
      if (pair_first)
491
0
        goto invalid_pair;
492
493
3.68M
      *out++ = *in++;
494
3.68M
    }
495
3.68M
  }
496
497
  /* unfinished surrogate pair? */
498
79
  if (pair_first)
499
0
    goto invalid_pair;
500
501
79
  *out = '\0';
502
79
  return new;
503
504
  /*
505
   * We might get here with the error callback active, or not.  Call
506
   * scanner_errposition to make sure an error cursor appears; if the
507
   * callback is active, this is duplicative but harmless.
508
   */
509
0
invalid_pair:
510
0
  ereport(ERROR,
511
0
      (errcode(ERRCODE_SYNTAX_ERROR),
512
0
       errmsg("invalid Unicode surrogate pair"),
513
0
       scanner_errposition(in - str + position + 3, /* 3 for U&" */
514
0
                 yyscanner)));
515
0
  return NULL;       /* keep compiler quiet */
516
0
}