/src/postgres/src/backend/parser/parser.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 *
 * parser.c
 *    Main entry point/driver for PostgreSQL grammar
 *
 * Note that the grammar is not allowed to perform any table access
 * (since we need to be able to do basic parsing even while inside an
 * aborted transaction).  Therefore, the data structures returned by
 * the grammar are "raw" parsetrees that still need to be analyzed by
 * analyze.c and related files.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/parser/parser.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "gramparse.h"
#include "mb/pg_wchar.h"
#include "parser/parser.h"
#include "parser/scansup.h"

static bool check_uescapechar(unsigned char escape);
static char *str_udeescape(const char *str, char escape,
               int position, core_yyscan_t yyscanner);


/*
 * raw_parser
 *    Run the lexer and the grammar over a query given in string form.
 *
 * "mode" selects what the grammar should expect.  For every mode other than
 * RAW_PARSE_DEFAULT, a mode-specific token is queued as the lookahead token,
 * so base_yylex() hands it to the grammar before anything read from "str".
 *
 * Returns a list of raw (un-analyzed) parse trees, whose contents have the
 * form required by the specified RawParseMode; returns NIL if base_yyparse()
 * reports failure.
 */
List *
raw_parser(const char *str, RawParseMode mode)
{
  /* token to inject ahead of the input; indexed by RawParseMode enum */
  static const int mode_token[] = {
    [RAW_PARSE_DEFAULT] = 0,
    [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
    [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
    [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
    [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
    [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
  };
  base_yy_extra_type yyextra;
  core_yyscan_t yyscanner;
  int     parse_status;

  /* set up the flex scanner over the input string */
  yyscanner = scanner_init(str, &yyextra.core_yy_extra,
               &ScanKeywords, ScanKeywordTokens);

  /*
   * The only state base_yylex() needs from us is the lookahead token, if
   * any.  An injected token has no text in the scan buffer, hence location
   * zero and no end pointer to un-truncate.
   */
  yyextra.have_lookahead = (mode != RAW_PARSE_DEFAULT);
  if (yyextra.have_lookahead)
  {
    yyextra.lookahead_token = mode_token[mode];
    yyextra.lookahead_yylloc = 0;
    yyextra.lookahead_end = NULL;
  }

  /* set up the bison parser, run it, then release the scanner's memory */
  parser_init(&yyextra);
  parse_status = base_yyparse(yyscanner);
  scanner_finish(yyscanner);

  /* nonzero status from the grammar means a parse error */
  return (parse_status != 0) ? NIL : yyextra.parsetree;
}


/*
 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
 *
 * This filter is needed because in some cases the standard SQL grammar
 * requires more than one token lookahead.  We reduce these cases to one-token
 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
 *
 * Using a filter is simpler than trying to recognize multiword tokens
 * directly in scan.l, because we'd have to allow for comments between the
 * words.  Furthermore it's not clear how to do that without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
 *
 * We also use this filter to convert UIDENT and USCONST sequences into
 * plain IDENT and SCONST tokens.  While that could be handled by additional
 * productions in the main grammar, it's more efficient to do it like this.
 *
 * The filter also provides a convenient place to translate between
 * the core_YYSTYPE and YYSTYPE representations (which are really the
 * same thing anyway, but notationally they're different).
 */
int
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
{
  /*
   * lvalp and llocp receive the semantic value and location of the token
   * whose code is returned.  Lookahead state that must survive between
   * calls lives in the scanner's "extra" struct fetched below.
   */
  base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
  int     cur_token;
  int     next_token;
  int     cur_token_length;
  YYLTYPE    cur_yylloc;

  /* Get next token --- we might already have it */
  if (yyextra->have_lookahead)
  {
    cur_token = yyextra->lookahead_token;
    lvalp->core_yystype = yyextra->lookahead_yylval;
    *llocp = yyextra->lookahead_yylloc;
    /*
     * Undo the '\0' we planted after the previous token, if there is one.
     * lookahead_end is NULL when the lookahead token was injected by
     * raw_parser() rather than scanned from the buffer.
     */
    if (yyextra->lookahead_end)
      *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
    yyextra->have_lookahead = false;
  }
  else
    cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

  /*
   * If this token isn't one that requires lookahead, just return it.  If it
   * does, determine the token length.  (We could get that via strlen(), but
   * since we have such a small set of possibilities, hardwiring seems
   * feasible and more efficient --- at least for the fixed-length cases.)
   */
  switch (cur_token)
  {
    case FORMAT:
      cur_token_length = 6;
      break;
    case NOT:
      cur_token_length = 3;
      break;
    case NULLS_P:
      cur_token_length = 5;
      break;
    case WITH:
      cur_token_length = 4;
      break;
    case UIDENT:
    case USCONST:
      /* variable-length tokens: measure the text starting at *llocp */
      cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
      break;
    case WITHOUT:
      cur_token_length = 7;
      break;
    default:
      return cur_token;
  }

  /*
   * Identify end+1 of current token.  core_yylex() has temporarily stored a
   * '\0' here, and will undo that when we call it again.  We need to redo
   * it to fully revert the lookahead call for error reporting purposes.
   */
  yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
    *llocp + cur_token_length;
  Assert(*(yyextra->lookahead_end) == '\0');

  /*
   * Save and restore *llocp around the call.  It might look like we could
   * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
   * does not work because flex actually holds onto the last-passed pointer
   * internally, and will use that for error reporting.  We need any error
   * reports to point to the current token, not the next one.
   */
  cur_yylloc = *llocp;

  /* Get next token, saving outputs into lookahead variables */
  next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
  yyextra->lookahead_token = next_token;
  yyextra->lookahead_yylloc = *llocp;

  *llocp = cur_yylloc;

  /* Now revert the un-truncation of the current token */
  yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
  *(yyextra->lookahead_end) = '\0';

  /* The token just scanned will be handed out by our next invocation */
  yyextra->have_lookahead = true;

  /*
   * Replace cur_token if needed, based on lookahead.  The inner switches
   * deliberately have no default: any other following token leaves
   * cur_token as it was.
   */
  switch (cur_token)
  {
    case FORMAT:
      /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
      switch (next_token)
      {
        case JSON:
          cur_token = FORMAT_LA;
          break;
      }
      break;

    case NOT:
      /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
      switch (next_token)
      {
        case BETWEEN:
        case IN_P:
        case LIKE:
        case ILIKE:
        case SIMILAR:
          cur_token = NOT_LA;
          break;
      }
      break;

    case NULLS_P:
      /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
      switch (next_token)
      {
        case FIRST_P:
        case LAST_P:
          cur_token = NULLS_LA;
          break;
      }
      break;

    case WITH:
      /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
      switch (next_token)
      {
        case TIME:
        case ORDINALITY:
          cur_token = WITH_LA;
          break;
      }
      break;

    case WITHOUT:
      /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
      switch (next_token)
      {
        case TIME:
          cur_token = WITHOUT_LA;
          break;
      }
      break;

    case UIDENT:
    case USCONST:
      /* Look ahead for UESCAPE */
      if (next_token == UESCAPE)
      {
        /* Yup, so get third token, which had better be SCONST */
        const char *escstr;

        /* Again save and restore *llocp */
        cur_yylloc = *llocp;

        /* Un-truncate current token so errors point to third token */
        *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;

        /* Get third token */
        next_token = core_yylex(&(yyextra->lookahead_yylval),
                    llocp, yyscanner);

        /* If we throw error here, it will point to third token */
        if (next_token != SCONST)
          scanner_yyerror("UESCAPE must be followed by a simple string literal",
                  yyscanner);

        /* The escape must be exactly one character, and an allowed one */
        escstr = yyextra->lookahead_yylval.str;
        if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
          scanner_yyerror("invalid Unicode escape character",
                  yyscanner);

        /* Now restore *llocp; errors will point to first token */
        *llocp = cur_yylloc;

        /* Apply Unicode conversion */
        lvalp->core_yystype.str =
          str_udeescape(lvalp->core_yystype.str,
                  escstr[0],
                  *llocp,
                  yyscanner);

        /*
         * We don't need to revert the un-truncation of UESCAPE.  What
         * we do want to do is clear have_lookahead, thereby consuming
         * all three tokens.
         */
        yyextra->have_lookahead = false;
      }
      else
      {
        /* No UESCAPE, so convert using default escape character */
        lvalp->core_yystype.str =
          str_udeescape(lvalp->core_yystype.str,
                  '\\',
                  *llocp,
                  yyscanner);
      }

      /* Either way, the grammar sees a plain IDENT or SCONST from here on */
      if (cur_token == UIDENT)
      {
        /* It's an identifier, so truncate as appropriate */
        truncate_identifier(lvalp->core_yystype.str,
                  strlen(lvalp->core_yystype.str),
                  true);
        cur_token = IDENT;
      }
      else if (cur_token == USCONST)
      {
        cur_token = SCONST;
      }
      break;
  }

  return cur_token;
}

/*
 * Return the numeric value (0..15) of a hexadecimal digit character.
 *
 * Callers are expected to have verified that "c" is a hex digit; anything
 * else is reported through elog(ERROR).
 */
static unsigned int
hexval(unsigned char c)
{
  unsigned int digit;

  if (c >= '0' && c <= '9')
    digit = c - '0';
  else if (c >= 'a' && c <= 'f')
    digit = 0xA + (c - 'a');
  else if (c >= 'A' && c <= 'F')
    digit = 0xA + (c - 'A');
  else
  {
    elog(ERROR, "invalid hexadecimal digit");
    digit = 0;        /* not reached */
  }

  return digit;
}

/*
 * Raise a syntax error unless "c" passes is_valid_unicode_codepoint().
 * Returns normally when the code point is acceptable.
 */
static void
check_unicode_value(pg_wchar c)
{
  if (is_valid_unicode_codepoint(c))
    return;

  ereport(ERROR,
      (errcode(ERRCODE_SYNTAX_ERROR),
       errmsg("invalid Unicode escape value")));
}

/*
 * Decide whether "escape" may serve as the Unicode escape character given
 * in a UESCAPE clause.
 *
 * Rejected: hexadecimal digits, plus sign, single quote, double quote, and
 * anything scanner_isspace() treats as whitespace.  Returns true for every
 * other character.
 */
static bool
check_uescapechar(unsigned char escape)
{
  switch (escape)
  {
    case '+':
    case '\'':
    case '"':
      return false;
    default:
      break;
  }

  return !(isxdigit(escape) || scanner_isspace(escape));
}

/*
 * Expand the Unicode escapes in "str" and return the result as a newly
 * palloc'd, NUL-terminated string.
 *
 * escape: the escape character in effect
 * position: start position of the U&'' or U&"" string token
 * yyscanner: context information needed for error reports
 *
 * Recognized sequences, with E standing for the escape character:
 *   EE        a literal escape character
 *   EXXXX     code point given by four hex digits
 *   E+XXXXXX  code point given by six hex digits
 * A UTF-16 surrogate pair must be written as two adjacent escapes; a lone
 * or misordered surrogate is a syntax error, as is any malformed escape.
 */
static char *
str_udeescape(const char *str, char escape,
        int position, core_yyscan_t yyscanner)
{
  /* room that must remain free in the buffer before each iteration */
  const size_t headroom = MAX_UNICODE_EQUIVALENT_STRING + 1;
  const char *in = str;
  char     *buf;
  char     *out;
  size_t    buf_size;
  pg_wchar  pending_high = 0; /* first half of a surrogate pair, if any */
  ScannerCallbackState scbstate;

  /*
   * Guesstimate that the result will be no longer than the input, plus
   * padding for one Unicode conversion.
   */
  buf_size = strlen(str) + headroom;
  buf = palloc(buf_size);
  out = buf;

  while (*in != '\0')
  {
    size_t    used = out - buf;

    /* Double the buffer whenever less than the headroom remains */
    if (used > buf_size - headroom)
    {
      buf_size *= 2;
      buf = repalloc(buf, buf_size);
      out = buf + used;
    }

    /* Ordinary character: copy it, unless a surrogate is left dangling */
    if (in[0] != escape)
    {
      if (pending_high)
        goto invalid_pair;

      *out++ = *in++;
      continue;
    }

    /*
     * Any errors reported while processing this escape sequence will have
     * an error cursor pointing at the escape.
     */
    setup_scanner_errposition_callback(&scbstate, yyscanner,
                       in - str + position + 3);  /* 3 for U&" */

    if (in[1] == escape)
    {
      /* doubled escape character stands for itself */
      if (pending_high)
        goto invalid_pair;
      *out++ = escape;
      in += 2;
    }
    else
    {
      const char *digits;
      int     ndigits;
      int     seen;
      pg_wchar  codepoint = 0;

      /* A '+' introduces the six-digit form; otherwise expect four */
      if (in[1] == '+')
      {
        digits = in + 2;
        ndigits = 6;
      }
      else
      {
        digits = in + 1;
        ndigits = 4;
      }

      /*
       * Accumulate hex digits, stopping at the first non-hex character.
       * That includes the terminating NUL, so we never read past the
       * end of the string.
       */
      for (seen = 0; seen < ndigits; seen++)
      {
        unsigned char ch = (unsigned char) digits[seen];

        if (!isxdigit(ch))
          break;
        codepoint = (codepoint << 4) + hexval(ch);
      }

      if (seen != ndigits)
        ereport(ERROR,
            (errcode(ERRCODE_SYNTAX_ERROR),
             errmsg("invalid Unicode escape"),
             errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));

      check_unicode_value(codepoint);

      /* Combine with, or complain about, surrogate halves */
      if (pending_high)
      {
        if (!is_utf16_surrogate_second(codepoint))
          goto invalid_pair;
        codepoint = surrogate_pair_to_codepoint(pending_high, codepoint);
        pending_high = 0;
      }
      else if (is_utf16_surrogate_second(codepoint))
        goto invalid_pair;

      if (is_utf16_surrogate_first(codepoint))
      {
        /* hold it until the second half shows up */
        pending_high = codepoint;
      }
      else
      {
        pg_unicode_to_server(codepoint, (unsigned char *) out);
        out += strlen(out);
      }

      /* step over escape char, optional '+', and the digits */
      in = digits + ndigits;
    }

    cancel_scanner_errposition_callback(&scbstate);
  }

  /* unfinished surrogate pair? */
  if (pending_high)
    goto invalid_pair;

  *out = '\0';
  return buf;

  /*
   * We might get here with the error callback active, or not.  Call
   * scanner_errposition to make sure an error cursor appears; if the
   * callback is active, this is duplicative but harmless.
   */
invalid_pair:
  ereport(ERROR,
      (errcode(ERRCODE_SYNTAX_ERROR),
       errmsg("invalid Unicode surrogate pair"),
       scanner_errposition(in - str + position + 3, /* 3 for U&" */
                 yyscanner)));
  return NULL;       /* keep compiler quiet */
}

Coverage Report

Created: 2025-07-03 06:49

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		*
3		* parser.c
4		* Main entry point/driver for PostgreSQL grammar
5		*
6		* Note that the grammar is not allowed to perform any table access
7		* (since we need to be able to do basic parsing even while inside an
8		* aborted transaction). Therefore, the data structures returned by
9		* the grammar are "raw" parsetrees that still need to be analyzed by
10		* analyze.c and related files.
11		*
12		*
13		* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
14		* Portions Copyright (c) 1994, Regents of the University of California
15		*
16		* IDENTIFICATION
17		* src/backend/parser/parser.c
18		*
19		*-------------------------------------------------------------------------
20		*/
21
22		#include "postgres.h"
23
24		#include "gramparse.h"
25		#include "mb/pg_wchar.h"
26		#include "parser/parser.h"
27		#include "parser/scansup.h"
28
29		static bool check_uescapechar(unsigned char escape);
30		static char *str_udeescape(const char *str, char escape,
31		int position, core_yyscan_t yyscanner);
32
33
34		/*
35		* raw_parser
36		* Given a query in string form, do lexical and grammatical analysis.
37		*
38		* Returns a list of raw (un-analyzed) parse trees. The contents of the
39		* list have the form required by the specified RawParseMode.
40		*/
41		List *
42		raw_parser(const char *str, RawParseMode mode)
43	4.39k	{
44	4.39k	core_yyscan_t yyscanner;
45	4.39k	base_yy_extra_type yyextra;
46	4.39k	int yyresult;
47
48		/* initialize the flex scanner */
49	4.39k	yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50	4.39k	&ScanKeywords, ScanKeywordTokens);
51
52		/* base_yylex() only needs us to initialize the lookahead token, if any */
53	4.39k	if (mode == RAW_PARSE_DEFAULT)
54	0	yyextra.have_lookahead = false;
55	4.39k	else
56	4.39k	{
57		/* this array is indexed by RawParseMode enum */
58	4.39k	static const int mode_token[] = {
59	4.39k	[RAW_PARSE_DEFAULT] = 0,
60	4.39k	[RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
61	4.39k	[RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
62	4.39k	[RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
63	4.39k	[RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
64	4.39k	[RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
65	4.39k	};
66
67	4.39k	yyextra.have_lookahead = true;
68	4.39k	yyextra.lookahead_token = mode_token[mode];
69	4.39k	yyextra.lookahead_yylloc = 0;
70	4.39k	yyextra.lookahead_end = NULL;
71	4.39k	}
72
73		/* initialize the bison parser */
74	4.39k	parser_init(&yyextra);
75
76		/* Parse! */
77	4.39k	yyresult = base_yyparse(yyscanner);
78
79		/* Clean up (release memory) */
80	4.39k	scanner_finish(yyscanner);
81
82	4.39k	if (yyresult) /* error */
83	0	return NIL;
84
85	4.39k	return yyextra.parsetree;
86	4.39k	}
87
88
89		/*
90		* Intermediate filter between parser and core lexer (core_yylex in scan.l).
91		*
92		* This filter is needed because in some cases the standard SQL grammar
93		* requires more than one token lookahead. We reduce these cases to one-token
94		* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95		*
96		* Using a filter is simpler than trying to recognize multiword tokens
97		* directly in scan.l, because we'd have to allow for comments between the
98		* words. Furthermore it's not clear how to do that without re-introducing
99		* scanner backtrack, which would cost more performance than this filter
100		* layer does.
101		*
102		* We also use this filter to convert UIDENT and USCONST sequences into
103		* plain IDENT and SCONST tokens. While that could be handled by additional
104		* productions in the main grammar, it's more efficient to do it like this.
105		*
106		* The filter also provides a convenient place to translate between
107		* the core_YYSTYPE and YYSTYPE representations (which are really the
108		* same thing anyway, but notationally they're different).
109		*/
110		int
111		base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112	10.5M	{
113	10.5M	base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114	10.5M	int cur_token;
115	10.5M	int next_token;
116	10.5M	int cur_token_length;
117	10.5M	YYLTYPE cur_yylloc;
118
119		/* Get next token --- we might already have it */
120	10.5M	if (yyextra->have_lookahead)
121	6.60k	{
122	6.60k	cur_token = yyextra->lookahead_token;
123	6.60k	lvalp->core_yystype = yyextra->lookahead_yylval;
124	6.60k	*llocp = yyextra->lookahead_yylloc;
125	6.60k	if (yyextra->lookahead_end)
126	2.20k	*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127	6.60k	yyextra->have_lookahead = false;
128	6.60k	}
129	10.5M	else
130	10.5M	cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131
132		/*
133		* If this token isn't one that requires lookahead, just return it. If it
134		* does, determine the token length. (We could get that via strlen(), but
135		* since we have such a small set of possibilities, hardwiring seems
136		* feasible and more efficient --- at least for the fixed-length cases.)
137		*/
138	10.5M	switch (cur_token)
139	10.5M	{
140	73	case FORMAT:
141	73	cur_token_length = 6;
142	73	break;
143	1.46k	case NOT:
144	1.46k	cur_token_length = 3;
145	1.46k	break;
146	205	case NULLS_P:
147	205	cur_token_length = 5;
148	205	break;
149	0	case WITH:
150	0	cur_token_length = 4;
151	0	break;
152	438	case UIDENT:
153	911	case USCONST:
154	911	cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155	911	break;
156	0	case WITHOUT:
157	0	cur_token_length = 7;
158	0	break;
159	10.5M	default:
160	10.5M	return cur_token;
161	10.5M	}
162
163		/*
164		* Identify end+1 of current token. core_yylex() has temporarily stored a
165		* '\0' here, and will undo that when we call it again. We need to redo
166		* it to fully revert the lookahead call for error reporting purposes.
167		*/
168	2.65k	yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169	2.65k	*llocp + cur_token_length;
170	2.65k	Assert(*(yyextra->lookahead_end) == '\0');
171
172		/*
173		* Save and restore *llocp around the call. It might look like we could
174		* avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175		* does not work because flex actually holds onto the last-passed pointer
176		* internally, and will use that for error reporting. We need any error
177		* reports to point to the current token, not the next one.
178		*/
179	2.65k	cur_yylloc = *llocp;
180
181		/* Get next token, saving outputs into lookahead variables */
182	2.65k	next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183	2.65k	yyextra->lookahead_token = next_token;
184	2.65k	yyextra->lookahead_yylloc = *llocp;
185
186	2.65k	*llocp = cur_yylloc;
187
188		/* Now revert the un-truncation of the current token */
189	2.65k	yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190	2.65k	*(yyextra->lookahead_end) = '\0';
191
192	2.65k	yyextra->have_lookahead = true;
193
194		/* Replace cur_token if needed, based on lookahead */
195	2.65k	switch (cur_token)
196	2.65k	{
197	73	case FORMAT:
198		/* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199	73	switch (next_token)
200	73	{
201	1	case JSON:
202	1	cur_token = FORMAT_LA;
203	1	break;
204	73	}
205	73	break;
206
207	1.46k	case NOT:
208		/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209	1.46k	switch (next_token)
210	1.46k	{
211	0	case BETWEEN:
212	209	case IN_P:
213	209	case LIKE:
214	209	case ILIKE:
215	209	case SIMILAR:
216	209	cur_token = NOT_LA;
217	209	break;
218	1.46k	}
219	1.46k	break;
220
221	1.46k	case NULLS_P:
222		/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
223	205	switch (next_token)
224	205	{
225	0	case FIRST_P:
226	0	case LAST_P:
227	0	cur_token = NULLS_LA;
228	0	break;
229	205	}
230	205	break;
231
232	205	case WITH:
233		/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234	0	switch (next_token)
235	0	{
236	0	case TIME:
237	0	case ORDINALITY:
238	0	cur_token = WITH_LA;
239	0	break;
240	0	}
241	0	break;
242
243	0	case WITHOUT:
244		/* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245	0	switch (next_token)
246	0	{
247	0	case TIME:
248	0	cur_token = WITHOUT_LA;
249	0	break;
250	0	}
251	0	break;
252
253	435	case UIDENT:
254	905	case USCONST:
255		/* Look ahead for UESCAPE */
256	905	if (next_token == UESCAPE)
257	0	{
258		/* Yup, so get third token, which had better be SCONST */
259	0	const char *escstr;
260
261		/* Again save and restore *llocp */
262	0	cur_yylloc = *llocp;
263
264		/* Un-truncate current token so errors point to third token */
265	0	*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266
267		/* Get third token */
268	0	next_token = core_yylex(&(yyextra->lookahead_yylval),
269	0	llocp, yyscanner);
270
271		/* If we throw error here, it will point to third token */
272	0	if (next_token != SCONST)
273	0	scanner_yyerror("UESCAPE must be followed by a simple string literal",
274	0	yyscanner);
275
276	0	escstr = yyextra->lookahead_yylval.str;
277	0	if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
278	0	scanner_yyerror("invalid Unicode escape character",
279	0	yyscanner);
280
281		/* Now restore *llocp; errors will point to first token */
282	0	*llocp = cur_yylloc;
283
284		/* Apply Unicode conversion */
285	0	lvalp->core_yystype.str =
286	0	str_udeescape(lvalp->core_yystype.str,
287	0	escstr[0],
288	0	*llocp,
289	0	yyscanner);
290
291		/*
292		* We don't need to revert the un-truncation of UESCAPE. What
293		* we do want to do is clear have_lookahead, thereby consuming
294		* all three tokens.
295		*/
296	0	yyextra->have_lookahead = false;
297	0	}
298	905	else
299	905	{
300		/* No UESCAPE, so convert using default escape character */
301	905	lvalp->core_yystype.str =
302	905	str_udeescape(lvalp->core_yystype.str,
303	905	'\\',
304	905	*llocp,
305	905	yyscanner);
306	905	}
307
308	905	if (cur_token == UIDENT)
309	276	{
310		/* It's an identifier, so truncate as appropriate */
311	276	truncate_identifier(lvalp->core_yystype.str,
312	276	strlen(lvalp->core_yystype.str),
313	276	true);
314	276	cur_token = IDENT;
315	276	}
316	629	else if (cur_token == USCONST)
317	261	{
318	261	cur_token = SCONST;
319	261	}
320	905	break;
321	2.65k	}
322
323	2.27k	return cur_token;
324	2.65k	}
325
326		/* convert hex digit (caller should have verified that) to value */
327		static unsigned int
328		hexval(unsigned char c)
329	11.6k	{
330	11.6k	if (c >= '0' && c <= '9')
331	9.17k	return c - '0';
332	2.43k	if (c >= 'a' && c <= 'f')
333	1.16k	return c - 'a' + 0xA;
334	1.26k	if (c >= 'A' && c <= 'F')
335	1.26k	return c - 'A' + 0xA;
336	0	elog(ERROR, "invalid hexadecimal digit");
337	0	return 0; /* not reached */
338	0	}
339
340		/* is Unicode code point acceptable? */
341		static void
342		check_unicode_value(pg_wchar c)
343	2.37k	{
344	2.37k	if (!is_valid_unicode_codepoint(c))
345	2.37k	ereport(ERROR,
346	2.37k	(errcode(ERRCODE_SYNTAX_ERROR),
347	2.37k	errmsg("invalid Unicode escape value")));
348	2.37k	}
349
350		/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
351		static bool
352		check_uescapechar(unsigned char escape)
353	0	{
354	0	if (isxdigit(escape)
355	0	|| escape == '+'
356	0	|| escape == '\''
357	0	|| escape == '"'
358	0	|| scanner_isspace(escape))
359	0	return false;
360	0	else
361	0	return true;
362	0	}
363
364		/*
365		* Process Unicode escapes in "str", producing a palloc'd plain string
366		*
367		* escape: the escape character to use
368		* position: start position of U&'' or U&"" string token
369		* yyscanner: context information needed for error reports
370		*/
371		static char *
372		str_udeescape(const char *str, char escape,
373		int position, core_yyscan_t yyscanner)
374	905	{
375	905	const char *in;
376	905	char *new,
377	905	*out;
378	905	size_t new_len;
379	905	pg_wchar pair_first = 0;
380	905	ScannerCallbackState scbstate;
381
382		/*
383		* Guesstimate that result will be no longer than input, but allow enough
384		* padding for Unicode conversion.
385		*/
386	905	new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
387	905	new = palloc(new_len);
388
389	905	in = str;
390	905	out = new;
391	3.83M	while (*in)
392	3.83M	{
393		/* Enlarge string if needed */
394	3.83M	size_t out_dist = out - new;
395
396	3.83M	if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397	0	{
398	0	new_len *= 2;
399	0	new = repalloc(new, new_len);
400	0	out = new + out_dist;
401	0	}
402
403	3.83M	if (in[0] == escape)
404	2.70k	{
405		/*
406		* Any errors reported while processing this escape sequence will
407		* have an error cursor pointing at the escape.
408		*/
409	2.70k	setup_scanner_errposition_callback(&scbstate, yyscanner,
410	2.70k	in - str + position + 3); /* 3 for U&" */
411	2.70k	if (in[1] == escape)
412	266	{
413	266	if (pair_first)
414	10	goto invalid_pair;
415	256	*out++ = escape;
416	256	in += 2;
417	256	}
418	2.43k	else if (isxdigit((unsigned char) in[1]) &&
419	2.43k	isxdigit((unsigned char) in[2]) &&
420	2.43k	isxdigit((unsigned char) in[3]) &&
421	2.43k	isxdigit((unsigned char) in[4]))
422	1.33k	{
423	1.33k	pg_wchar unicode;
424
425	1.33k	unicode = (hexval(in[1]) << 12) +
426	1.33k	(hexval(in[2]) << 8) +
427	1.33k	(hexval(in[3]) << 4) +
428	1.33k	hexval(in[4]);
429	1.33k	check_unicode_value(unicode);
430	1.33k	if (pair_first)
431	41	{
432	41	if (is_utf16_surrogate_second(unicode))
433	14	{
434	14	unicode = surrogate_pair_to_codepoint(pair_first, unicode);
435	14	pair_first = 0;
436	14	}
437	27	else
438	27	goto invalid_pair;
439	41	}
440	1.29k	else if (is_utf16_surrogate_second(unicode))
441	13	goto invalid_pair;
442
443	1.29k	if (is_utf16_surrogate_first(unicode))
444	128	pair_first = unicode;
445	1.16k	else
446	1.16k	{
447	1.16k	pg_unicode_to_server(unicode, (unsigned char *) out);
448	1.16k	out += strlen(out);
449	1.16k	}
450	1.29k	in += 5;
451	1.29k	}
452	1.10k	else if (in[1] == '+' &&
453	1.10k	isxdigit((unsigned char) in[2]) &&
454	1.10k	isxdigit((unsigned char) in[3]) &&
455	1.10k	isxdigit((unsigned char) in[4]) &&
456	1.10k	isxdigit((unsigned char) in[5]) &&
457	1.10k	isxdigit((unsigned char) in[6]) &&
458	1.10k	isxdigit((unsigned char) in[7]))
459	1.04k	{
460	1.04k	pg_wchar unicode;
461
462	1.04k	unicode = (hexval(in[2]) << 20) +
463	1.04k	(hexval(in[3]) << 16) +
464	1.04k	(hexval(in[4]) << 12) +
465	1.04k	(hexval(in[5]) << 8) +
466	1.04k	(hexval(in[6]) << 4) +
467	1.04k	hexval(in[7]);
468	1.04k	check_unicode_value(unicode);
469	1.04k	if (pair_first)
470	54	{
471	54	if (is_utf16_surrogate_second(unicode))
472	13	{
473	13	unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474	13	pair_first = 0;
475	13	}
476	41	else
477	41	goto invalid_pair;
478	54	}
479	993	else if (is_utf16_surrogate_second(unicode))
480	11	goto invalid_pair;
481
482	995	if (is_utf16_surrogate_first(unicode))
483	18	pair_first = unicode;
484	977	else
485	977	{
486	977	pg_unicode_to_server(unicode, (unsigned char *) out);
487	977	out += strlen(out);
488	977	}
489	995	in += 8;
490	995	}
491	58	else
492	58	ereport(ERROR,
493	2.60k	(errcode(ERRCODE_SYNTAX_ERROR),
494	2.60k	errmsg("invalid Unicode escape"),
495	2.60k	errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496
497	2.60k	cancel_scanner_errposition_callback(&scbstate);
498	2.60k	}
499	3.83M	else
500	3.83M	{
501	3.83M	if (pair_first)
502	12	goto invalid_pair;
503
504	3.83M	*out++ = *in++;
505	3.83M	}
506	3.83M	}
507
508		/* unfinished surrogate pair? */
509	791	if (pair_first)
510	28	goto invalid_pair;
511
512	763	*out = '\0';
513	763	return new;
514
515		/*
516		* We might get here with the error callback active, or not. Call
517		* scanner_errposition to make sure an error cursor appears; if the
518		* callback is active, this is duplicative but harmless.
519		*/
520	142	invalid_pair:
521	142	ereport(ERROR,
522	142	(errcode(ERRCODE_SYNTAX_ERROR),
523	142	errmsg("invalid Unicode surrogate pair"),
524	142	scanner_errposition(in - str + position + 3, /* 3 for U&" */
525	142	yyscanner)));
526	142	return NULL; /* keep compiler quiet */
527	142	}