/src/libpg_query/src/postgres/src_backend_parser_parser.c

Source (jump to first uncovered line)
/*--------------------------------------------------------------------
 * Symbols referenced in this file:
 * - raw_parser
 * - base_yylex
 * - check_uescapechar
 * - str_udeescape
 * - hexval
 * - check_unicode_value
 * - raw_parser
 *--------------------------------------------------------------------
 */

/*-------------------------------------------------------------------------
 *
 * parser.c
 *    Main entry point/driver for PostgreSQL grammar
 *
 * Note that the grammar is not allowed to perform any table access
 * (since we need to be able to do basic parsing even while inside an
 * aborted transaction).  Therefore, the data structures returned by
 * the grammar are "raw" parsetrees that still need to be analyzed by
 * analyze.c and related files.
 *
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/parser/parser.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "mb/pg_wchar.h"
#include "parser/gramparse.h"
#include "parser/parser.h"
#include "parser/scansup.h"

static bool check_uescapechar(unsigned char escape);
static char *str_udeescape(const char *str, char escape,
               int position, core_yyscan_t yyscanner);


/*
 * raw_parser
 *    Run lexical and grammatical analysis over a query string.
 *
 * str:  the query text handed to the flex scanner
 * mode: selects what the grammar should parse.  For every mode other than
 *       RAW_PARSE_DEFAULT, a mode-specific token is pre-loaded into the
 *       lookahead slot, so it is the first token base_yylex() hands out.
 *
 * Returns a list of raw (un-analyzed) parse trees, in the form required by
 * the given RawParseMode, or NIL when base_yyparse() returns nonzero.
 */
List *
raw_parser(const char *str, RawParseMode mode)
{
  /* leading token for each mode; indexed by RawParseMode enum */
  static const int first_token_for_mode[] = {
    0,          /* RAW_PARSE_DEFAULT */
    MODE_TYPE_NAME,   /* RAW_PARSE_TYPE_NAME */
    MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
    MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
    MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
    MODE_PLPGSQL_ASSIGN3  /* RAW_PARSE_PLPGSQL_ASSIGN3 */
  };
  base_yy_extra_type extra;
  core_yyscan_t scanner;
  int     parse_status;

  /* set up the flex scanner over the input string */
  scanner = scanner_init(str, &extra.core_yy_extra,
               &ScanKeywords, ScanKeywordTokens);

  /*
   * The lookahead slot is the only piece of base_yylex() state that has to
   * be initialized here.  It is occupied only for the non-default modes.
   */
  extra.have_lookahead = (mode != RAW_PARSE_DEFAULT);
  if (extra.have_lookahead)
  {
    extra.lookahead_token = first_token_for_mode[mode];
    extra.lookahead_yylloc = 0;
    /* no truncated source character belongs to this synthetic token */
    extra.lookahead_end = NULL;
  }

  /* set up the bison parser, run it, then release scanner memory */
  parser_init(&extra);
  parse_status = base_yyparse(scanner);
  scanner_finish(scanner);

  /* nonzero status means the parse failed */
  return parse_status ? NIL : extra.parsetree;
}


/*
 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
 *
 * This filter is needed because in some cases the standard SQL grammar
 * requires more than one token lookahead.  We reduce these cases to one-token
 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
 *
 * Using a filter is simpler than trying to recognize multiword tokens
 * directly in scan.l, because we'd have to allow for comments between the
 * words.  Furthermore it's not clear how to do that without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
 *
 * We also use this filter to convert UIDENT and USCONST sequences into
 * plain IDENT and SCONST tokens.  While that could be handled by additional
 * productions in the main grammar, it's more efficient to do it like this.
 *
 * The filter also provides a convenient place to translate between
 * the core_YYSTYPE and YYSTYPE representations (which are really the
 * same thing anyway, but notationally they're different).
 *
 * lvalp:     receives the semantic value of the returned token
 * llocp:     receives the location of the returned token (used below as an
 *            offset into the scan buffer)
 * yyscanner: scanner handle; its extra data carries the one-token lookahead
 *            state (have_lookahead, lookahead_token, lookahead_yylval,
 *            lookahead_yylloc, lookahead_end, lookahead_hold_char)
 *
 * Returns the token code for the grammar: either the token as scanned, or
 * its replacement (NOT_LA, NULLS_LA, WITH_LA, IDENT, SCONST).  SQL_COMMENT
 * and C_COMMENT tokens are never returned; they are skipped.
 */
int
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
{
  base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
  int     cur_token;
  int     next_token;
  int     cur_token_length;
  YYLTYPE    cur_yylloc;

  /* Get next token --- we might already have it */
  if (yyextra->have_lookahead)
  {
    cur_token = yyextra->lookahead_token;
    lvalp->core_yystype = yyextra->lookahead_yylval;
    *llocp = yyextra->lookahead_yylloc;
    /*
     * Put back the character we overwrote with '\0' when the lookahead was
     * taken.  lookahead_end is NULL for the mode token that raw_parser()
     * preloads, in which case there is nothing to restore.
     */
    if (yyextra->lookahead_end)
      *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
    yyextra->have_lookahead = false;
  }
  else
    cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

  /*
   * If this token isn't one that requires lookahead, just return it.  If it
   * does, determine the token length.  (We could get that via strlen(), but
   * since we have such a small set of possibilities, hardwiring seems
   * feasible and more efficient --- at least for the fixed-length cases.)
   */
  switch (cur_token)
  {
    case NOT:
      cur_token_length = 3;
      break;
    case NULLS_P:
      cur_token_length = 5;
      break;
    case WITH:
      cur_token_length = 4;
      break;
    case UIDENT:
    case USCONST:
      /* variable-length token: measure up to the '\0' stored after it */
      cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
      break;
    case SQL_COMMENT:
    case C_COMMENT:
      /*
       * Comment tokens are not handed to the grammar: recurse to fetch
       * (and filter) whatever token follows the comment.
       */
      return base_yylex(lvalp, llocp, yyscanner);
    default:
      return cur_token;
  }

  /*
   * Identify end+1 of current token.  core_yylex() has temporarily stored a
   * '\0' here, and will undo that when we call it again.  We need to redo
   * it to fully revert the lookahead call for error reporting purposes.
   */
  yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
    *llocp + cur_token_length;
  Assert(*(yyextra->lookahead_end) == '\0');

  /*
   * Save and restore *llocp around the call.  It might look like we could
   * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
   * does not work because flex actually holds onto the last-passed pointer
   * internally, and will use that for error reporting.  We need any error
   * reports to point to the current token, not the next one.
   */
  cur_yylloc = *llocp;

  /* Get next token, saving outputs into lookahead variables */
  next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
  yyextra->lookahead_token = next_token;
  yyextra->lookahead_yylloc = *llocp;

  *llocp = cur_yylloc;

  /* Now revert the un-truncation of the current token */
  yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
  *(yyextra->lookahead_end) = '\0';

  /* The token just read will be handed out by the next call */
  yyextra->have_lookahead = true;

  /*
   * Replace cur_token if needed, based on lookahead.  The inner switches
   * have no default: any other follower leaves cur_token unchanged.
   */
  switch (cur_token)
  {
    case NOT:
      /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
      switch (next_token)
      {
        case BETWEEN:
        case IN_P:
        case LIKE:
        case ILIKE:
        case SIMILAR:
          cur_token = NOT_LA;
          break;
      }
      break;

    case NULLS_P:
      /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
      switch (next_token)
      {
        case FIRST_P:
        case LAST_P:
          cur_token = NULLS_LA;
          break;
      }
      break;

    case WITH:
      /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
      switch (next_token)
      {
        case TIME:
        case ORDINALITY:
          cur_token = WITH_LA;
          break;
      }
      break;

    case UIDENT:
    case USCONST:
      /* Look ahead for UESCAPE */
      if (next_token == UESCAPE)
      {
        /* Yup, so get third token, which had better be SCONST */
        const char *escstr;

        /* Again save and restore *llocp */
        cur_yylloc = *llocp;

        /* Un-truncate current token so errors point to third token */
        *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;

        /* Get third token */
        next_token = core_yylex(&(yyextra->lookahead_yylval),
                    llocp, yyscanner);

        /* If we throw error here, it will point to third token */
        if (next_token != SCONST)
          scanner_yyerror("UESCAPE must be followed by a simple string literal",
                  yyscanner);

        /* The escape must be exactly one character, and an allowed one */
        escstr = yyextra->lookahead_yylval.str;
        if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
          scanner_yyerror("invalid Unicode escape character",
                  yyscanner);

        /* Now restore *llocp; errors will point to first token */
        *llocp = cur_yylloc;

        /* Apply Unicode conversion */
        lvalp->core_yystype.str =
          str_udeescape(lvalp->core_yystype.str,
                  escstr[0],
                  *llocp,
                  yyscanner);

        /*
         * We don't need to revert the un-truncation of UESCAPE.  What
         * we do want to do is clear have_lookahead, thereby consuming
         * all three tokens.
         */
        yyextra->have_lookahead = false;
      }
      else
      {
        /* No UESCAPE, so convert using default escape character */
        lvalp->core_yystype.str =
          str_udeescape(lvalp->core_yystype.str,
                  '\\',
                  *llocp,
                  yyscanner);
      }

      /* Hand the grammar the plain token type for the converted string */
      if (cur_token == UIDENT)
      {
        /* It's an identifier, so truncate as appropriate */
        truncate_identifier(lvalp->core_yystype.str,
                  strlen(lvalp->core_yystype.str),
                  true);
        cur_token = IDENT;
      }
      else if (cur_token == USCONST)
      {
        cur_token = SCONST;
      }
      break;
  }

  return cur_token;
}

/*
 * Map one hexadecimal digit character to its numeric value (0..15).
 *
 * Callers are expected to have verified the character already; anything
 * that is not 0-9, a-f or A-F is reported through elog(ERROR).
 */
static unsigned int
hexval(unsigned char c)
{
  unsigned int digit;

  if ('0' <= c && c <= '9')
    digit = c - '0';
  else if ('a' <= c && c <= 'f')
    digit = c - 'a' + 0xA;
  else if ('A' <= c && c <= 'F')
    digit = c - 'A' + 0xA;
  else
  {
    elog(ERROR, "invalid hexadecimal digit");
    digit = 0;        /* not reached */
  }

  return digit;
}

/*
 * Validate a Unicode code point.
 *
 * Returns normally when is_valid_unicode_codepoint() accepts the value;
 * otherwise reports a syntax error.
 */
static void
check_unicode_value(pg_wchar c)
{
  if (is_valid_unicode_codepoint(c))
    return;

  ereport(ERROR,
      (errcode(ERRCODE_SYNTAX_ERROR),
       errmsg("invalid Unicode escape value")));
}

/*
 * Decide whether 'escape' may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Returns false for hexadecimal digits, '+', either quote character, and
 * anything scanner_isspace() treats as whitespace; true for everything else.
 */
static bool
check_uescapechar(unsigned char escape)
{
  /* hex digits would be indistinguishable from the escape's payload */
  if (isxdigit(escape))
    return false;

  /* '+' introduces the six-digit form; quotes delimit the literal */
  if (escape == '+' || escape == '\'' || escape == '"')
    return false;

  return !scanner_isspace(escape);
}

/*
 * Process Unicode escapes in "str", producing a palloc'd plain string
 *
 * escape: the escape character to use
 * position: start position of U&'' or U&"" string token
 * yyscanner: context information needed for error reports
 */
static char *
str_udeescape(const char *str, char escape,
        int position, core_yyscan_t yyscanner)
{
  const char *in;
  char     *new,
         *out;
  size_t    new_len;
  pg_wchar  pair_first = 0;
  ScannerCallbackState scbstate;

  /*
   * Guesstimate that result will be no longer than input, but allow enough
   * padding for Unicode conversion.
   */
  new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
  new = palloc(new_len);

  in = str;
  out = new;
  while (*in)
  {
    /* Enlarge string if needed */
    size_t    out_dist = out - new;

    if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
    {
      new_len *= 2;
      new = repalloc(new, new_len);
      out = new + out_dist;
    }

    if (in[0] == escape)
    {
      /*
       * Any errors reported while processing this escape sequence will
       * have an error cursor pointing at the escape.
       */
      setup_scanner_errposition_callback(&scbstate, yyscanner,
                         in - str + position + 3);  /* 3 for U&" */
      if (in[1] == escape)
      {
        if (pair_first)
          goto invalid_pair;
        *out++ = escape;
        in += 2;
      }
      else if (isxdigit((unsigned char) in[1]) &&
           isxdigit((unsigned char) in[2]) &&
           isxdigit((unsigned char) in[3]) &&
           isxdigit((unsigned char) in[4]))
      {
        pg_wchar  unicode;

        unicode = (hexval(in[1]) << 12) +
          (hexval(in[2]) << 8) +
          (hexval(in[3]) << 4) +
          hexval(in[4]);
        check_unicode_value(unicode);
        if (pair_first)
        {
          if (is_utf16_surrogate_second(unicode))
          {
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
            pair_first = 0;
          }
          else
            goto invalid_pair;
        }
        else if (is_utf16_surrogate_second(unicode))
          goto invalid_pair;

        if (is_utf16_surrogate_first(unicode))
          pair_first = unicode;
        else
        {
          pg_unicode_to_server(unicode, (unsigned char *) out);
          out += strlen(out);
        }
        in += 5;
      }
      else if (in[1] == '+' &&
           isxdigit((unsigned char) in[2]) &&
           isxdigit((unsigned char) in[3]) &&
           isxdigit((unsigned char) in[4]) &&
           isxdigit((unsigned char) in[5]) &&
           isxdigit((unsigned char) in[6]) &&
           isxdigit((unsigned char) in[7]))
      {
        pg_wchar  unicode;

        unicode = (hexval(in[2]) << 20) +
          (hexval(in[3]) << 16) +
          (hexval(in[4]) << 12) +
          (hexval(in[5]) << 8) +
          (hexval(in[6]) << 4) +
          hexval(in[7]);
        check_unicode_value(unicode);
        if (pair_first)
        {
          if (is_utf16_surrogate_second(unicode))
          {
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
            pair_first = 0;
          }
          else
            goto invalid_pair;
        }
        else if (is_utf16_surrogate_second(unicode))
          goto invalid_pair;

        if (is_utf16_surrogate_first(unicode))
          pair_first = unicode;
        else
        {
          pg_unicode_to_server(unicode, (unsigned char *) out);
          out += strlen(out);
        }
        in += 8;
      }
      else
        ereport(ERROR,
            (errcode(ERRCODE_SYNTAX_ERROR),
             errmsg("invalid Unicode escape"),
             errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));

      cancel_scanner_errposition_callback(&scbstate);
    }
    else
    {
      if (pair_first)
        goto invalid_pair;

      *out++ = *in++;
    }
  }

  /* unfinished surrogate pair? */
  if (pair_first)
    goto invalid_pair;

  *out = '\0';
  return new;

  /*
   * We might get here with the error callback active, or not.  Call
   * scanner_errposition to make sure an error cursor appears; if the
   * callback is active, this is duplicative but harmless.
   */
invalid_pair:
  ereport(ERROR,
      (errcode(ERRCODE_SYNTAX_ERROR),
       errmsg("invalid Unicode surrogate pair"),
       scanner_errposition(in - str + position + 3, /* 3 for U&" */
                 yyscanner)));
  return NULL;       /* keep compiler quiet */
}

Coverage Report

Created: 2023-11-19 06:08

Line	Count	Source (jump to first uncovered line)
1		/*--------------------------------------------------------------------
2		* Symbols referenced in this file:
3		* - raw_parser
4		* - base_yylex
5		* - check_uescapechar
6		* - str_udeescape
7		* - hexval
8		* - check_unicode_value
9		* - raw_parser
10		*--------------------------------------------------------------------
11		*/
12
13		/*-------------------------------------------------------------------------
14		*
15		* parser.c
16		* Main entry point/driver for PostgreSQL grammar
17		*
18		* Note that the grammar is not allowed to perform any table access
19		* (since we need to be able to do basic parsing even while inside an
20		* aborted transaction). Therefore, the data structures returned by
21		* the grammar are "raw" parsetrees that still need to be analyzed by
22		* analyze.c and related files.
23		*
24		*
25		* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
26		* Portions Copyright (c) 1994, Regents of the University of California
27		*
28		* IDENTIFICATION
29		* src/backend/parser/parser.c
30		*
31		*-------------------------------------------------------------------------
32		*/
33
34		#include "postgres.h"
35
36		#include "mb/pg_wchar.h"
37		#include "parser/gramparse.h"
38		#include "parser/parser.h"
39		#include "parser/scansup.h"
40
41		static bool check_uescapechar(unsigned char escape);
42		static char *str_udeescape(const char *str, char escape,
43		int position, core_yyscan_t yyscanner);
44
45
46		/*
47		* raw_parser
48		* Given a query in string form, do lexical and grammatical analysis.
49		*
50		* Returns a list of raw (un-analyzed) parse trees. The contents of the
51		* list have the form required by the specified RawParseMode.
52		*/
53		List *
54		raw_parser(const char *str, RawParseMode mode)
55	245	{
56	245	core_yyscan_t yyscanner;
57	245	base_yy_extra_type yyextra;
58	245	int yyresult;
59
60		/* initialize the flex scanner */
61	245	yyscanner = scanner_init(str, &yyextra.core_yy_extra,
62	245	&ScanKeywords, ScanKeywordTokens);
63
64		/* base_yylex() only needs us to initialize the lookahead token, if any */
65	245	if (mode == RAW_PARSE_DEFAULT)
66	245	yyextra.have_lookahead = false;
67	0	else
68	0	{
69		/* this array is indexed by RawParseMode enum */
70	0	static const int mode_token[] = {
71	0	0, /* RAW_PARSE_DEFAULT */
72	0	MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */
73	0	MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
74	0	MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
75	0	MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
76		MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */
77	0	};
78
79	0	yyextra.have_lookahead = true;
80	0	yyextra.lookahead_token = mode_token[mode];
81	0	yyextra.lookahead_yylloc = 0;
82	0	yyextra.lookahead_end = NULL;
83	0	}
84
85		/* initialize the bison parser */
86	245	parser_init(&yyextra);
87
88		/* Parse! */
89	245	yyresult = base_yyparse(yyscanner);
90
91		/* Clean up (release memory) */
92	245	scanner_finish(yyscanner);
93
94	245	if (yyresult) /* error */
95	0	return NIL;
96
97	245	return yyextra.parsetree;
98	245	}
99
100
101		/*
102		* Intermediate filter between parser and core lexer (core_yylex in scan.l).
103		*
104		* This filter is needed because in some cases the standard SQL grammar
105		* requires more than one token lookahead. We reduce these cases to one-token
106		* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
107		*
108		* Using a filter is simpler than trying to recognize multiword tokens
109		* directly in scan.l, because we'd have to allow for comments between the
110		* words. Furthermore it's not clear how to do that without re-introducing
111		* scanner backtrack, which would cost more performance than this filter
112		* layer does.
113		*
114		* We also use this filter to convert UIDENT and USCONST sequences into
115		* plain IDENT and SCONST tokens. While that could be handled by additional
116		* productions in the main grammar, it's more efficient to do it like this.
117		*
118		* The filter also provides a convenient place to translate between
119		* the core_YYSTYPE and YYSTYPE representations (which are really the
120		* same thing anyway, but notationally they're different).
121		*/
122		int
123		base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
124	22.8M	{
125	22.8M	base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
126	22.8M	int cur_token;
127	22.8M	int next_token;
128	22.8M	int cur_token_length;
129	22.8M	YYLTYPE cur_yylloc;
130
131		/* Get next token --- we might already have it */
132	22.8M	if (yyextra->have_lookahead)
133	7.68k	{
134	7.68k	cur_token = yyextra->lookahead_token;
135	7.68k	lvalp->core_yystype = yyextra->lookahead_yylval;
136	7.68k	*llocp = yyextra->lookahead_yylloc;
137	7.68k	if (yyextra->lookahead_end)
138	7.68k	*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
139	7.68k	yyextra->have_lookahead = false;
140	7.68k	}
141	22.8M	else
142	22.8M	cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
143
144		/*
145		* If this token isn't one that requires lookahead, just return it. If it
146		* does, determine the token length. (We could get that via strlen(), but
147		* since we have such a small set of possibilities, hardwiring seems
148		* feasible and more efficient --- at least for the fixed-length cases.)
149		*/
150	22.8M	switch (cur_token)
151	22.8M	{
152	7.40k	case NOT:
153	7.40k	cur_token_length = 3;
154	7.40k	break;
155	0	case NULLS_P:
156	0	cur_token_length = 5;
157	0	break;
158	211	case WITH:
159	211	cur_token_length = 4;
160	211	break;
161	74	case UIDENT:
162	79	case USCONST:
163	79	cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
164	79	break;
165	91	case SQL_COMMENT:
166	94	case C_COMMENT:
167	94	return base_yylex(lvalp, llocp, yyscanner);
168	22.8M	default:
169	22.8M	return cur_token;
170	22.8M	}
171
172		/*
173		* Identify end+1 of current token. core_yylex() has temporarily stored a
174		* '\0' here, and will undo that when we call it again. We need to redo
175		* it to fully revert the lookahead call for error reporting purposes.
176		*/
177	7.69k	yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
178	7.69k	*llocp + cur_token_length;
179	7.69k	Assert(*(yyextra->lookahead_end) == '\0');
180
181		/*
182		* Save and restore *llocp around the call. It might look like we could
183		* avoid this by just passing &lookahead_yylloc to core_yylex(), but that
184		* does not work because flex actually holds onto the last-passed pointer
185		* internally, and will use that for error reporting. We need any error
186		* reports to point to the current token, not the next one.
187		*/
188	7.69k	cur_yylloc = *llocp;
189
190		/* Get next token, saving outputs into lookahead variables */
191	7.69k	next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
192	7.69k	yyextra->lookahead_token = next_token;
193	7.69k	yyextra->lookahead_yylloc = *llocp;
194
195	7.69k	*llocp = cur_yylloc;
196
197		/* Now revert the un-truncation of the current token */
198	7.69k	yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
199	7.69k	*(yyextra->lookahead_end) = '\0';
200
201	7.69k	yyextra->have_lookahead = true;
202
203		/* Replace cur_token if needed, based on lookahead */
204	7.69k	switch (cur_token)
205	7.69k	{
206	7.40k	case NOT:
207		/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
208	7.40k	switch (next_token)
209	7.40k	{
210	0	case BETWEEN:
211	35	case IN_P:
212	35	case LIKE:
213	290	case ILIKE:
214	290	case SIMILAR:
215	290	cur_token = NOT_LA;
216	290	break;
217	7.40k	}
218	7.40k	break;
219
220	7.40k	case NULLS_P:
221		/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
222	0	switch (next_token)
223	0	{
224	0	case FIRST_P:
225	0	case LAST_P:
226	0	cur_token = NULLS_LA;
227	0	break;
228	0	}
229	0	break;
230
231	211	case WITH:
232		/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
233	211	switch (next_token)
234	211	{
235	0	case TIME:
236	0	case ORDINALITY:
237	0	cur_token = WITH_LA;
238	0	break;
239	211	}
240	211	break;
241
242	211	case UIDENT:
243	79	case USCONST:
244		/* Look ahead for UESCAPE */
245	79	if (next_token == UESCAPE)
246	0	{
247		/* Yup, so get third token, which had better be SCONST */
248	0	const char *escstr;
249
250		/* Again save and restore *llocp */
251	0	cur_yylloc = *llocp;
252
253		/* Un-truncate current token so errors point to third token */
254	0	*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
255
256		/* Get third token */
257	0	next_token = core_yylex(&(yyextra->lookahead_yylval),
258	0	llocp, yyscanner);
259
260		/* If we throw error here, it will point to third token */
261	0	if (next_token != SCONST)
262	0	scanner_yyerror("UESCAPE must be followed by a simple string literal",
263	0	yyscanner);
264
265	0	escstr = yyextra->lookahead_yylval.str;
266	0	if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
267	0	scanner_yyerror("invalid Unicode escape character",
268	0	yyscanner);
269
270		/* Now restore *llocp; errors will point to first token */
271	0	*llocp = cur_yylloc;
272
273		/* Apply Unicode conversion */
274	0	lvalp->core_yystype.str =
275	0	str_udeescape(lvalp->core_yystype.str,
276	0	escstr[0],
277	0	*llocp,
278	0	yyscanner);
279
280		/*
281		* We don't need to revert the un-truncation of UESCAPE. What
282		* we do want to do is clear have_lookahead, thereby consuming
283		* all three tokens.
284		*/
285	0	yyextra->have_lookahead = false;
286	0	}
287	79	else
288	79	{
289		/* No UESCAPE, so convert using default escape character */
290	79	lvalp->core_yystype.str =
291	79	str_udeescape(lvalp->core_yystype.str,
292	79	'\\',
293	79	*llocp,
294	79	yyscanner);
295	79	}
296
297	79	if (cur_token == UIDENT)
298	73	{
299		/* It's an identifier, so truncate as appropriate */
300	73	truncate_identifier(lvalp->core_yystype.str,
301	73	strlen(lvalp->core_yystype.str),
302	73	true);
303	73	cur_token = IDENT;
304	73	}
305	6	else if (cur_token == USCONST)
306	5	{
307	5	cur_token = SCONST;
308	5	}
309	79	break;
310	7.69k	}
311
312	7.69k	return cur_token;
313	7.69k	}
314
315		/* convert hex digit (caller should have verified that) to value */
316		static unsigned int
317		hexval(unsigned char c)
318	270	{
319	270	if (c >= '0' && c <= '9')
320	215	return c - '0';
321	55	if (c >= 'a' && c <= 'f')
322	55	return c - 'a' + 0xA;
323	0	if (c >= 'A' && c <= 'F')
324	0	return c - 'A' + 0xA;
325	0	elog(ERROR, "invalid hexadecimal digit");
326	0	return 0; /* not reached */
327	0	}
328
329		/* is Unicode code point acceptable? */
330		static void
331		check_unicode_value(pg_wchar c)
332	65	{
333	65	if (!is_valid_unicode_codepoint(c))
334	65	ereport(ERROR,
335	65	(errcode(ERRCODE_SYNTAX_ERROR),
336	65	errmsg("invalid Unicode escape value")));
337	65	}
338
339		/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
340		static bool
341		check_uescapechar(unsigned char escape)
342	0	{
343	0	if (isxdigit(escape)
344	0	|| escape == '+'
345	0	|| escape == '\''
346	0	|| escape == '"'
347	0	|| scanner_isspace(escape))
348	0	return false;
349	0	else
350	0	return true;
351	0	}
352
353		/*
354		* Process Unicode escapes in "str", producing a palloc'd plain string
355		*
356		* escape: the escape character to use
357		* position: start position of U&'' or U&"" string token
358		* yyscanner: context information needed for error reports
359		*/
360		static char *
361		str_udeescape(const char *str, char escape,
362		int position, core_yyscan_t yyscanner)
363	79	{
364	79	const char *in;
365	79	char *new,
366	79	*out;
367	79	size_t new_len;
368	79	pg_wchar pair_first = 0;
369	79	ScannerCallbackState scbstate;
370
371		/*
372		* Guesstimate that result will be no longer than input, but allow enough
373		* padding for Unicode conversion.
374		*/
375	79	new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
376	79	new = palloc(new_len);
377
378	79	in = str;
379	79	out = new;
380	3.68M	while (*in)
381	3.68M	{
382		/* Enlarge string if needed */
383	3.68M	size_t out_dist = out - new;
384
385	3.68M	if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
386	0	{
387	0	new_len *= 2;
388	0	new = repalloc(new, new_len);
389	0	out = new + out_dist;
390	0	}
391
392	3.68M	if (in[0] == escape)
393	2.65k	{
394		/*
395		* Any errors reported while processing this escape sequence will
396		* have an error cursor pointing at the escape.
397		*/
398	2.65k	setup_scanner_errposition_callback(&scbstate, yyscanner,
399	2.65k	in - str + position + 3); /* 3 for U&" */
400	2.65k	if (in[1] == escape)
401	2.58k	{
402	2.58k	if (pair_first)
403	0	goto invalid_pair;
404	2.58k	*out++ = escape;
405	2.58k	in += 2;
406	2.58k	}
407	66	else if (isxdigit((unsigned char) in[1]) &&
408	66	isxdigit((unsigned char) in[2]) &&
409	66	isxdigit((unsigned char) in[3]) &&
410	66	isxdigit((unsigned char) in[4]))
411	60	{
412	60	pg_wchar unicode;
413
414	60	unicode = (hexval(in[1]) << 12) +
415	60	(hexval(in[2]) << 8) +
416	60	(hexval(in[3]) << 4) +
417	60	hexval(in[4]);
418	60	check_unicode_value(unicode);
419	60	if (pair_first)
420	0	{
421	0	if (is_utf16_surrogate_second(unicode))
422	0	{
423	0	unicode = surrogate_pair_to_codepoint(pair_first, unicode);
424	0	pair_first = 0;
425	0	}
426	0	else
427	0	goto invalid_pair;
428	0	}
429	60	else if (is_utf16_surrogate_second(unicode))
430	0	goto invalid_pair;
431
432	60	if (is_utf16_surrogate_first(unicode))
433	0	pair_first = unicode;
434	60	else
435	60	{
436	60	pg_unicode_to_server(unicode, (unsigned char *) out);
437	60	out += strlen(out);
438	60	}
439	60	in += 5;
440	60	}
441	6	else if (in[1] == '+' &&
442	6	isxdigit((unsigned char) in[2]) &&
443	6	isxdigit((unsigned char) in[3]) &&
444	6	isxdigit((unsigned char) in[4]) &&
445	6	isxdigit((unsigned char) in[5]) &&
446	6	isxdigit((unsigned char) in[6]) &&
447	6	isxdigit((unsigned char) in[7]))
448	5	{
449	5	pg_wchar unicode;
450
451	5	unicode = (hexval(in[2]) << 20) +
452	5	(hexval(in[3]) << 16) +
453	5	(hexval(in[4]) << 12) +
454	5	(hexval(in[5]) << 8) +
455	5	(hexval(in[6]) << 4) +
456	5	hexval(in[7]);
457	5	check_unicode_value(unicode);
458	5	if (pair_first)
459	0	{
460	0	if (is_utf16_surrogate_second(unicode))
461	0	{
462	0	unicode = surrogate_pair_to_codepoint(pair_first, unicode);
463	0	pair_first = 0;
464	0	}
465	0	else
466	0	goto invalid_pair;
467	0	}
468	5	else if (is_utf16_surrogate_second(unicode))
469	0	goto invalid_pair;
470
471	5	if (is_utf16_surrogate_first(unicode))
472	0	pair_first = unicode;
473	5	else
474	5	{
475	5	pg_unicode_to_server(unicode, (unsigned char *) out);
476	5	out += strlen(out);
477	5	}
478	5	in += 8;
479	5	}
480	1	else
481	1	ereport(ERROR,
482	2.65k	(errcode(ERRCODE_SYNTAX_ERROR),
483	2.65k	errmsg("invalid Unicode escape"),
484	2.65k	errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
485
486	2.65k	cancel_scanner_errposition_callback(&scbstate);
487	2.65k	}
488	3.68M	else
489	3.68M	{
490	3.68M	if (pair_first)
491	0	goto invalid_pair;
492
493	3.68M	*out++ = *in++;
494	3.68M	}
495	3.68M	}
496
497		/* unfinished surrogate pair? */
498	79	if (pair_first)
499	0	goto invalid_pair;
500
501	79	*out = '\0';
502	79	return new;
503
504		/*
505		* We might get here with the error callback active, or not. Call
506		* scanner_errposition to make sure an error cursor appears; if the
507		* callback is active, this is duplicative but harmless.
508		*/
509	0	invalid_pair:
510	0	ereport(ERROR,
511	0	(errcode(ERRCODE_SYNTAX_ERROR),
512	0	errmsg("invalid Unicode surrogate pair"),
513	0	scanner_errposition(in - str + position + 3, /* 3 for U&" */
514	0	yyscanner)));
515	0	return NULL; /* keep compiler quiet */
516	0	}