/src/postgres/src/backend/parser/scansup.c

Source
/*-------------------------------------------------------------------------
 *
 * scansup.c
 *    scanner support routines used by the core lexer
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/parser/scansup.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <ctype.h>

#include "mb/pg_wchar.h"
#include "parser/scansup.h"


/*
 * downcase_truncate_identifier() --- downcase an unquoted identifier and
 * truncate it if it is too long, optionally warning about the truncation.
 *
 * This is a thin wrapper: it calls downcase_identifier() with the truncate
 * flag forced to true and hands back that function's result, which is a
 * palloc'd string holding the adjusted identifier.
 *
 *  ident   the identifier text; it need not be null-terminated, since
 *          some callers pass unterminated strings
 *  len     number of bytes of ident to process
 *  warn    passed through to control whether truncation is reported
 *
 * Note: the API is shaped so that a downcasing transformation could someday
 * lengthen the string, but that is not supported yet.  Anyone implementing
 * it will also need to fix SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
  char     *adjusted;

  /* Same work as downcase_identifier(), with truncation always requested. */
  adjusted = downcase_identifier(ident, len, warn, true);

  return adjusted;
}

/*
 * downcase_identifier() --- the workhorse behind
 * downcase_truncate_identifier().
 *
 * Copies the first len bytes of ident into a freshly palloc'd buffer of
 * len + 1 bytes, downcasing as it goes, and null-terminates the copy.  When
 * truncate is true and the copy is NAMEDATALEN bytes or longer, the copy is
 * passed to truncate_identifier() along with the warn flag.
 *
 * Returns the palloc'd copy.
 */
char *
downcase_identifier(const char *ident, int len, bool warn, bool truncate)
{
  char     *folded = palloc(len + 1);
  bool    single_byte = (pg_database_encoding_max_length() == 1);
  int     pos = 0;

  /*
   * SQL99 calls for Unicode-aware case normalization, for which the
   * infrastructure does not exist yet.  tolower() gives a locale-aware
   * translation instead, but that is wrong in some locales too (Turkish,
   * for example, may do strange things with 'i' and 'I').  The compromise:
   * 7-bit characters get plain ASCII downcasing, while tolower() is applied
   * only to high-bit-set characters, and only when they cannot be part of a
   * multi-byte character.
   */
  while (pos < len)
  {
    unsigned char c = (unsigned char) ident[pos];

    if (c >= 'A' && c <= 'Z')
      c = (unsigned char) (c - 'A' + 'a');
    else if (single_byte && IS_HIGHBIT_SET(c) && isupper(c))
      c = (unsigned char) tolower(c);

    folded[pos++] = (char) c;
  }
  folded[pos] = '\0';

  /* pos is now the number of bytes copied */
  if (truncate && pos >= NAMEDATALEN)
    truncate_identifier(folded, pos, warn);

  return folded;
}


/*
 * truncate_identifier() --- shorten an identifier so that it fits in
 * NAMEDATALEN-1 bytes.
 *
 * Nothing happens when len is below NAMEDATALEN.  Otherwise the string is
 * cut in place at the length chosen by pg_mbcliplen() (presumably so that a
 * multi-byte character is never split), and a NOTICE is raised first if
 * warn is true.
 *
 * The caller supplies the string length because that saves a strlen() call
 * in some common usages.
 */
void
truncate_identifier(char *ident, int len, bool warn)
{
  int     clipped;

  /* Short enough already: leave the string alone. */
  if (len < NAMEDATALEN)
    return;

  clipped = pg_mbcliplen(ident, len, NAMEDATALEN - 1);

  /*
   * Report before writing the terminator, since the message shows both the
   * full original text and its truncated prefix.
   */
  if (warn)
    ereport(NOTICE,
        (errcode(ERRCODE_NAME_TOO_LONG),
         errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
            ident, clipped, ident)));

  ident[clipped] = '\0';
}

/*
 * scanner_isspace() --- report whether the flex scanner treats a character
 * as whitespace.
 *
 * Use this in place of isspace(), whose answer may depend on the locale,
 * wherever matching the lexer's behavior matters.
 *
 * Similar functions for isalnum and friends might be needed in principle,
 * but so far only isspace has been required.
 */
bool
scanner_isspace(char ch)
{
  /* This set must match scan.l's list of {space} characters */
  switch (ch)
  {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\v':
    case '\f':
      return true;
    default:
      return false;
  }
}

Line	Count	Source
1		/*-------------------------------------------------------------------------
2		*
3		* scansup.c
4		* scanner support routines used by the core lexer
5		*
6		* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7		* Portions Copyright (c) 1994, Regents of the University of California
8		*
9		*
10		* IDENTIFICATION
11		* src/backend/parser/scansup.c
12		*
13		*-------------------------------------------------------------------------
14		*/
15		#include "postgres.h"
16
17		#include <ctype.h>
18
19		#include "mb/pg_wchar.h"
20		#include "parser/scansup.h"
21
22
23		/*
24		* downcase_truncate_identifier() --- do appropriate downcasing and
25		* truncation of an unquoted identifier. Optionally warn of truncation.
26		*
27		* Returns a palloc'd string containing the adjusted identifier.
28		*
29		* Note: in some usages the passed string is not null-terminated.
30		*
31		* Note: the API of this function is designed to allow for downcasing
32		* transformations that increase the string length, but we don't yet
33		* support that. If you want to implement it, you'll need to fix
34		* SplitIdentifierString() in utils/adt/varlena.c.
35		*/
36		char *
37		downcase_truncate_identifier(const char *ident, int len, bool warn)
38	5.44M	{
39	5.44M	return downcase_identifier(ident, len, warn, true);
40	5.44M	}
41
42		/*
43		* a workhorse for downcase_truncate_identifier
44		*/
45		char *
46		downcase_identifier(const char *ident, int len, bool warn, bool truncate)
47	5.44M	{
48	5.44M	char *result;
49	5.44M	int i;
50	5.44M	bool enc_is_single_byte;
51
52	5.44M	result = palloc(len + 1);
53	5.44M	enc_is_single_byte = pg_database_encoding_max_length() == 1;
54
55		/*
56		* SQL99 specifies Unicode-aware case normalization, which we don't yet
57		* have the infrastructure for. Instead we use tolower() to provide a
58		* locale-aware translation. However, there are some locales where this
59		* is not right either (eg, Turkish may do strange things with 'i' and
60		* 'I'). Our current compromise is to use tolower() for characters with
61		* the high bit set, as long as they aren't part of a multi-byte
62		* character, and use an ASCII-only downcasing for 7-bit characters.
63		*/
64	21.9M	for (i = 0; i < len; i++)
65	16.4M	{
66	16.4M	unsigned char ch = (unsigned char) ident[i];
67
68	16.4M	if (ch >= 'A' && ch <= 'Z')
69	8.80M	ch += 'a' - 'A';
70	7.67M	else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
71	0	ch = tolower(ch);
72	16.4M	result[i] = (char) ch;
73	16.4M	}
74	5.44M	result[i] = '\0';
75
76	5.44M	if (i >= NAMEDATALEN && truncate)
77	1.63k	truncate_identifier(result, i, warn);
78
79	5.44M	return result;
80	5.44M	}
81
82
83		/*
84		* truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
85		*
86		* The given string is modified in-place, if necessary. A warning is
87		* issued if requested.
88		*
89		* We require the caller to pass in the string length since this saves a
90		* strlen() call in some common usages.
91		*/
92		void
93		truncate_identifier(char *ident, int len, bool warn)
94	2.32k	{
95	2.32k	if (len >= NAMEDATALEN)
96	1.92k	{
97	1.92k	len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);
98	1.92k	if (warn)
99	1.92k	ereport(NOTICE,
100	1.92k	(errcode(ERRCODE_NAME_TOO_LONG),
101	1.92k	errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
102	1.92k	ident, len, ident)));
103	1.92k	ident[len] = '\0';
104	1.92k	}
105	2.32k	}
106
107		/*
108		* scanner_isspace() --- return true if flex scanner considers char whitespace
109		*
110		* This should be used instead of the potentially locale-dependent isspace()
111		* function when it's important to match the lexer's behavior.
112		*
113		* In principle we might need similar functions for isalnum etc, but for the
114		* moment only isspace seems needed.
115		*/
116		bool
117		scanner_isspace(char ch)
118	72	{
119		/* This must match scan.l's list of {space} characters */
120	72	if (ch == ' ' \|\|
121	68	ch == '\t' \|\|
122	68	ch == '\n' \|\|
123	68	ch == '\r' \|\|
124	68	ch == '\v' \|\|
125	68	ch == '\f')
126	4	return true;
127	68	return false;
128	72	}

Coverage Report

Created: 2025-09-27 06:52