/src/postgres/src/backend/parser/scansup.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * scansup.c |
4 | | * scanner support routines used by the core lexer |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/parser/scansup.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | #include "postgres.h" |
16 | | |
17 | | #include <ctype.h> |
18 | | |
19 | | #include "mb/pg_wchar.h" |
20 | | #include "parser/scansup.h" |
21 | | |
22 | | |
/*
 * downcase_truncate_identifier() --- fold an unquoted identifier to lower
 * case and truncate it if it is too long, optionally warning on truncation.
 *
 * This is a convenience entry point: it forwards to downcase_identifier()
 * with truncation always enabled.
 *
 * The return value is a palloc'd string holding the adjusted identifier.
 *
 * Note: some callers pass a string that is not null-terminated, so only
 * the first "len" bytes of "ident" may be examined.
 *
 * Note: this API was shaped so that a downcasing transformation could make
 * the string longer, but no such transformation is supported yet.  Anyone
 * implementing one must also fix SplitIdentifierString() in
 * utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
	char	   *adjusted;

	/* "true" requests truncation of over-length identifiers */
	adjusted = downcase_identifier(ident, len, warn, true);

	return adjusted;
}
41 | | |
42 | | /* |
43 | | * a workhorse for downcase_truncate_identifier |
44 | | */ |
45 | | char * |
46 | | downcase_identifier(const char *ident, int len, bool warn, bool truncate) |
47 | 5.44M | { |
48 | 5.44M | char *result; |
49 | 5.44M | int i; |
50 | 5.44M | bool enc_is_single_byte; |
51 | | |
52 | 5.44M | result = palloc(len + 1); |
53 | 5.44M | enc_is_single_byte = pg_database_encoding_max_length() == 1; |
54 | | |
55 | | /* |
56 | | * SQL99 specifies Unicode-aware case normalization, which we don't yet |
57 | | * have the infrastructure for. Instead we use tolower() to provide a |
58 | | * locale-aware translation. However, there are some locales where this |
59 | | * is not right either (eg, Turkish may do strange things with 'i' and |
60 | | * 'I'). Our current compromise is to use tolower() for characters with |
61 | | * the high bit set, as long as they aren't part of a multi-byte |
62 | | * character, and use an ASCII-only downcasing for 7-bit characters. |
63 | | */ |
64 | 21.9M | for (i = 0; i < len; i++) |
65 | 16.4M | { |
66 | 16.4M | unsigned char ch = (unsigned char) ident[i]; |
67 | | |
68 | 16.4M | if (ch >= 'A' && ch <= 'Z') |
69 | 8.80M | ch += 'a' - 'A'; |
70 | 7.67M | else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch)) |
71 | 0 | ch = tolower(ch); |
72 | 16.4M | result[i] = (char) ch; |
73 | 16.4M | } |
74 | 5.44M | result[i] = '\0'; |
75 | | |
76 | 5.44M | if (i >= NAMEDATALEN && truncate) |
77 | 1.63k | truncate_identifier(result, i, warn); |
78 | | |
79 | 5.44M | return result; |
80 | 5.44M | } |
81 | | |
82 | | |
83 | | /* |
84 | | * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes. |
85 | | * |
86 | | * The given string is modified in-place, if necessary. A warning is |
87 | | * issued if requested. |
88 | | * |
89 | | * We require the caller to pass in the string length since this saves a |
90 | | * strlen() call in some common usages. |
91 | | */ |
92 | | void |
93 | | truncate_identifier(char *ident, int len, bool warn) |
94 | 2.32k | { |
95 | 2.32k | if (len >= NAMEDATALEN) |
96 | 1.92k | { |
97 | 1.92k | len = pg_mbcliplen(ident, len, NAMEDATALEN - 1); |
98 | 1.92k | if (warn) |
99 | 1.92k | ereport(NOTICE, |
100 | 1.92k | (errcode(ERRCODE_NAME_TOO_LONG), |
101 | 1.92k | errmsg("identifier \"%s\" will be truncated to \"%.*s\"", |
102 | 1.92k | ident, len, ident))); |
103 | 1.92k | ident[len] = '\0'; |
104 | 1.92k | } |
105 | 2.32k | } |
106 | | |
/*
 * scanner_isspace() --- does the flex scanner treat this char as whitespace?
 *
 * Use this rather than isspace(), which may be locale-dependent, whenever
 * the answer has to agree with the lexer.
 *
 * Similar helpers for isalnum and friends might be needed in principle,
 * but so far only isspace has been required.
 */
bool
scanner_isspace(char ch)
{
	/* This must match scan.l's list of {space} characters */
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\v':
		case '\f':
			return true;
		default:
			return false;
	}
}