/src/postgres/src/port/chklocale.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * chklocale.c |
4 | | * Functions for handling locale-related info |
5 | | * |
6 | | * |
7 | | * Copyright (c) 1996-2025, PostgreSQL Global Development Group |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/port/chklocale.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | |
16 | | #ifndef FRONTEND |
17 | | #include "postgres.h" |
18 | | #else |
19 | | #include "postgres_fe.h" |
20 | | #endif |
21 | | |
22 | | #ifndef WIN32 |
23 | | #include <langinfo.h> |
24 | | #endif |
25 | | |
26 | | #include "mb/pg_wchar.h" |
27 | | |
28 | | |
29 | | /* |
30 | | * This table needs to recognize all the CODESET spellings for supported |
31 | | * backend encodings, as well as frontend-only encodings where possible |
32 | | * (the latter case is currently only needed for initdb to recognize |
33 | | * error situations). On Windows, we rely on entries for codepage |
34 | | * numbers (CPnnn). |
35 | | * |
36 | | * Note that we search the table with pg_strcasecmp(), so variant |
37 | | * capitalizations don't need their own entries. |
38 | | */ |
39 | | struct encoding_match /* one table entry: a system encoding name and the pg_enc it maps to */ |
40 | | { |
41 | | enum pg_enc pg_enc_code; /* PostgreSQL encoding selected when the name matches */ |
42 | | const char *system_enc_name; /* system codeset/codepage spelling; NULL only in the end-marker entry */ |
43 | | }; |
44 | | |
45 | | static const struct encoding_match encoding_match_list[] = { /* scanned front to back with pg_strcasecmp() until the NULL-name end marker */ |
46 | | {PG_EUC_JP, "EUC-JP"}, |
47 | | {PG_EUC_JP, "eucJP"}, |
48 | | {PG_EUC_JP, "IBM-eucJP"}, |
49 | | {PG_EUC_JP, "sdeckanji"}, |
50 | | {PG_EUC_JP, "CP20932"}, |
51 | | |
52 | | {PG_EUC_CN, "EUC-CN"}, |
53 | | {PG_EUC_CN, "eucCN"}, |
54 | | {PG_EUC_CN, "IBM-eucCN"}, |
55 | | {PG_EUC_CN, "GB2312"}, |
56 | | {PG_EUC_CN, "dechanzi"}, |
57 | | {PG_EUC_CN, "CP20936"}, |
58 | | |
59 | | {PG_EUC_KR, "EUC-KR"}, |
60 | | {PG_EUC_KR, "eucKR"}, |
61 | | {PG_EUC_KR, "IBM-eucKR"}, |
62 | | {PG_EUC_KR, "deckorean"}, |
63 | | {PG_EUC_KR, "5601"}, |
64 | | {PG_EUC_KR, "CP51949"}, |
65 | | |
66 | | {PG_EUC_TW, "EUC-TW"}, |
67 | | {PG_EUC_TW, "eucTW"}, |
68 | | {PG_EUC_TW, "IBM-eucTW"}, |
69 | | {PG_EUC_TW, "cns11643"}, |
70 | | /* No codepage for EUC-TW ? */ |
71 | | |
72 | | {PG_UTF8, "UTF-8"}, |
73 | | {PG_UTF8, "utf8"}, |
74 | | {PG_UTF8, "CP65001"}, |
75 | | |
76 | | {PG_LATIN1, "ISO-8859-1"}, |
77 | | {PG_LATIN1, "ISO8859-1"}, |
78 | | {PG_LATIN1, "iso88591"}, |
79 | | {PG_LATIN1, "CP28591"}, |
80 | | |
81 | | {PG_LATIN2, "ISO-8859-2"}, |
82 | | {PG_LATIN2, "ISO8859-2"}, |
83 | | {PG_LATIN2, "iso88592"}, |
84 | | {PG_LATIN2, "CP28592"}, |
85 | | |
86 | | {PG_LATIN3, "ISO-8859-3"}, |
87 | | {PG_LATIN3, "ISO8859-3"}, |
88 | | {PG_LATIN3, "iso88593"}, |
89 | | {PG_LATIN3, "CP28593"}, |
90 | | |
91 | | {PG_LATIN4, "ISO-8859-4"}, |
92 | | {PG_LATIN4, "ISO8859-4"}, |
93 | | {PG_LATIN4, "iso88594"}, |
94 | | {PG_LATIN4, "CP28594"}, |
95 | | |
96 | | {PG_LATIN5, "ISO-8859-9"}, |
97 | | {PG_LATIN5, "ISO8859-9"}, |
98 | | {PG_LATIN5, "iso88599"}, |
99 | | {PG_LATIN5, "CP28599"}, |
100 | | |
101 | | {PG_LATIN6, "ISO-8859-10"}, |
102 | | {PG_LATIN6, "ISO8859-10"}, |
103 | | {PG_LATIN6, "iso885910"}, |
104 | | |
105 | | {PG_LATIN7, "ISO-8859-13"}, |
106 | | {PG_LATIN7, "ISO8859-13"}, |
107 | | {PG_LATIN7, "iso885913"}, |
108 | | |
109 | | {PG_LATIN8, "ISO-8859-14"}, |
110 | | {PG_LATIN8, "ISO8859-14"}, |
111 | | {PG_LATIN8, "iso885914"}, |
112 | | |
113 | | {PG_LATIN9, "ISO-8859-15"}, |
114 | | {PG_LATIN9, "ISO8859-15"}, |
115 | | {PG_LATIN9, "iso885915"}, |
116 | | {PG_LATIN9, "CP28605"}, |
117 | | |
118 | | {PG_LATIN10, "ISO-8859-16"}, |
119 | | {PG_LATIN10, "ISO8859-16"}, |
120 | | {PG_LATIN10, "iso885916"}, |
121 | | |
122 | | {PG_KOI8R, "KOI8-R"}, |
123 | | {PG_KOI8R, "CP20866"}, |
124 | | |
125 | | {PG_KOI8U, "KOI8-U"}, |
126 | | {PG_KOI8U, "CP21866"}, |
127 | | |
128 | | {PG_WIN866, "CP866"}, |
129 | | {PG_WIN874, "CP874"}, |
130 | | {PG_WIN1250, "CP1250"}, |
131 | | {PG_WIN1251, "CP1251"}, |
132 | | {PG_WIN1251, "ansi-1251"}, |
133 | | {PG_WIN1252, "CP1252"}, |
134 | | {PG_WIN1253, "CP1253"}, |
135 | | {PG_WIN1254, "CP1254"}, |
136 | | {PG_WIN1255, "CP1255"}, |
137 | | {PG_WIN1256, "CP1256"}, |
138 | | {PG_WIN1257, "CP1257"}, |
139 | | {PG_WIN1258, "CP1258"}, |
140 | | |
141 | | {PG_ISO_8859_5, "ISO-8859-5"}, |
142 | | {PG_ISO_8859_5, "ISO8859-5"}, |
143 | | {PG_ISO_8859_5, "iso88595"}, |
144 | | {PG_ISO_8859_5, "CP28595"}, |
145 | | |
146 | | {PG_ISO_8859_6, "ISO-8859-6"}, |
147 | | {PG_ISO_8859_6, "ISO8859-6"}, |
148 | | {PG_ISO_8859_6, "iso88596"}, |
149 | | {PG_ISO_8859_6, "CP28596"}, |
150 | | |
151 | | {PG_ISO_8859_7, "ISO-8859-7"}, |
152 | | {PG_ISO_8859_7, "ISO8859-7"}, |
153 | | {PG_ISO_8859_7, "iso88597"}, |
154 | | {PG_ISO_8859_7, "CP28597"}, |
155 | | |
156 | | {PG_ISO_8859_8, "ISO-8859-8"}, |
157 | | {PG_ISO_8859_8, "ISO8859-8"}, |
158 | | {PG_ISO_8859_8, "iso88598"}, |
159 | | {PG_ISO_8859_8, "CP28598"}, |
160 | | |
161 | | {PG_SJIS, "SJIS"}, |
162 | | {PG_SJIS, "PCK"}, |
163 | | {PG_SJIS, "CP932"}, |
164 | | {PG_SJIS, "SHIFT_JIS"}, |
165 | | |
166 | | {PG_BIG5, "BIG5"}, |
167 | | {PG_BIG5, "BIG5HKSCS"}, |
168 | | {PG_BIG5, "Big5-HKSCS"}, |
169 | | {PG_BIG5, "CP950"}, |
170 | | |
171 | | {PG_GBK, "GBK"}, |
172 | | {PG_GBK, "CP936"}, |
173 | | |
174 | | {PG_UHC, "UHC"}, |
175 | | {PG_UHC, "CP949"}, |
176 | | |
177 | | {PG_JOHAB, "JOHAB"}, |
178 | | {PG_JOHAB, "CP1361"}, |
179 | | |
180 | | {PG_GB18030, "GB18030"}, |
181 | | {PG_GB18030, "CP54936"}, |
182 | | |
183 | | {PG_SHIFT_JIS_2004, "SJIS_2004"}, |
184 | | |
185 | | {PG_SQL_ASCII, "US-ASCII"}, |
186 | | |
187 | | {PG_SQL_ASCII, NULL} /* end marker */ |
188 | | }; |
189 | | |
190 | | #ifdef WIN32 |
191 | | /* |
192 | | * On Windows, use CP<code page number> instead of CODESET. |
193 | | * |
194 | | * This routine uses GetLocaleInfoEx() to parse short locale names like |
195 | | * "de-DE", "fr-FR", etc. If those cannot be parsed correctly, it falls |
196 | | * back to the pre-VS-2010 manual parsing, which uses |
197 | | * <Language>_<Country>.<CodePage> as a base. |
198 | | * |
199 | | * Returns a malloc()'d string for the caller to free, or NULL if malloc() fails or the fallback finds no '.' in ctype. |
200 | | */ |
201 | | static char * |
202 | | win32_get_codeset(const char *ctype) |
203 | | { |
204 | | char *r = NULL; /* result; stays NULL on every failure path */ |
205 | | char *codepage; |
206 | | uint32 cp; |
207 | | WCHAR wctype[LOCALE_NAME_MAX_LENGTH]; |
208 | | |
209 | | memset(wctype, 0, sizeof(wctype)); |
210 | | MultiByteToWideChar(CP_ACP, 0, ctype, -1, wctype, LOCALE_NAME_MAX_LENGTH); /* NOTE(review): return value is not checked */ |
211 | | |
212 | | if (GetLocaleInfoEx(wctype, |
213 | | LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER, |
214 | | (LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0) |
215 | | { |
216 | | r = malloc(16); /* excess */ |
217 | | if (r != NULL) |
218 | | { |
219 | | /* |
220 | | * If the return value is CP_ACP that means no ANSI code page is |
221 | | * available, so only Unicode can be used for the locale. |
222 | | */ |
223 | | if (cp == CP_ACP) |
224 | | strcpy(r, "utf8"); |
225 | | else |
226 | | sprintf(r, "CP%u", cp); |
227 | | } |
228 | | } |
229 | | else |
230 | | { |
231 | | /* |
232 | | * Locale format on Win32 is <Language>_<Country>.<CodePage>. For |
233 | | * example, English_United States.1252. If we see digits after the |
234 | | * last dot, assume it's a codepage number. Otherwise, we might be |
235 | | * dealing with a Unix-style locale string; Windows' setlocale() will |
236 | | * take those even though GetLocaleInfoEx() won't, so we end up here. |
237 | | * In that case, just return what's after the last dot and hope we can |
238 | | * find it in our table. |
239 | | */ |
240 | | codepage = strrchr(ctype, '.'); |
241 | | if (codepage != NULL) |
242 | | { |
243 | | size_t ln; |
244 | | |
245 | | codepage++; /* step past the dot */ |
246 | | ln = strlen(codepage); |
247 | | r = malloc(ln + 3); /* room for a "CP" prefix plus the terminating NUL */ |
248 | | if (r != NULL) |
249 | | { |
250 | | if (strspn(codepage, "0123456789") == ln) /* true when the suffix is all digits (or empty) */ |
251 | | sprintf(r, "CP%s", codepage); |
252 | | else |
253 | | strcpy(r, codepage); |
254 | | } |
255 | | } |
256 | | } |
257 | | |
258 | | return r; |
259 | | } |
260 | | |
261 | | #ifndef FRONTEND |
262 | | /* |
263 | | * Given a Windows code page identifier, find the corresponding PostgreSQL |
264 | | * encoding. Issue a warning and return -1 if none found. The code page is looked up in encoding_match_list under its "CP<number>" spelling. |
265 | | */ |
266 | | int |
267 | | pg_codepage_to_encoding(UINT cp) |
268 | | { |
269 | | char sys[16]; /* holds the "CP<number>" lookup key */ |
270 | | int i; |
271 | | |
272 | | sprintf(sys, "CP%u", cp); |
273 | | |
274 | | /* Check the table */ |
275 | | for (i = 0; encoding_match_list[i].system_enc_name; i++) /* stops at the NULL-name end marker */ |
276 | | if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0) |
277 | | return encoding_match_list[i].pg_enc_code; /* first case-insensitive match wins */ |
278 | | |
279 | | ereport(WARNING, |
280 | | (errmsg("could not determine encoding for codeset \"%s\"", sys))); |
281 | | |
282 | | return -1; |
283 | | } |
284 | | #endif |
285 | | #endif /* WIN32 */ |
286 | | |
287 | | /* |
288 | | * Given a setting for LC_CTYPE, return the Postgres ID of the associated |
289 | | * encoding, if we can determine it. Return -1 if we can't determine it. |
290 | | * |
291 | | * Pass in NULL to get the encoding for the current locale setting. |
292 | | * Pass "" to get the encoding selected by the server's environment. |
293 | | * |
294 | | * If the result is PG_SQL_ASCII, callers should treat it as being compatible |
295 | | * with any desired encoding. |
296 | | * |
297 | | * If running in the backend and write_message is false, this function must |
298 | | * cope with the possibility that elog() and palloc() are not yet usable. |
299 | | */ |
300 | | int |
301 | | pg_get_encoding_from_locale(const char *ctype, bool write_message) |
302 | 0 | { |
303 | 0 | char *sys; /* malloc'd copy of the codeset name; freed on every return path once it is non-NULL */ |
304 | 0 | int i; |
305 | |
|
306 | 0 | #ifndef WIN32 |
307 | 0 | locale_t loc; |
308 | 0 | #endif |
309 | | |
310 | | /* Get the CODESET property, and also LC_CTYPE if not passed in */ |
311 | 0 | if (!ctype) |
312 | 0 | ctype = setlocale(LC_CTYPE, NULL); /* NOTE(review): result is used below without a NULL check */ |
313 | | |
314 | | |
315 | | /* If locale is C or POSIX, we can allow all encodings */ |
316 | 0 | if (pg_strcasecmp(ctype, "C") == 0 || |
317 | 0 | pg_strcasecmp(ctype, "POSIX") == 0) |
318 | 0 | return PG_SQL_ASCII; |
319 | | |
320 | | |
321 | 0 | #ifndef WIN32 |
322 | 0 | loc = newlocale(LC_CTYPE_MASK, ctype, (locale_t) 0); /* temporary locale object; released by freelocale() below */ |
323 | 0 | if (loc == (locale_t) 0) |
324 | 0 | return -1; /* bogus ctype passed in? */ |
325 | | |
326 | 0 | sys = nl_langinfo_l(CODESET, loc); |
327 | 0 | if (sys) |
328 | 0 | sys = strdup(sys); /* take our own copy before the locale object is freed */ |
329 | |
|
330 | 0 | freelocale(loc); |
331 | | #else |
332 | | sys = win32_get_codeset(ctype); |
333 | | #endif |
334 | |
|
335 | 0 | if (!sys) |
336 | 0 | return -1; /* out of memory (unlikely), or on Windows no codeset could be derived from ctype */ |
337 | | |
338 | | /* Check the table */ |
339 | 0 | for (i = 0; encoding_match_list[i].system_enc_name; i++) |
340 | 0 | { |
341 | 0 | if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0) |
342 | 0 | { |
343 | 0 | free(sys); |
344 | 0 | return encoding_match_list[i].pg_enc_code; |
345 | 0 | } |
346 | 0 | } |
347 | | |
348 | | /* Special-case kluges for particular platforms go here */ |
349 | | |
350 | | #ifdef __darwin__ |
351 | | |
352 | | /* |
353 | | * Current macOS has many locales that report an empty string for CODESET, |
354 | | * but they all seem to actually use UTF-8. |
355 | | */ |
356 | | if (strlen(sys) == 0) |
357 | | { |
358 | | free(sys); |
359 | | return PG_UTF8; |
360 | | } |
361 | | #endif |
362 | | |
363 | | /* |
364 | | * We print a warning if we got a CODESET string but couldn't recognize |
365 | | * it. This means we need another entry in the table. |
366 | | */ |
367 | 0 | if (write_message) |
368 | 0 | { |
369 | | #ifdef FRONTEND |
370 | | fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""), |
371 | | ctype, sys); |
372 | | /* keep newline separate so there's only one translatable string */ |
373 | | fputc('\n', stderr); |
374 | | #else |
375 | 0 | ereport(WARNING, |
376 | 0 | (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"", |
377 | 0 | ctype, sys))); |
378 | 0 | #endif |
379 | 0 | } |
380 | | |
381 | 0 | free(sys); /* unrecognized codeset: release our copy and report failure */ |
382 | 0 | return -1; |
383 | 0 | } |