/src/gdal/port/cpl_recode.cpp
Line | Count | Source |
1 | | /********************************************************************** |
2 | | * |
3 | | * Name: cpl_recode.cpp |
4 | | * Project: CPL - Common Portability Library |
5 | | * Purpose: Character set recoding and char/wchar_t conversions. |
6 | | * Author: Andrey Kiselev, dron@ak4719.spb.edu |
7 | | * |
8 | | ********************************************************************** |
9 | | * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu> |
10 | | * Copyright (c) 2008, Frank Warmerdam |
11 | | * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com> |
12 | | * |
13 | | * Permission to use, copy, modify, and distribute this software for any |
14 | | * purpose with or without fee is hereby granted, provided that the above |
15 | | * copyright notice and this permission notice appear in all copies. |
16 | | * |
17 | | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
18 | | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
19 | | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
20 | | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
21 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
22 | | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
23 | | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
24 | | **********************************************************************/ |
25 | | |
26 | | #include "cpl_port.h" |
27 | | #include "cpl_string.h" |
28 | | |
29 | | #include <cstring> |
30 | | |
31 | | #include "cpl_conv.h" |
32 | | #include "cpl_character_sets.h" |
33 | | |
34 | | #include "utf8.h" |
35 | | |
36 | | #ifdef CPL_RECODE_ICONV |
37 | | extern void CPLClearRecodeIconvWarningFlags(); |
38 | | extern char *CPLRecodeIconv(const char *, const char *, |
39 | | const char *) CPL_RETURNS_NONNULL; |
40 | | extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *, |
41 | | const char *); |
42 | | extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *); |
43 | | #endif // CPL_RECODE_ICONV |
44 | | |
45 | | extern void CPLClearRecodeStubWarningFlags(); |
46 | | extern char *CPLRecodeStub(const char *, const char *, |
47 | | const char *) CPL_RETURNS_NONNULL; |
48 | | extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *, |
49 | | const char *); |
50 | | extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *); |
51 | | extern int CPLIsUTF8Stub(const char *, int); |
52 | | |
53 | | /************************************************************************/ |
54 | | /* CPLRecode() */ |
55 | | /************************************************************************/ |
56 | | |
57 | | /** |
58 | | * Convert a string from a source encoding to a destination encoding. |
59 | | * |
60 | | * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII |
61 | | * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported: |
62 | | * <ul> |
63 | | * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion is |
64 | | * needed: the input is duplicated with CPLStrdup())</li> |
65 | | * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li> |
66 | | * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li> |
67 | | * </ul> |
68 | | * |
69 | | * If an error occurs, an error may or may not be posted with CPLError(). |
70 | | * |
71 | | * @param pszSource a NUL terminated string. |
72 | | * @param pszSrcEncoding the source encoding. |
73 | | * @param pszDstEncoding the destination encoding. |
74 | | * |
75 | | * @return a NUL terminated string which should be freed with CPLFree(). |
76 | | * |
77 | | */ |
78 | | |
79 | | char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding, |
80 | | const char *pszDstEncoding) |
81 | | |
82 | 0 | { |
83 | | /* -------------------------------------------------------------------- */ |
84 | | /* Handle a few common shortcuts: these return a plain copy. */ |
85 | | /* -------------------------------------------------------------------- */ |
86 | 0 | if (EQUAL(pszSrcEncoding, pszDstEncoding)) |
87 | 0 | return CPLStrdup(pszSource); |
88 | | |
89 | 0 | if (EQUAL(pszSrcEncoding, CPL_ENC_ASCII) && |
90 | 0 | (EQUAL(pszDstEncoding, CPL_ENC_UTF8) || |
91 | 0 | EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1))) |
92 | 0 | return CPLStrdup(pszSource); |
93 | | |
94 | | // Hard-coded CPxxx/ISO-8859-x to UTF-8 tables are served by the stub. |
95 | 0 | if (EQUAL(pszDstEncoding, CPL_ENC_UTF8) && |
96 | 0 | CPLGetConversionTableToUTF8(pszSrcEncoding)) |
97 | 0 | { |
98 | 0 | return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); |
99 | 0 | } |
100 | | |
101 | 0 | #ifdef CPL_RECODE_ICONV |
102 | | /* -------------------------------------------------------------------- */ |
103 | | /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */ |
104 | | /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled */ |
105 | | /* very well by the stub implementation which is faster than the */ |
106 | | /* iconv() route. Use the stub for these two and iconv() for */ |
107 | | /* everything else. */ |
108 | | /* -------------------------------------------------------------------- */ |
109 | 0 | if ((EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) && |
110 | 0 | EQUAL(pszDstEncoding, CPL_ENC_UTF8)) || |
111 | 0 | (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) && |
112 | 0 | EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1))) |
113 | 0 | { |
114 | 0 | return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); |
115 | 0 | } |
116 | | #ifdef _WIN32 |
117 | | else if (((EQUAL(pszSrcEncoding, "CP_ACP") || |
118 | | EQUAL(pszSrcEncoding, "CP_OEMCP")) && |
119 | | EQUAL(pszDstEncoding, CPL_ENC_UTF8)) || |
120 | | (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) && |
121 | | (EQUAL(pszDstEncoding, "CP_ACP") || |
122 | | EQUAL(pszDstEncoding, "CP_OEMCP")))) |
123 | | { |
124 | | return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); |
125 | | } |
126 | | #endif |
127 | 0 | else |
128 | 0 | { |
129 | 0 | return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding); |
130 | 0 | } |
131 | | #else // CPL_RECODE_STUB |
132 | | return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); |
133 | | #endif // CPL_RECODE_ICONV |
134 | 0 | } |
135 | | |
136 | | /************************************************************************/ |
137 | | /* CPLRecodeFromWChar() */ |
138 | | /************************************************************************/ |
139 | | |
140 | | /** |
141 | | * Convert a wchar_t string to a multi-byte string (typically UTF-8). |
142 | | * |
143 | | * Convert a wchar_t string into a multi-byte UTF-8 string. The only |
144 | | * guaranteed supported source encoding is CPL_ENC_UCS2, and the only |
145 | | * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII |
146 | | * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings |
147 | | * may also be supported. |
148 | | * |
149 | | * Note that the wchar_t type varies in size on different systems. On |
150 | | * win32 it is normally 2 bytes, and on UNIX 4 bytes. |
151 | | * |
152 | | * If an error occurs, an error may or may not be posted with CPLError(). |
153 | | * |
154 | | * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t. |
155 | | * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2. |
156 | | * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8. |
157 | | * |
158 | | * @return a zero terminated multi-byte string which should be freed with |
159 | | * CPLFree(), or NULL if an error occurs. |
160 | | * |
161 | | */ |
162 | | |
163 | | char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource, |
164 | | const char *pszSrcEncoding, |
165 | | const char *pszDstEncoding) |
166 | | |
167 | 0 | { |
168 | 0 | #ifdef CPL_RECODE_ICONV |
169 | | /* -------------------------------------------------------------------- */ |
170 | | /* Conversions from CPL_ENC_UCS2 (or "WCHAR_T") */ |
171 | | /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */ |
172 | | /* handled by the stub implementation; iconv() does the rest. */ |
173 | | /* -------------------------------------------------------------------- */ |
174 | 0 | if ((EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || |
175 | 0 | EQUAL(pszSrcEncoding, "WCHAR_T")) && |
176 | 0 | (EQUAL(pszDstEncoding, CPL_ENC_UTF8) || |
177 | 0 | EQUAL(pszDstEncoding, CPL_ENC_ASCII) || |
178 | 0 | EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1))) |
179 | 0 | { |
180 | 0 | return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, |
181 | 0 | pszDstEncoding); |
182 | 0 | } |
183 | | |
184 | 0 | return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding, pszDstEncoding); |
185 | 

186 | | #else // CPL_RECODE_STUB |
187 | | return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding); |
188 | | #endif // CPL_RECODE_ICONV |
189 | 0 | } |
190 | | |
191 | | /************************************************************************/ |
192 | | /* CPLRecodeToWChar() */ |
193 | | /************************************************************************/ |
194 | | |
195 | | /** |
196 | | * Convert a multi-byte string (typically UTF-8) to a wchar_t string. |
197 | | * |
198 | | * Convert an 8-bit, multi-byte per character input string into a wide |
199 | | * character (wchar_t) string. The only guaranteed supported source encodings |
200 | | * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1). The only |
201 | | * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source |
202 | | * and destination encodings may be supported depending on the underlying |
203 | | * implementation. |
204 | | * |
205 | | * Note that the wchar_t type varies in size on different systems. On |
206 | | * win32 it is normally 2 bytes, and on UNIX 4 bytes. |
207 | | * |
208 | | * If an error occurs, an error may or may not be posted with CPLError(). |
209 | | * |
210 | | * @param pszSource input multi-byte character string. |
211 | | * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8. |
212 | | * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. |
213 | | * |
214 | | * @return the zero terminated wchar_t string (to be freed with CPLFree()) or |
215 | | * NULL on error. |
216 | | * |
217 | | */ |
218 | | |
219 | | wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource, |
220 | | const char *pszSrcEncoding, |
221 | | const char *pszDstEncoding) |
222 | | |
223 | 0 | { |
224 | 0 | #ifdef CPL_RECODE_ICONV |
225 | | /* -------------------------------------------------------------------- */ |
226 | | /* Conversions to CPL_ENC_UCS2 (or "WCHAR_T") */ |
227 | | /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */ |
228 | | /* handled by the stub implementation; iconv() does the rest. */ |
229 | | /* -------------------------------------------------------------------- */ |
230 | 0 | if ((EQUAL(pszDstEncoding, CPL_ENC_UCS2) || |
231 | 0 | EQUAL(pszDstEncoding, "WCHAR_T")) && |
232 | 0 | (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) || |
233 | 0 | EQUAL(pszSrcEncoding, CPL_ENC_ASCII) || |
234 | 0 | EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1))) |
235 | 0 | { |
236 | 0 | return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding); |
237 | 0 | } |
238 | | |
239 | 0 | return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding, pszDstEncoding); |
240 | 

241 | | #else // CPL_RECODE_STUB |
242 | | return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding); |
243 | | #endif // CPL_RECODE_ICONV |
244 | 0 | } |
245 | | |
246 | | /************************************************************************/ |
247 | | /* CPLIsASCII() */ |
248 | | /************************************************************************/ |
249 | | |
250 | | /** |
251 | | * Test if a string contains only ASCII bytes (no byte value above 127). |
252 | | * |
253 | | * @param pabyData input string to test |
254 | | * @param nLen length of the input string, or static_cast<size_t>(-1) to have |
255 | | * it computed with strlen(); the input must then be NUL terminated. |
256 | | * @return true if every byte is <= 127 (also for an empty input), else false |
257 | | * |
258 | | * @since GDAL 3.6.0 |
259 | | */ |
260 | | bool CPLIsASCII(const char *pabyData, size_t nLen) |
261 | 0 | { |
262 | 0 | if (nLen == static_cast<size_t>(-1)) |
263 | 0 | nLen = strlen(pabyData); |
264 | 0 | for (size_t i = 0; i < nLen; ++i) |
265 | 0 | { |
266 | 0 | if (static_cast<unsigned char>(pabyData[i]) > 127) |
267 | 0 | return false; |
268 | 0 | } |
269 | 0 | return true; |
270 | 0 | } |
271 | | |
272 | | /************************************************************************/ |
273 | | /* CPLForceToASCII() */ |
274 | | /************************************************************************/ |
275 | | |
276 | | /** |
277 | | * Return a new string that is made only of ASCII characters. Every byte of |
278 | | * the input whose value is above 127 is replaced by the provided replacement |
279 | | * character; all other bytes are copied unchanged. |
280 | | * |
281 | | * This function does not make any assumption on the encoding of the input |
282 | | * string (except it must be NUL-terminated if nLen is negative, or have at |
283 | | * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when |
284 | | * the input string is known to be UTF-8 encoded. |
285 | | * |
286 | | * @param pabyData input string to convert |
287 | | * @param nLen length of the input string, or a negative value (e.g. -1) if the |
288 | | * function must compute it with strlen() on a NUL terminated input. |
289 | | |
290 | | * @param chReplacementChar character which will be used when the input stream |
291 | | * contains a non ASCII character. Must be valid ASCII! |
292 | | * |
293 | | * @return a new NUL-terminated string that must be freed with CPLFree(). |
294 | | * |
295 | | */ |
296 | | char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar) |
297 | 0 | { |
298 | 0 | const size_t nRealLen = |
299 | 0 | (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData); |
300 | 0 | char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1)); |
301 | 0 | const char *pszPtr = pabyData; |
302 | 0 | const char *pszEnd = pabyData + nRealLen; |
303 | 0 | size_t i = 0; |
304 | 0 | while (pszPtr != pszEnd) |
305 | 0 | { |
306 | 0 | if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127) |
307 | 0 | { |
308 | 0 | pszOutputString[i] = chReplacementChar; |
309 | 0 | ++pszPtr; |
310 | 0 | ++i; |
311 | 0 | } |
312 | 0 | else |
313 | 0 | { |
314 | 0 | pszOutputString[i] = *pszPtr; |
315 | 0 | ++pszPtr; |
316 | 0 | ++i; |
317 | 0 | } |
318 | 0 | } |
319 | 0 | pszOutputString[i] = '\0'; |
320 | 0 | return pszOutputString; |
321 | 0 | } |
322 | | |
323 | | /************************************************************************/ |
324 | | /* CPLUTF8ForceToASCII() */ |
325 | | /************************************************************************/ |
326 | | |
327 | | /** |
328 | | * Return a new string that is made only of ASCII characters. If non-ASCII |
329 | | * characters are found in the input string, for which an "equivalent" ASCII |
330 | | * character is not found, they will be replaced by the provided replacement |
331 | | * character. |
332 | | * |
333 | | * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement |
334 | | * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible |
335 | | * replacements for accented characters. |
336 | | |
337 | | * @param pszStr NUL-terminated UTF-8 string. |
338 | | * @param chReplacementChar character which will be used when the input stream |
339 | | * contains a non ASCII character that cannot be |
340 | | * substituted with an equivalent ASCII character. |
341 | | * Must be valid ASCII! |
342 | | * |
343 | | * @return a new string that must be freed with CPLFree(). |
344 | | * |
345 | | * @since GDAL 3.9 |
346 | | */ |
347 | | char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar) |
348 | 0 | { |
349 | 0 | static const struct |
350 | 0 | { |
351 | 0 | short nCodePoint; |
352 | 0 | char chFirst; |
353 | 0 | char chSecond; |
354 | 0 | } aLatinCharacters[] = { |
355 | | // https://en.wikipedia.org/wiki/Latin-1_Supplement |
356 | 0 | {0xC0, 'A', 0}, // Latin Capital Letter A with grave |
357 | 0 | {0xC1, 'A', 0}, // Latin Capital letter A with acute |
358 | 0 | {0xC2, 'A', 0}, // Latin Capital letter A with circumflex |
359 | 0 | {0xC3, 'A', 0}, // Latin Capital letter A with tilde |
360 | 0 | {0xC4, 'A', 0}, // Latin Capital letter A with diaeresis |
361 | 0 | {0xC5, 'A', 0}, // Latin Capital letter A with ring above |
362 | 0 | {0xC6, 'A', 'E'}, // Latin Capital letter AE |
363 | 0 | {0xC7, 'C', 0}, // Latin Capital letter C with cedilla |
364 | 0 | {0xC8, 'E', 0}, // Latin Capital letter E with grave |
365 | 0 | {0xC9, 'E', 0}, // Latin Capital letter E with acute |
366 | 0 | {0xCA, 'E', 0}, // Latin Capital letter E with circumflex |
367 | 0 | {0xCB, 'E', 0}, // Latin Capital letter E with diaeresis |
368 | 0 | {0xCC, 'I', 0}, // Latin Capital letter I with grave |
369 | 0 | {0xCD, 'I', 0}, // Latin Capital letter I with acute |
370 | 0 | {0xCE, 'I', 0}, // Latin Capital letter I with circumflex |
371 | 0 | {0xCF, 'I', 0}, // Latin Capital letter I with diaeresis |
372 | | // { 0xD0, '?', 0 }, // Latin Capital letter Eth |
373 | 0 | {0xD1, 'N', 0}, // Latin Capital letter N with tilde |
374 | 0 | {0xD2, 'O', 0}, // Latin Capital letter O with grave |
375 | 0 | {0xD3, 'O', 0}, // Latin Capital letter O with acute |
376 | 0 | {0xD4, 'O', 0}, // Latin Capital letter O with circumflex |
377 | 0 | {0xD5, 'O', 0}, // Latin Capital letter O with tilde |
378 | 0 | {0xD6, 'O', 0}, // Latin Capital letter O with diaeresis |
379 | 0 | {0xD8, 'O', 0}, // Latin Capital letter O with stroke |
380 | 0 | {0xD9, 'U', 0}, // Latin Capital letter U with grave |
381 | 0 | {0xDA, 'U', 0}, // Latin Capital letter U with acute |
382 | 0 | {0xDB, 'U', 0}, // Latin Capital Letter U with circumflex |
383 | 0 | {0xDC, 'U', 0}, // Latin Capital Letter U with diaeresis |
384 | 0 | {0xDD, 'Y', 0}, // Latin Capital Letter Y with acute |
385 | | // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn |
386 | 0 | {0xDF, 'S', 'S'}, // Latin Small Letter sharp S |
387 | 0 | {0xE0, 'a', 0}, // Latin Small Letter A with grave |
388 | 0 | {0xE1, 'a', 0}, // Latin Small Letter A with acute |
389 | 0 | {0xE2, 'a', 0}, // Latin Small Letter A with circumflex |
390 | 0 | {0xE3, 'a', 0}, // Latin Small Letter A with tilde |
391 | 0 | {0xE4, 'a', 0}, // Latin Small Letter A with diaeresis |
392 | 0 | {0xE5, 'a', 0}, // Latin Small Letter A with ring above |
393 | 0 | {0xE6, 'a', 'e'}, // Latin Small Letter AE |
394 | 0 | {0xE7, 'c', 0}, // Latin Small Letter C with cedilla |
395 | 0 | {0xE8, 'e', 0}, // Latin Small Letter E with grave |
396 | 0 | {0xE9, 'e', 0}, // Latin Small Letter E with acute |
397 | 0 | {0xEA, 'e', 0}, // Latin Small Letter E with circumflex |
398 | 0 | {0xEB, 'e', 0}, // Latin Small Letter E with diaeresis |
399 | 0 | {0xEC, 'i', 0}, // Latin Small Letter I with grave |
400 | 0 | {0xED, 'i', 0}, // Latin Small Letter I with acute |
401 | 0 | {0xEE, 'i', 0}, // Latin Small Letter I with circumflex |
402 | 0 | {0xEF, 'i', 0}, // Latin Small Letter I with diaeresis |
403 | | // { 0xF0, '?', 0 }, // Latin Small Letter Eth |
404 | 0 | {0xF1, 'n', 0}, // Latin Small Letter N with tilde |
405 | 0 | {0xF2, 'o', 0}, // Latin Small Letter O with grave |
406 | 0 | {0xF3, 'o', 0}, // Latin Small Letter O with acute |
407 | 0 | {0xF4, 'o', 0}, // Latin Small Letter O with circumflex |
408 | 0 | {0xF5, 'o', 0}, // Latin Small Letter O with tilde |
409 | 0 | {0xF6, 'o', 0}, // Latin Small Letter O with diaeresis |
410 | 0 | {0xF8, 'o', 0}, // Latin Small Letter O with stroke |
411 | 0 | {0xF9, 'u', 0}, // Latin Small Letter U with grave |
412 | 0 | {0xFA, 'u', 0}, // Latin Small Letter U with acute |
413 | 0 | {0xFB, 'u', 0}, // Latin Small Letter U with circumflex |
414 | 0 | {0xFC, 'u', 0}, // Latin Small Letter U with diaeresis |
415 | 0 | {0xFD, 'y', 0}, // Latin Small Letter Y with acute |
416 | | // { 0xFE, '?', 0 }, // Latin Small Letter Thorn |
417 | 0 | {0xFF, 'u', 0}, // Latin Small Letter Y with diaeresis |
418 | | |
419 | | // https://en.wikipedia.org/wiki/Latin_Extended-A |
420 | 0 | { |
421 | 0 | 0x0100, |
422 | 0 | 'A', |
423 | 0 | 0, |
424 | 0 | }, // Latin Capital letter A with macron |
425 | 0 | { |
426 | 0 | 0x0101, |
427 | 0 | 'a', |
428 | 0 | 0, |
429 | 0 | }, // Latin Small letter A with macron |
430 | 0 | { |
431 | 0 | 0x0102, |
432 | 0 | 'A', |
433 | 0 | 0, |
434 | 0 | }, // Latin Capital letter A with breve |
435 | 0 | { |
436 | 0 | 0x0103, |
437 | 0 | 'a', |
438 | 0 | 0, |
439 | 0 | }, // Latin Small letter A with breve |
440 | 0 | { |
441 | 0 | 0x0104, |
442 | 0 | 'A', |
443 | 0 | 0, |
444 | 0 | }, // Latin Capital letter A with ogonek |
445 | 0 | { |
446 | 0 | 0x0105, |
447 | 0 | 'a', |
448 | 0 | 0, |
449 | 0 | }, // Latin Small letter A with ogonek |
450 | 0 | { |
451 | 0 | 0x0106, |
452 | 0 | 'C', |
453 | 0 | 0, |
454 | 0 | }, // Latin Capital letter C with acute |
455 | 0 | { |
456 | 0 | 0x0107, |
457 | 0 | 'c', |
458 | 0 | 0, |
459 | 0 | }, // Latin Small letter C with acute |
460 | 0 | { |
461 | 0 | 0x0108, |
462 | 0 | 'C', |
463 | 0 | 0, |
464 | 0 | }, // Latin Capital letter C with circumflex |
465 | 0 | { |
466 | 0 | 0x0109, |
467 | 0 | 'c', |
468 | 0 | 0, |
469 | 0 | }, // Latin Small letter C with circumflex |
470 | 0 | { |
471 | 0 | 0x010A, |
472 | 0 | 'C', |
473 | 0 | 0, |
474 | 0 | }, // Latin Capital letter C with dot above |
475 | 0 | { |
476 | 0 | 0x010B, |
477 | 0 | 'c', |
478 | 0 | 0, |
479 | 0 | }, // Latin Small letter C with dot above |
480 | 0 | { |
481 | 0 | 0x010C, |
482 | 0 | 'C', |
483 | 0 | 0, |
484 | 0 | }, // Latin Capital letter C with caron |
485 | 0 | { |
486 | 0 | 0x010D, |
487 | 0 | 'c', |
488 | 0 | 0, |
489 | 0 | }, // Latin Small letter C with caron |
490 | 0 | { |
491 | 0 | 0x010E, |
492 | 0 | 'D', |
493 | 0 | 0, |
494 | 0 | }, // Latin Capital letter D with caron |
495 | 0 | { |
496 | 0 | 0x010F, |
497 | 0 | 'd', |
498 | 0 | 0, |
499 | 0 | }, // Latin Small letter D with caron |
500 | 0 | { |
501 | 0 | 0x0110, |
502 | 0 | 'D', |
503 | 0 | 0, |
504 | 0 | }, // Latin Capital letter D with stroke |
505 | 0 | { |
506 | 0 | 0x0111, |
507 | 0 | 'd', |
508 | 0 | 0, |
509 | 0 | }, // Latin Small letter D with stroke |
510 | 0 | { |
511 | 0 | 0x0112, |
512 | 0 | 'E', |
513 | 0 | 0, |
514 | 0 | }, // Latin Capital letter E with macron |
515 | 0 | { |
516 | 0 | 0x0113, |
517 | 0 | 'e', |
518 | 0 | 0, |
519 | 0 | }, // Latin Small letter E with macron |
520 | 0 | { |
521 | 0 | 0x0114, |
522 | 0 | 'E', |
523 | 0 | 0, |
524 | 0 | }, // Latin Capital letter E with breve |
525 | 0 | { |
526 | 0 | 0x0115, |
527 | 0 | 'e', |
528 | 0 | 0, |
529 | 0 | }, // Latin Small letter E with breve |
530 | 0 | { |
531 | 0 | 0x0116, |
532 | 0 | 'E', |
533 | 0 | 0, |
534 | 0 | }, // Latin Capital letter E with dot above |
535 | 0 | { |
536 | 0 | 0x0117, |
537 | 0 | 'e', |
538 | 0 | 0, |
539 | 0 | }, // Latin Small letter E with dot above |
540 | 0 | { |
541 | 0 | 0x0118, |
542 | 0 | 'E', |
543 | 0 | 0, |
544 | 0 | }, // Latin Capital letter E with ogonek |
545 | 0 | { |
546 | 0 | 0x0119, |
547 | 0 | 'e', |
548 | 0 | 0, |
549 | 0 | }, // Latin Small letter E with ogonek |
550 | 0 | { |
551 | 0 | 0x011A, |
552 | 0 | 'E', |
553 | 0 | 0, |
554 | 0 | }, // Latin Capital letter E with caron |
555 | 0 | { |
556 | 0 | 0x011B, |
557 | 0 | 'e', |
558 | 0 | 0, |
559 | 0 | }, // Latin Small letter E with caron |
560 | 0 | { |
561 | 0 | 0x011C, |
562 | 0 | 'G', |
563 | 0 | 0, |
564 | 0 | }, // Latin Capital letter G with circumflex |
565 | 0 | { |
566 | 0 | 0x011D, |
567 | 0 | 'g', |
568 | 0 | 0, |
569 | 0 | }, // Latin Small letter G with circumflex |
570 | 0 | { |
571 | 0 | 0x011E, |
572 | 0 | 'G', |
573 | 0 | 0, |
574 | 0 | }, // Latin Capital letter G with breve |
575 | 0 | { |
576 | 0 | 0x011F, |
577 | 0 | 'g', |
578 | 0 | 0, |
579 | 0 | }, // Latin Small letter G with breve |
580 | 0 | { |
581 | 0 | 0x0120, |
582 | 0 | 'G', |
583 | 0 | 0, |
584 | 0 | }, // Latin Capital letter G with dot above |
585 | 0 | { |
586 | 0 | 0x0121, |
587 | 0 | 'g', |
588 | 0 | 0, |
589 | 0 | }, // Latin Small letter G with dot above |
590 | 0 | { |
591 | 0 | 0x0122, |
592 | 0 | 'G', |
593 | 0 | 0, |
594 | 0 | }, // Latin Capital letter G with cedilla |
595 | 0 | { |
596 | 0 | 0x0123, |
597 | 0 | 'g', |
598 | 0 | 0, |
599 | 0 | }, // Latin Small letter G with cedilla |
600 | 0 | { |
601 | 0 | 0x0124, |
602 | 0 | 'H', |
603 | 0 | 0, |
604 | 0 | }, // Latin Capital letter H with circumflex |
605 | 0 | { |
606 | 0 | 0x0125, |
607 | 0 | 'h', |
608 | 0 | 0, |
609 | 0 | }, // Latin Small letter H with circumflex |
610 | 0 | { |
611 | 0 | 0x0126, |
612 | 0 | 'H', |
613 | 0 | 0, |
614 | 0 | }, // Latin Capital letter H with stroke |
615 | 0 | { |
616 | 0 | 0x0127, |
617 | 0 | 'h', |
618 | 0 | 0, |
619 | 0 | }, // Latin Small letter H with stroke |
620 | 0 | { |
621 | 0 | 0x0128, |
622 | 0 | 'I', |
623 | 0 | 0, |
624 | 0 | }, // Latin Capital letter I with tilde |
625 | 0 | { |
626 | 0 | 0x0129, |
627 | 0 | 'i', |
628 | 0 | 0, |
629 | 0 | }, // Latin Small letter I with tilde |
630 | 0 | { |
631 | 0 | 0x012A, |
632 | 0 | 'I', |
633 | 0 | 0, |
634 | 0 | }, // Latin Capital letter I with macron |
635 | 0 | { |
636 | 0 | 0x012B, |
637 | 0 | 'i', |
638 | 0 | 0, |
639 | 0 | }, // Latin Small letter I with macron |
640 | 0 | { |
641 | 0 | 0x012C, |
642 | 0 | 'I', |
643 | 0 | 0, |
644 | 0 | }, // Latin Capital letter I with breve |
645 | 0 | { |
646 | 0 | 0x012D, |
647 | 0 | 'i', |
648 | 0 | 0, |
649 | 0 | }, // Latin Small letter I with breve |
650 | 0 | { |
651 | 0 | 0x012E, |
652 | 0 | 'I', |
653 | 0 | 0, |
654 | 0 | }, // Latin Capital letter I with ogonek |
655 | 0 | { |
656 | 0 | 0x012F, |
657 | 0 | 'i', |
658 | 0 | 0, |
659 | 0 | }, // Latin Small letter I with ogonek |
660 | 0 | { |
661 | 0 | 0x0130, |
662 | 0 | 'I', |
663 | 0 | 0, |
664 | 0 | }, // Latin Capital letter I with dot above |
665 | 0 | { |
666 | 0 | 0x0131, |
667 | 0 | 'i', |
668 | 0 | 0, |
669 | 0 | }, // Latin Small letter dotless I |
670 | 0 | { |
671 | 0 | 0x0132, |
672 | 0 | 'I', |
673 | 0 | 'J', |
674 | 0 | }, // Latin Capital Ligature IJ |
675 | 0 | { |
676 | 0 | 0x0133, |
677 | 0 | 'i', |
678 | 0 | 'j', |
679 | 0 | }, // Latin Small Ligature IJ |
680 | 0 | { |
681 | 0 | 0x0134, |
682 | 0 | 'J', |
683 | 0 | 0, |
684 | 0 | }, // Latin Capital letter J with circumflex |
685 | 0 | { |
686 | 0 | 0x0135, |
687 | 0 | 'j', |
688 | 0 | 0, |
689 | 0 | }, // Latin Small letter J with circumflex |
690 | 0 | { |
691 | 0 | 0x0136, |
692 | 0 | 'K', |
693 | 0 | 0, |
694 | 0 | }, // Latin Capital letter K with cedilla |
695 | 0 | { |
696 | 0 | 0x0137, |
697 | 0 | 'k', |
698 | 0 | 0, |
699 | 0 | }, // Latin Small letter K with cedilla |
700 | 0 | { |
701 | 0 | 0x0138, |
702 | 0 | 'k', |
703 | 0 | 0, |
704 | 0 | }, // Latin Small letter Kra |
705 | 0 | { |
706 | 0 | 0x0139, |
707 | 0 | 'L', |
708 | 0 | 0, |
709 | 0 | }, // Latin Capital letter L with acute |
710 | 0 | { |
711 | 0 | 0x013A, |
712 | 0 | 'l', |
713 | 0 | 0, |
714 | 0 | }, // Latin Small letter L with acute |
715 | 0 | { |
716 | 0 | 0x013B, |
717 | 0 | 'L', |
718 | 0 | 0, |
719 | 0 | }, // Latin Capital letter L with cedilla |
720 | 0 | { |
721 | 0 | 0x013C, |
722 | 0 | 'l', |
723 | 0 | 0, |
724 | 0 | }, // Latin Small letter L with cedilla |
725 | 0 | { |
726 | 0 | 0x013D, |
727 | 0 | 'L', |
728 | 0 | 0, |
729 | 0 | }, // Latin Capital letter L with caron |
730 | 0 | { |
731 | 0 | 0x013E, |
732 | 0 | 'l', |
733 | 0 | 0, |
734 | 0 | }, // Latin Small letter L with caron |
735 | 0 | { |
736 | 0 | 0x013F, |
737 | 0 | 'L', |
738 | 0 | 0, |
739 | 0 | }, // Latin Capital letter L with middle dot |
740 | 0 | { |
741 | 0 | 0x0140, |
742 | 0 | 'l', |
743 | 0 | 0, |
744 | 0 | }, // Latin Small letter L with middle dot |
745 | 0 | { |
746 | 0 | 0x0141, |
747 | 0 | 'L', |
748 | 0 | 0, |
749 | 0 | }, // Latin Capital letter L with stroke |
750 | 0 | { |
751 | 0 | 0x0142, |
752 | 0 | 'l', |
753 | 0 | 0, |
754 | 0 | }, // Latin Small letter L with stroke |
755 | 0 | { |
756 | 0 | 0x0143, |
757 | 0 | 'N', |
758 | 0 | 0, |
759 | 0 | }, // Latin Capital letter N with acute |
760 | 0 | { |
761 | 0 | 0x0144, |
762 | 0 | 'n', |
763 | 0 | 0, |
764 | 0 | }, // Latin Small letter N with acute |
765 | 0 | { |
766 | 0 | 0x0145, |
767 | 0 | 'N', |
768 | 0 | 0, |
769 | 0 | }, // Latin Capital letter N with cedilla |
770 | 0 | { |
771 | 0 | 0x0146, |
772 | 0 | 'n', |
773 | 0 | 0, |
774 | 0 | }, // Latin Small letter N with cedilla |
775 | 0 | { |
776 | 0 | 0x0147, |
777 | 0 | 'N', |
778 | 0 | 0, |
779 | 0 | }, // Latin Capital letter N with caron |
780 | 0 | { |
781 | 0 | 0x0148, |
782 | 0 | 'n', |
783 | 0 | 0, |
784 | 0 | }, // Latin Small letter N with caron |
785 | | // { 0x014A , '?' , 0, }, // Latin Capital letter Eng |
786 | | // { 0x014B , '?' , 0, }, // Latin Small letter Eng |
787 | 0 | { |
788 | 0 | 0x014C, |
789 | 0 | 'O', |
790 | 0 | 0, |
791 | 0 | }, // Latin Capital letter O with macron |
792 | 0 | { |
793 | 0 | 0x014D, |
794 | 0 | 'o', |
795 | 0 | 0, |
796 | 0 | }, // Latin Small letter O with macron |
797 | 0 | { |
798 | 0 | 0x014E, |
799 | 0 | 'O', |
800 | 0 | 0, |
801 | 0 | }, // Latin Capital letter O with breve |
802 | 0 | { |
803 | 0 | 0x014F, |
804 | 0 | 'o', |
805 | 0 | 0, |
806 | 0 | }, // Latin Small letter O with breve |
807 | 0 | { |
808 | 0 | 0x0150, |
809 | 0 | 'O', |
810 | 0 | 0, |
811 | 0 | }, // Latin Capital Letter O with double acute |
812 | 0 | { |
813 | 0 | 0x0151, |
814 | 0 | 'o', |
815 | 0 | 0, |
816 | 0 | }, // Latin Small Letter O with double acute |
817 | 0 | { |
818 | 0 | 0x0152, |
819 | 0 | 'O', |
820 | 0 | 'E', |
821 | 0 | }, // Latin Capital Ligature OE |
822 | 0 | { |
823 | 0 | 0x0153, |
824 | 0 | 'o', |
825 | 0 | 'e', |
826 | 0 | }, // Latin Small Ligature OE |
827 | 0 | { |
828 | 0 | 0x0154, |
829 | 0 | 'R', |
830 | 0 | 0, |
831 | 0 | }, // Latin Capital letter R with acute |
832 | 0 | { |
833 | 0 | 0x0155, |
834 | 0 | 'r', |
835 | 0 | 0, |
836 | 0 | }, // Latin Small letter R with acute |
837 | 0 | { |
838 | 0 | 0x0156, |
839 | 0 | 'R', |
840 | 0 | 0, |
841 | 0 | }, // Latin Capital letter R with cedilla |
842 | 0 | { |
843 | 0 | 0x0157, |
844 | 0 | 'r', |
845 | 0 | 0, |
846 | 0 | }, // Latin Small letter R with cedilla |
847 | 0 | { |
848 | 0 | 0x0158, |
849 | 0 | 'R', |
850 | 0 | 0, |
851 | 0 | }, // Latin Capital letter R with caron |
852 | 0 | { |
853 | 0 | 0x0159, |
854 | 0 | 'r', |
855 | 0 | 0, |
856 | 0 | }, // Latin Small letter R with caron |
857 | 0 | { |
858 | 0 | 0x015A, |
859 | 0 | 'S', |
860 | 0 | 0, |
861 | 0 | }, // Latin Capital letter S with acute |
862 | 0 | { |
863 | 0 | 0x015B, |
864 | 0 | 's', |
865 | 0 | 0, |
866 | 0 | }, // Latin Small letter S with acute |
867 | 0 | { |
868 | 0 | 0x015C, |
869 | 0 | 'S', |
870 | 0 | 0, |
871 | 0 | }, // Latin Capital letter S with circumflex |
872 | 0 | { |
873 | 0 | 0x015D, |
874 | 0 | 's', |
875 | 0 | 0, |
876 | 0 | }, // Latin Small letter S with circumflex |
877 | 0 | { |
878 | 0 | 0x015E, |
879 | 0 | 'S', |
880 | 0 | 0, |
881 | 0 | }, // Latin Capital letter S with cedilla |
882 | 0 | { |
883 | 0 | 0x015F, |
884 | 0 | 's', |
885 | 0 | 0, |
886 | 0 | }, // Latin Small letter S with cedilla |
887 | 0 | { |
888 | 0 | 0x0160, |
889 | 0 | 'S', |
890 | 0 | 0, |
891 | 0 | }, // Latin Capital letter S with caron |
892 | 0 | { |
893 | 0 | 0x0161, |
894 | 0 | 's', |
895 | 0 | 0, |
896 | 0 | }, // Latin Small letter S with caron |
897 | 0 | { |
898 | 0 | 0x0162, |
899 | 0 | 'T', |
900 | 0 | 0, |
901 | 0 | }, // Latin Capital letter T with cedilla |
902 | 0 | { |
903 | 0 | 0x0163, |
904 | 0 | 't', |
905 | 0 | 0, |
906 | 0 | }, // Latin Small letter T with cedilla |
907 | 0 | { |
908 | 0 | 0x0164, |
909 | 0 | 'T', |
910 | 0 | 0, |
911 | 0 | }, // Latin Capital letter T with caron |
912 | 0 | { |
913 | 0 | 0x0165, |
914 | 0 | 't', |
915 | 0 | 0, |
916 | 0 | }, // Latin Small letter T with caron |
917 | 0 | { |
918 | 0 | 0x0166, |
919 | 0 | 'T', |
920 | 0 | 0, |
921 | 0 | }, // Latin Capital letter T with stroke |
922 | 0 | { |
923 | 0 | 0x0167, |
924 | 0 | 't', |
925 | 0 | 0, |
926 | 0 | }, // Latin Small letter T with stroke |
927 | 0 | { |
928 | 0 | 0x0168, |
929 | 0 | 'U', |
930 | 0 | 0, |
931 | 0 | }, // Latin Capital letter U with tilde |
932 | 0 | { |
933 | 0 | 0x0169, |
934 | 0 | 'u', |
935 | 0 | 0, |
936 | 0 | }, // Latin Small letter U with tilde |
937 | 0 | { |
938 | 0 | 0x016A, |
939 | 0 | 'U', |
940 | 0 | 0, |
941 | 0 | }, // Latin Capital letter U with macron |
942 | 0 | { |
943 | 0 | 0x016B, |
944 | 0 | 'u', |
945 | 0 | 0, |
946 | 0 | }, // Latin Small letter U with macron |
947 | 0 | { |
948 | 0 | 0x016C, |
949 | 0 | 'U', |
950 | 0 | 0, |
951 | 0 | }, // Latin Capital letter U with breve |
952 | 0 | { |
953 | 0 | 0x016D, |
954 | 0 | 'u', |
955 | 0 | 0, |
956 | 0 | }, // Latin Small letter U with breve |
957 | 0 | { |
958 | 0 | 0x016E, |
959 | 0 | 'U', |
960 | 0 | 0, |
961 | 0 | }, // Latin Capital letter U with ring above |
962 | 0 | { |
963 | 0 | 0x016F, |
964 | 0 | 'u', |
965 | 0 | 0, |
966 | 0 | }, // Latin Small letter U with ring above |
967 | 0 | { |
968 | 0 | 0x0170, |
969 | 0 | 'U', |
970 | 0 | 0, |
971 | 0 | }, // Latin Capital Letter U with double acute |
972 | 0 | { |
973 | 0 | 0x0171, |
974 | 0 | 'u', |
975 | 0 | 0, |
976 | 0 | }, // Latin Small Letter U with double acute |
977 | 0 | { |
978 | 0 | 0x0172, |
979 | 0 | 'U', |
980 | 0 | 0, |
981 | 0 | }, // Latin Capital letter U with ogonek |
982 | 0 | { |
983 | 0 | 0x0173, |
984 | 0 | 'u', |
985 | 0 | 0, |
986 | 0 | }, // Latin Small letter U with ogonek |
987 | 0 | { |
988 | 0 | 0x0174, |
989 | 0 | 'W', |
990 | 0 | 0, |
991 | 0 | }, // Latin Capital letter W with circumflex |
992 | 0 | { |
993 | 0 | 0x0175, |
994 | 0 | 'w', |
995 | 0 | 0, |
996 | 0 | }, // Latin Small letter W with circumflex |
997 | 0 | { |
998 | 0 | 0x0176, |
999 | 0 | 'Y', |
1000 | 0 | 0, |
1001 | 0 | }, // Latin Capital letter Y with circumflex |
1002 | 0 | { |
1003 | 0 | 0x0177, |
1004 | 0 | 'y', |
1005 | 0 | 0, |
1006 | 0 | }, // Latin Small letter Y with circumflex |
1007 | 0 | { |
1008 | 0 | 0x0178, |
1009 | 0 | 'Y', |
1010 | 0 | 0, |
1011 | 0 | }, // Latin Capital letter Y with diaeresis |
1012 | 0 | { |
1013 | 0 | 0x0179, |
1014 | 0 | 'Z', |
1015 | 0 | 0, |
1016 | 0 | }, // Latin Capital letter Z with acute |
1017 | 0 | { |
1018 | 0 | 0x017A, |
1019 | 0 | 'z', |
1020 | 0 | 0, |
1021 | 0 | }, // Latin Small letter Z with acute |
1022 | 0 | { |
1023 | 0 | 0x017B, |
1024 | 0 | 'Z', |
1025 | 0 | 0, |
1026 | 0 | }, // Latin Capital letter Z with dot above |
1027 | 0 | { |
1028 | 0 | 0x017C, |
1029 | 0 | 'z', |
1030 | 0 | 0, |
1031 | 0 | }, // Latin Small letter Z with dot above |
1032 | 0 | { |
1033 | 0 | 0x017D, |
1034 | 0 | 'Z', |
1035 | 0 | 0, |
1036 | 0 | }, // Latin Capital letter Z with caron |
1037 | 0 | { |
1038 | 0 | 0x017E, |
1039 | 0 | 'z', |
1040 | 0 | 0, |
1041 | 0 | }, // Latin Small letter Z with caron |
1042 | 0 | }; |
1043 | |
|
1044 | 0 | const size_t nLen = strlen(pszStr); |
1045 | 0 | char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1)); |
1046 | 0 | const char *pszPtr = pszStr; |
1047 | 0 | const char *pszEnd = pszStr + nLen; |
1048 | 0 | size_t i = 0; |
1049 | 0 | while (pszPtr != pszEnd) |
1050 | 0 | { |
1051 | 0 | if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127) |
1052 | 0 | { |
1053 | 0 | utf8_int32_t codepoint; |
1054 | 0 | if (pszPtr + utf8codepointcalcsize( |
1055 | 0 | reinterpret_cast<const utf8_int8_t *>(pszPtr)) > |
1056 | 0 | pszEnd) |
1057 | 0 | break; |
1058 | 0 | auto pszNext = reinterpret_cast<const char *>(utf8codepoint( |
1059 | 0 | reinterpret_cast<const utf8_int8_t *>(pszPtr), &codepoint)); |
1060 | 0 | char ch = chReplacementChar; |
1061 | 0 | for (const auto &latin1char : aLatinCharacters) |
1062 | 0 | { |
1063 | 0 | if (codepoint == latin1char.nCodePoint) |
1064 | 0 | { |
1065 | 0 | pszOutputString[i] = latin1char.chFirst; |
1066 | 0 | ++i; |
1067 | 0 | if (latin1char.chSecond) |
1068 | 0 | { |
1069 | 0 | pszOutputString[i] = latin1char.chSecond; |
1070 | 0 | ++i; |
1071 | 0 | } |
1072 | 0 | ch = 0; |
1073 | 0 | break; |
1074 | 0 | } |
1075 | 0 | } |
1076 | 0 | if (ch) |
1077 | 0 | { |
1078 | 0 | pszOutputString[i] = ch; |
1079 | 0 | ++i; |
1080 | 0 | } |
1081 | 0 | pszPtr = pszNext; |
1082 | 0 | } |
1083 | 0 | else |
1084 | 0 | { |
1085 | 0 | pszOutputString[i] = *pszPtr; |
1086 | 0 | ++pszPtr; |
1087 | 0 | ++i; |
1088 | 0 | } |
1089 | 0 | } |
1090 | 0 | pszOutputString[i] = '\0'; |
1091 | 0 | return pszOutputString; |
1092 | 0 | } |
1093 | | |
1094 | | /************************************************************************/ |
1095 | | /* CPLEncodingCharSize() */ |
1096 | | /************************************************************************/ |
1097 | | |
1098 | | /** |
1099 | | * Return bytes per character for encoding. |
1100 | | * |
1101 | | * This function returns the size in bytes of the smallest character |
1102 | | * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this |
1103 | | * is straight forward. For encodings like UTF8 and UTF16 which represent |
1104 | | * some characters as a sequence of atomic character sizes the function |
1105 | | * still returns the atomic character size (1 for UTF8, 2 for UTF16). |
1106 | | * |
1107 | | * This function will return the correct value for well known encodings |
1108 | | * with corresponding CPL_ENC_ values. It may not return the correct value |
1109 | | * for other encodings even if they are supported by the underlying iconv |
1110 | | * or windows transliteration services. Hopefully it will improve over time. |
1111 | | * |
1112 | | * @param pszEncoding the name of the encoding. |
1113 | | * |
1114 | | * @return the size of a minimal character in bytes or -1 if the size is |
1115 | | * unknown. |
1116 | | */ |
1117 | | |
1118 | | int CPLEncodingCharSize(const char *pszEncoding) |
1119 | | |
1120 | 0 | { |
1121 | 0 | if (EQUAL(pszEncoding, CPL_ENC_UTF8)) |
1122 | 0 | return 1; |
1123 | 0 | else if (EQUAL(pszEncoding, CPL_ENC_UTF16) || |
1124 | 0 | EQUAL(pszEncoding, "UTF-16LE")) |
1125 | 0 | return 2; |
1126 | 0 | else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE")) |
1127 | 0 | return 2; |
1128 | 0 | else if (EQUAL(pszEncoding, CPL_ENC_UCS4)) |
1129 | 0 | return 4; |
1130 | 0 | else if (EQUAL(pszEncoding, CPL_ENC_ASCII)) |
1131 | 0 | return 1; |
1132 | 0 | else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-")) |
1133 | 0 | return 1; |
1134 | | |
1135 | 0 | return -1; |
1136 | 0 | } |
1137 | | |
1138 | | /************************************************************************/ |
1139 | | /* CPLClearRecodeWarningFlags() */ |
1140 | | /************************************************************************/ |
1141 | | |
1142 | | void CPLClearRecodeWarningFlags() |
1143 | 0 | { |
1144 | 0 | #ifdef CPL_RECODE_ICONV |
1145 | 0 | CPLClearRecodeIconvWarningFlags(); |
1146 | 0 | #endif |
1147 | 0 | CPLClearRecodeStubWarningFlags(); |
1148 | 0 | } |
1149 | | |
1150 | | /************************************************************************/ |
1151 | | /* CPLStrlenUTF8() */ |
1152 | | /************************************************************************/ |
1153 | | |
1154 | | /** |
1155 | | * Return the number of UTF-8 characters of a nul-terminated string. |
1156 | | * |
1157 | | * This is different from strlen() which returns the number of bytes. |
1158 | | * |
1159 | | * @param pszUTF8Str a nul-terminated UTF-8 string |
1160 | | * |
1161 | | * @return the number of UTF-8 characters. |
1162 | | */ |
1163 | | |
1164 | | int CPLStrlenUTF8(const char *pszUTF8Str) |
1165 | 0 | { |
1166 | 0 | int nCharacterCount = 0; |
1167 | 0 | for (size_t i = 0; pszUTF8Str[i] != '\0'; ++i) |
1168 | 0 | { |
1169 | 0 | if ((pszUTF8Str[i] & 0xc0) != 0x80) |
1170 | 0 | { |
1171 | 0 | if (nCharacterCount == INT_MAX) |
1172 | 0 | { |
1173 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
1174 | 0 | "CPLStrlenUTF8(): nCharacterCount > INT_MAX. Use " |
1175 | 0 | "CPLStrlenUTF8Ex() instead"); |
1176 | 0 | break; |
1177 | 0 | } |
1178 | 0 | ++nCharacterCount; |
1179 | 0 | } |
1180 | 0 | } |
1181 | 0 | return nCharacterCount; |
1182 | 0 | } |
1183 | | |
1184 | | /************************************************************************/ |
1185 | | /* CPLStrlenUTF8Ex() */ |
1186 | | /************************************************************************/ |
1187 | | |
/**
 * Count the UTF-8 characters of a nul-terminated string.
 *
 * Unlike strlen(), which counts bytes, this counts every byte that is not
 * a UTF-8 continuation byte (bit pattern 10xxxxxx), i.e. one per encoded
 * character. The count is returned as a size_t.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */

size_t CPLStrlenUTF8Ex(const char *pszUTF8Str)
{
    size_t nCount = 0;
    for (const char *pszIter = pszUTF8Str; *pszIter != '\0'; ++pszIter)
    {
        // Add one for each byte that starts a character; continuation
        // bytes (10xxxxxx) contribute nothing.
        nCount += ((*pszIter & 0xc0) != 0x80) ? 1 : 0;
    }
    return nCount;
}
1210 | | |
1211 | | /************************************************************************/ |
1212 | | /* CPLCanRecode() */ |
1213 | | /************************************************************************/ |
1214 | | |
1215 | | /** |
1216 | | * Checks if it is possible to recode a string from one encoding to another. |
1217 | | * |
1218 | | * @param pszTestStr a NULL terminated string. |
1219 | | * @param pszSrcEncoding the source encoding. |
1220 | | * @param pszDstEncoding the destination encoding. |
1221 | | * |
1222 | | * @return a TRUE if recode is possible. |
1223 | | * |
1224 | | * @since GDAL 3.1.0 |
1225 | | */ |
1226 | | int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding, |
1227 | | const char *pszDstEncoding) |
1228 | 0 | { |
1229 | 0 | CPLClearRecodeWarningFlags(); |
1230 | 0 | CPLErrorReset(); |
1231 | |
|
1232 | 0 | CPLPushErrorHandler(CPLQuietErrorHandler); |
1233 | 0 | char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding)); |
1234 | 0 | CPLPopErrorHandler(); |
1235 | |
|
1236 | 0 | if (pszRec == nullptr) |
1237 | 0 | { |
1238 | 0 | return FALSE; |
1239 | 0 | } |
1240 | | |
1241 | 0 | CPLFree(pszRec); |
1242 | |
|
1243 | 0 | if (CPLGetLastErrorType() != 0) |
1244 | 0 | { |
1245 | 0 | return FALSE; |
1246 | 0 | } |
1247 | | |
1248 | 0 | return TRUE; |
1249 | 0 | } |