/src/gdal/port/cpl_recode.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * |
3 | | * Name: cpl_recode.cpp |
4 | | * Project: CPL - Common Portability Library |
5 | | * Purpose: Character set recoding and char/wchar_t conversions. |
6 | | * Author: Andrey Kiselev, dron@ak4719.spb.edu |
7 | | * |
8 | | ********************************************************************** |
9 | | * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu> |
10 | | * Copyright (c) 2008, Frank Warmerdam |
11 | | * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com> |
12 | | * |
13 | | * Permission to use, copy, modify, and distribute this software for any |
14 | | * purpose with or without fee is hereby granted, provided that the above |
15 | | * copyright notice and this permission notice appear in all copies. |
16 | | * |
17 | | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
18 | | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
19 | | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
20 | | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
21 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
22 | | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
23 | | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
24 | | **********************************************************************/ |
25 | | |
26 | | #include "cpl_port.h" |
27 | | #include "cpl_string.h" |
28 | | |
29 | | #include <cstring> |
30 | | |
31 | | #include "cpl_conv.h" |
32 | | #include "cpl_character_sets.h" |
33 | | |
34 | | #include "utf8.h" |
35 | | |
36 | | #ifdef CPL_RECODE_ICONV |
37 | | extern void CPLClearRecodeIconvWarningFlags(); |
38 | | extern char *CPLRecodeIconv(const char *, const char *, |
39 | | const char *) CPL_RETURNS_NONNULL; |
40 | | extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *, |
41 | | const char *); |
42 | | extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *); |
43 | | #endif // CPL_RECODE_ICONV |
44 | | |
45 | | extern void CPLClearRecodeStubWarningFlags(); |
46 | | extern char *CPLRecodeStub(const char *, const char *, |
47 | | const char *) CPL_RETURNS_NONNULL; |
48 | | extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *, |
49 | | const char *); |
50 | | extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *); |
51 | | extern int CPLIsUTF8Stub(const char *, int); |
52 | | |
53 | | /************************************************************************/ |
54 | | /* CPLRecode() */ |
55 | | /************************************************************************/ |
56 | | |
57 | | /** |
58 | | * Convert a string from a source encoding to a destination encoding. |
59 | | * |
60 | | * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII |
61 | | * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported : |
62 | | * <ul> |
63 | | * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in |
64 | | * fact)</li> |
65 | | * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li> |
66 | | * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li> |
67 | | * </ul> |
68 | | * |
69 | | * If an error occurs an error may, or may not be posted with CPLError(). |
70 | | * |
71 | | * @param pszSource a NULL terminated string. |
72 | | * @param pszSrcEncoding the source encoding. |
73 | | * @param pszDstEncoding the destination encoding. |
74 | | * |
75 | | * @return a NULL terminated string which should be freed with CPLFree(). |
76 | | * |
77 | | * @since GDAL 1.6.0 |
78 | | */ |
79 | | |
80 | | char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding, |
81 | | const char *pszDstEncoding) |
82 | | |
83 | 0 | { |
84 | | /* -------------------------------------------------------------------- */ |
85 | | /* Handle a few common short cuts. */ |
86 | | /* -------------------------------------------------------------------- */ |
87 | 0 | if (EQUAL(pszSrcEncoding, pszDstEncoding)) |
88 | 0 | return CPLStrdup(pszSource); |
89 | | |
90 | 0 | if (EQUAL(pszSrcEncoding, CPL_ENC_ASCII) && |
91 | 0 | (EQUAL(pszDstEncoding, CPL_ENC_UTF8) || |
92 | 0 | EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1))) |
93 | 0 | return CPLStrdup(pszSource); |
94 | | |
95 | | // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables |
96 | 0 | if (EQUAL(pszDstEncoding, CPL_ENC_UTF8) && |
97 | 0 | CPLGetConversionTableToUTF8(pszSrcEncoding)) |
98 | 0 | { |
99 | 0 | return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); |
100 | 0 | } |
101 | | |
102 | 0 | #ifdef CPL_RECODE_ICONV |
103 | | /* -------------------------------------------------------------------- */ |
104 | | /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */ |
105 | | /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled */ |
106 | | /* very well by the stub implementation which is faster than the */ |
107 | | /* iconv() route. Use a stub for these two ones and iconv() */ |
108 | | /* everything else. */ |
109 | | /* -------------------------------------------------------------------- */ |
110 | 0 | if ((EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) && |
111 | 0 | EQUAL(pszDstEncoding, CPL_ENC_UTF8)) || |
112 | 0 | (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) && |
113 | 0 | EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1))) |
114 | 0 | { |
115 | 0 | return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); |
116 | 0 | } |
117 | | #ifdef _WIN32 |
118 | | else if (((EQUAL(pszSrcEncoding, "CP_ACP") || |
119 | | EQUAL(pszSrcEncoding, "CP_OEMCP")) && |
120 | | EQUAL(pszDstEncoding, CPL_ENC_UTF8)) || |
121 | | (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) && |
122 | | (EQUAL(pszDstEncoding, "CP_ACP") || |
123 | | EQUAL(pszDstEncoding, "CP_OEMCP")))) |
124 | | { |
125 | | return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); |
126 | | } |
127 | | #endif |
128 | 0 | else |
129 | 0 | { |
130 | 0 | return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding); |
131 | 0 | } |
132 | | #else // CPL_RECODE_STUB |
133 | | return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); |
134 | | #endif // CPL_RECODE_ICONV |
135 | 0 | } |
136 | | |
137 | | /************************************************************************/ |
138 | | /* CPLRecodeFromWChar() */ |
139 | | /************************************************************************/ |
140 | | |
141 | | /** |
142 | | * Convert wchar_t string to UTF-8. |
143 | | * |
144 | | * Convert a wchar_t string into a multibyte utf-8 string. The only |
145 | | * guaranteed supported source encoding is CPL_ENC_UCS2, and the only |
146 | | * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII |
147 | | * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings |
148 | | * may also be supported. |
149 | | * |
150 | | * Note that the wchar_t type varies in size on different systems. On |
151 | | * win32 it is normally 2 bytes, and on UNIX 4 bytes. |
152 | | * |
153 | | * If an error occurs an error may, or may not be posted with CPLError(). |
154 | | * |
155 | | * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t. |
156 | | * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2. |
157 | | * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8. |
158 | | * |
159 | | * @return a zero terminated multi-byte string which should be freed with |
160 | | * CPLFree(), or NULL if an error occurs. |
161 | | * |
162 | | * @since GDAL 1.6.0 |
163 | | */ |
164 | | |
165 | | char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource, |
166 | | const char *pszSrcEncoding, |
167 | | const char *pszDstEncoding) |
168 | | |
169 | 0 | { |
170 | 0 | #ifdef CPL_RECODE_ICONV |
171 | | /* -------------------------------------------------------------------- */ |
172 | | /* Conversions from CPL_ENC_UCS2 */ |
173 | | /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */ |
174 | | /* handled by the stub implementation. */ |
175 | | /* -------------------------------------------------------------------- */ |
176 | 0 | if ((EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || |
177 | 0 | EQUAL(pszSrcEncoding, "WCHAR_T")) && |
178 | 0 | (EQUAL(pszDstEncoding, CPL_ENC_UTF8) || |
179 | 0 | EQUAL(pszDstEncoding, CPL_ENC_ASCII) || |
180 | 0 | EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1))) |
181 | 0 | { |
182 | 0 | return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, |
183 | 0 | pszDstEncoding); |
184 | 0 | } |
185 | | |
186 | 0 | return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding, pszDstEncoding); |
187 | |
|
188 | | #else // CPL_RECODE_STUB |
189 | | return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding); |
190 | | #endif // CPL_RECODE_ICONV |
191 | 0 | } |
192 | | |
193 | | /************************************************************************/ |
194 | | /* CPLRecodeToWChar() */ |
195 | | /************************************************************************/ |
196 | | |
197 | | /** |
198 | | * Convert UTF-8 string to a wchar_t string. |
199 | | * |
200 | | * Convert a 8bit, multi-byte per character input string into a wide |
201 | | * character (wchar_t) string. The only guaranteed supported source encodings |
202 | | * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only |
203 | | * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source |
204 | | * and destination encodings may be supported depending on the underlying |
205 | | * implementation. |
206 | | * |
207 | | * Note that the wchar_t type varies in size on different systems. On |
208 | | * win32 it is normally 2 bytes, and on UNIX 4 bytes. |
209 | | * |
210 | | * If an error occurs an error may, or may not be posted with CPLError(). |
211 | | * |
212 | | * @param pszSource input multi-byte character string. |
213 | | * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8. |
214 | | * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. |
215 | | * |
216 | | * @return the zero terminated wchar_t string (to be freed with CPLFree()) or |
217 | | * NULL on error. |
218 | | * |
219 | | * @since GDAL 1.6.0 |
220 | | */ |
221 | | |
222 | | wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource, |
223 | | const char *pszSrcEncoding, |
224 | | const char *pszDstEncoding) |
225 | | |
226 | 0 | { |
227 | 0 | #ifdef CPL_RECODE_ICONV |
228 | | /* -------------------------------------------------------------------- */ |
229 | | /* Conversions to CPL_ENC_UCS2 */ |
230 | | /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */ |
231 | | /* handled by the stub implementation. */ |
232 | | /* -------------------------------------------------------------------- */ |
233 | 0 | if ((EQUAL(pszDstEncoding, CPL_ENC_UCS2) || |
234 | 0 | EQUAL(pszDstEncoding, "WCHAR_T")) && |
235 | 0 | (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) || |
236 | 0 | EQUAL(pszSrcEncoding, CPL_ENC_ASCII) || |
237 | 0 | EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1))) |
238 | 0 | { |
239 | 0 | return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding); |
240 | 0 | } |
241 | | |
242 | 0 | return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding, pszDstEncoding); |
243 | |
|
244 | | #else // CPL_RECODE_STUB |
245 | | return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding); |
246 | | #endif // CPL_RECODE_ICONV |
247 | 0 | } |
248 | | |
249 | | /************************************************************************/ |
250 | | /* CPLIsASCII() */ |
251 | | /************************************************************************/ |
252 | | |
/**
 * Check whether a byte string contains only ASCII characters.
 *
 * Every byte in the inspected range must have a value of 127 or less for the
 * string to qualify.
 *
 * @param pabyData input string to test
 * @param nLen length of the input string, or -1 (i.e.
 * static_cast<size_t>(-1)) if the function must compute the string length,
 * in which case the input must be null terminated.
 * @return true if the string is encoded as ASCII. false otherwise
 *
 * @since GDAL 3.6.0
 */
bool CPLIsASCII(const char *pabyData, size_t nLen)
{
    size_t nRemaining =
        (nLen == static_cast<size_t>(-1)) ? strlen(pabyData) : nLen;

    for (const char *pszCur = pabyData; nRemaining > 0; ++pszCur, --nRemaining)
    {
        // A set high bit means the byte value is above 127, i.e. not ASCII.
        if ((static_cast<unsigned char>(*pszCur) & 0x80) != 0)
            return false;
    }
    return true;
}
274 | | |
275 | | /************************************************************************/ |
276 | | /* CPLForceToASCII() */ |
277 | | /************************************************************************/ |
278 | | |
279 | | /** |
280 | | * Return a new string that is made only of ASCII characters. If non-ASCII |
281 | | * characters are found in the input string, they will be replaced by the |
282 | | * provided replacement character. |
283 | | * |
284 | | * This function does not make any assumption on the encoding of the input |
285 | | * string (except it must be nul-terminated if nLen equals -1, or have at |
286 | | * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when |
287 | | * the input string is known to be UTF-8 encoded. |
288 | | * |
289 | | * @param pabyData input string to test |
290 | | * @param nLen length of the input string, or -1 if the function must compute |
291 | | * the string length. In which case it must be null terminated. |
292 | | |
293 | | * @param chReplacementChar character which will be used when the input stream |
294 | | * contains a non ASCII character. Must be valid ASCII! |
295 | | * |
296 | | * @return a new string that must be freed with CPLFree(). |
297 | | * |
298 | | * @since GDAL 1.7.0 |
299 | | */ |
300 | | char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar) |
301 | 0 | { |
302 | 0 | const size_t nRealLen = |
303 | 0 | (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData); |
304 | 0 | char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1)); |
305 | 0 | const char *pszPtr = pabyData; |
306 | 0 | const char *pszEnd = pabyData + nRealLen; |
307 | 0 | size_t i = 0; |
308 | 0 | while (pszPtr != pszEnd) |
309 | 0 | { |
310 | 0 | if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127) |
311 | 0 | { |
312 | 0 | pszOutputString[i] = chReplacementChar; |
313 | 0 | ++pszPtr; |
314 | 0 | ++i; |
315 | 0 | } |
316 | 0 | else |
317 | 0 | { |
318 | 0 | pszOutputString[i] = *pszPtr; |
319 | 0 | ++pszPtr; |
320 | 0 | ++i; |
321 | 0 | } |
322 | 0 | } |
323 | 0 | pszOutputString[i] = '\0'; |
324 | 0 | return pszOutputString; |
325 | 0 | } |
326 | | |
327 | | /************************************************************************/ |
328 | | /* CPLUTF8ForceToASCII() */ |
329 | | /************************************************************************/ |
330 | | |
331 | | /** |
332 | | * Return a new string that is made only of ASCII characters. If non-ASCII |
333 | | * characters are found in the input string, for which an "equivalent" ASCII |
334 | | * character is not found, they will be replaced by the provided replacement |
335 | | * character. |
336 | | * |
337 | | * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement |
338 | | * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible |
339 | | * replacements for accented characters. |
340 | | |
341 | | * @param pszStr NUL-terminated UTF-8 string. |
342 | | * @param chReplacementChar character which will be used when the input stream |
343 | | * contains a non ASCII character that cannot be |
344 | | * substituted with an equivalent ASCII character. |
345 | | * Must be valid ASCII! |
346 | | * |
347 | | * @return a new string that must be freed with CPLFree(). |
348 | | * |
349 | | * @since GDAL 3.9 |
350 | | */ |
351 | | char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar) |
352 | 0 | { |
353 | 0 | static const struct |
354 | 0 | { |
355 | 0 | short nCodePoint; |
356 | 0 | char chFirst; |
357 | 0 | char chSecond; |
358 | 0 | } aLatinCharacters[] = { |
359 | | // https://en.wikipedia.org/wiki/Latin-1_Supplement |
360 | 0 | {0xC0, 'A', 0}, // Latin Capital Letter A with grave |
361 | 0 | {0xC1, 'A', 0}, // Latin Capital letter A with acute |
362 | 0 | {0xC2, 'A', 0}, // Latin Capital letter A with circumflex |
363 | 0 | {0xC3, 'A', 0}, // Latin Capital letter A with tilde |
364 | 0 | {0xC4, 'A', 0}, // Latin Capital letter A with diaeresis |
365 | 0 | {0xC5, 'A', 0}, // Latin Capital letter A with ring above |
366 | 0 | {0xC6, 'A', 'E'}, // Latin Capital letter AE |
367 | 0 | {0xC7, 'C', 0}, // Latin Capital letter C with cedilla |
368 | 0 | {0xC8, 'E', 0}, // Latin Capital letter E with grave |
369 | 0 | {0xC9, 'E', 0}, // Latin Capital letter E with acute |
370 | 0 | {0xCA, 'E', 0}, // Latin Capital letter E with circumflex |
371 | 0 | {0xCB, 'E', 0}, // Latin Capital letter E with diaeresis |
372 | 0 | {0xCC, 'I', 0}, // Latin Capital letter I with grave |
373 | 0 | {0xCD, 'I', 0}, // Latin Capital letter I with acute |
374 | 0 | {0xCE, 'I', 0}, // Latin Capital letter I with circumflex |
375 | 0 | {0xCF, 'I', 0}, // Latin Capital letter I with diaeresis |
376 | | // { 0xD0, '?', 0 }, // Latin Capital letter Eth |
377 | 0 | {0xD1, 'N', 0}, // Latin Capital letter N with tilde |
378 | 0 | {0xD2, 'O', 0}, // Latin Capital letter O with grave |
379 | 0 | {0xD3, 'O', 0}, // Latin Capital letter O with acute |
380 | 0 | {0xD4, 'O', 0}, // Latin Capital letter O with circumflex |
381 | 0 | {0xD5, 'O', 0}, // Latin Capital letter O with tilde |
382 | 0 | {0xD6, 'O', 0}, // Latin Capital letter O with diaeresis |
383 | 0 | {0xD8, 'O', 0}, // Latin Capital letter O with stroke |
384 | 0 | {0xD9, 'U', 0}, // Latin Capital letter U with grave |
385 | 0 | {0xDA, 'U', 0}, // Latin Capital letter U with acute |
386 | 0 | {0xDB, 'U', 0}, // Latin Capital Letter U with circumflex |
387 | 0 | {0xDC, 'U', 0}, // Latin Capital Letter U with diaeresis |
388 | 0 | {0xDD, 'Y', 0}, // Latin Capital Letter Y with acute |
389 | | // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn |
390 | 0 | {0xDF, 'S', 'S'}, // Latin Small Letter sharp S |
391 | 0 | {0xE0, 'a', 0}, // Latin Small Letter A with grave |
392 | 0 | {0xE1, 'a', 0}, // Latin Small Letter A with acute |
393 | 0 | {0xE2, 'a', 0}, // Latin Small Letter A with circumflex |
394 | 0 | {0xE3, 'a', 0}, // Latin Small Letter A with tilde |
395 | 0 | {0xE4, 'a', 0}, // Latin Small Letter A with diaeresis |
396 | 0 | {0xE5, 'a', 0}, // Latin Small Letter A with ring above |
397 | 0 | {0xE6, 'a', 'e'}, // Latin Small Letter AE |
398 | 0 | {0xE7, 'c', 0}, // Latin Small Letter C with cedilla |
399 | 0 | {0xE8, 'e', 0}, // Latin Small Letter E with grave |
400 | 0 | {0xE9, 'e', 0}, // Latin Small Letter E with acute |
401 | 0 | {0xEA, 'e', 0}, // Latin Small Letter E with circumflex |
402 | 0 | {0xEB, 'e', 0}, // Latin Small Letter E with diaeresis |
403 | 0 | {0xEC, 'i', 0}, // Latin Small Letter I with grave |
404 | 0 | {0xED, 'i', 0}, // Latin Small Letter I with acute |
405 | 0 | {0xEE, 'i', 0}, // Latin Small Letter I with circumflex |
406 | 0 | {0xEF, 'i', 0}, // Latin Small Letter I with diaeresis |
407 | | // { 0xF0, '?', 0 }, // Latin Small Letter Eth |
408 | 0 | {0xF1, 'n', 0}, // Latin Small Letter N with tilde |
409 | 0 | {0xF2, 'o', 0}, // Latin Small Letter O with grave |
410 | 0 | {0xF3, 'o', 0}, // Latin Small Letter O with acute |
411 | 0 | {0xF4, 'o', 0}, // Latin Small Letter O with circumflex |
412 | 0 | {0xF5, 'o', 0}, // Latin Small Letter O with tilde |
413 | 0 | {0xF6, 'o', 0}, // Latin Small Letter O with diaeresis |
414 | 0 | {0xF8, 'o', 0}, // Latin Small Letter O with stroke |
415 | 0 | {0xF9, 'u', 0}, // Latin Small Letter U with grave |
416 | 0 | {0xFA, 'u', 0}, // Latin Small Letter U with acute |
417 | 0 | {0xFB, 'u', 0}, // Latin Small Letter U with circumflex |
418 | 0 | {0xFC, 'u', 0}, // Latin Small Letter U with diaeresis |
419 | 0 | {0xFD, 'y', 0}, // Latin Small Letter Y with acute |
420 | | // { 0xFE, '?', 0 }, // Latin Small Letter Thorn |
421 | 0 | {0xFF, 'u', 0}, // Latin Small Letter Y with diaeresis |
422 | | |
423 | | // https://en.wikipedia.org/wiki/Latin_Extended-A |
424 | 0 | { |
425 | 0 | 0x0100, |
426 | 0 | 'A', |
427 | 0 | 0, |
428 | 0 | }, // Latin Capital letter A with macron |
429 | 0 | { |
430 | 0 | 0x0101, |
431 | 0 | 'a', |
432 | 0 | 0, |
433 | 0 | }, // Latin Small letter A with macron |
434 | 0 | { |
435 | 0 | 0x0102, |
436 | 0 | 'A', |
437 | 0 | 0, |
438 | 0 | }, // Latin Capital letter A with breve |
439 | 0 | { |
440 | 0 | 0x0103, |
441 | 0 | 'a', |
442 | 0 | 0, |
443 | 0 | }, // Latin Small letter A with breve |
444 | 0 | { |
445 | 0 | 0x0104, |
446 | 0 | 'A', |
447 | 0 | 0, |
448 | 0 | }, // Latin Capital letter A with ogonek |
449 | 0 | { |
450 | 0 | 0x0105, |
451 | 0 | 'a', |
452 | 0 | 0, |
453 | 0 | }, // Latin Small letter A with ogonek |
454 | 0 | { |
455 | 0 | 0x0106, |
456 | 0 | 'C', |
457 | 0 | 0, |
458 | 0 | }, // Latin Capital letter C with acute |
459 | 0 | { |
460 | 0 | 0x0107, |
461 | 0 | 'c', |
462 | 0 | 0, |
463 | 0 | }, // Latin Small letter C with acute |
464 | 0 | { |
465 | 0 | 0x0108, |
466 | 0 | 'C', |
467 | 0 | 0, |
468 | 0 | }, // Latin Capital letter C with circumflex |
469 | 0 | { |
470 | 0 | 0x0109, |
471 | 0 | 'c', |
472 | 0 | 0, |
473 | 0 | }, // Latin Small letter C with circumflex |
474 | 0 | { |
475 | 0 | 0x010A, |
476 | 0 | 'C', |
477 | 0 | 0, |
478 | 0 | }, // Latin Capital letter C with dot above |
479 | 0 | { |
480 | 0 | 0x010B, |
481 | 0 | 'c', |
482 | 0 | 0, |
483 | 0 | }, // Latin Small letter C with dot above |
484 | 0 | { |
485 | 0 | 0x010C, |
486 | 0 | 'C', |
487 | 0 | 0, |
488 | 0 | }, // Latin Capital letter C with caron |
489 | 0 | { |
490 | 0 | 0x010D, |
491 | 0 | 'c', |
492 | 0 | 0, |
493 | 0 | }, // Latin Small letter C with caron |
494 | 0 | { |
495 | 0 | 0x010E, |
496 | 0 | 'D', |
497 | 0 | 0, |
498 | 0 | }, // Latin Capital letter D with caron |
499 | 0 | { |
500 | 0 | 0x010F, |
501 | 0 | 'd', |
502 | 0 | 0, |
503 | 0 | }, // Latin Small letter D with caron |
504 | 0 | { |
505 | 0 | 0x0110, |
506 | 0 | 'D', |
507 | 0 | 0, |
508 | 0 | }, // Latin Capital letter D with stroke |
509 | 0 | { |
510 | 0 | 0x0111, |
511 | 0 | 'd', |
512 | 0 | 0, |
513 | 0 | }, // Latin Small letter D with stroke |
514 | 0 | { |
515 | 0 | 0x0112, |
516 | 0 | 'E', |
517 | 0 | 0, |
518 | 0 | }, // Latin Capital letter E with macron |
519 | 0 | { |
520 | 0 | 0x0113, |
521 | 0 | 'e', |
522 | 0 | 0, |
523 | 0 | }, // Latin Small letter E with macron |
524 | 0 | { |
525 | 0 | 0x0114, |
526 | 0 | 'E', |
527 | 0 | 0, |
528 | 0 | }, // Latin Capital letter E with breve |
529 | 0 | { |
530 | 0 | 0x0115, |
531 | 0 | 'e', |
532 | 0 | 0, |
533 | 0 | }, // Latin Small letter E with breve |
534 | 0 | { |
535 | 0 | 0x0116, |
536 | 0 | 'E', |
537 | 0 | 0, |
538 | 0 | }, // Latin Capital letter E with dot above |
539 | 0 | { |
540 | 0 | 0x0117, |
541 | 0 | 'e', |
542 | 0 | 0, |
543 | 0 | }, // Latin Small letter E with dot above |
544 | 0 | { |
545 | 0 | 0x0118, |
546 | 0 | 'E', |
547 | 0 | 0, |
548 | 0 | }, // Latin Capital letter E with ogonek |
549 | 0 | { |
550 | 0 | 0x0119, |
551 | 0 | 'e', |
552 | 0 | 0, |
553 | 0 | }, // Latin Small letter E with ogonek |
554 | 0 | { |
555 | 0 | 0x011A, |
556 | 0 | 'E', |
557 | 0 | 0, |
558 | 0 | }, // Latin Capital letter E with caron |
559 | 0 | { |
560 | 0 | 0x011B, |
561 | 0 | 'e', |
562 | 0 | 0, |
563 | 0 | }, // Latin Small letter E with caron |
564 | 0 | { |
565 | 0 | 0x011C, |
566 | 0 | 'G', |
567 | 0 | 0, |
568 | 0 | }, // Latin Capital letter G with circumflex |
569 | 0 | { |
570 | 0 | 0x011D, |
571 | 0 | 'g', |
572 | 0 | 0, |
573 | 0 | }, // Latin Small letter G with circumflex |
574 | 0 | { |
575 | 0 | 0x011E, |
576 | 0 | 'G', |
577 | 0 | 0, |
578 | 0 | }, // Latin Capital letter G with breve |
579 | 0 | { |
580 | 0 | 0x011F, |
581 | 0 | 'g', |
582 | 0 | 0, |
583 | 0 | }, // Latin Small letter G with breve |
584 | 0 | { |
585 | 0 | 0x0120, |
586 | 0 | 'G', |
587 | 0 | 0, |
588 | 0 | }, // Latin Capital letter G with dot above |
589 | 0 | { |
590 | 0 | 0x0121, |
591 | 0 | 'g', |
592 | 0 | 0, |
593 | 0 | }, // Latin Small letter G with dot above |
594 | 0 | { |
595 | 0 | 0x0122, |
596 | 0 | 'G', |
597 | 0 | 0, |
598 | 0 | }, // Latin Capital letter G with cedilla |
599 | 0 | { |
600 | 0 | 0x0123, |
601 | 0 | 'g', |
602 | 0 | 0, |
603 | 0 | }, // Latin Small letter G with cedilla |
604 | 0 | { |
605 | 0 | 0x0124, |
606 | 0 | 'H', |
607 | 0 | 0, |
608 | 0 | }, // Latin Capital letter H with circumflex |
609 | 0 | { |
610 | 0 | 0x0125, |
611 | 0 | 'h', |
612 | 0 | 0, |
613 | 0 | }, // Latin Small letter H with circumflex |
614 | 0 | { |
615 | 0 | 0x0126, |
616 | 0 | 'H', |
617 | 0 | 0, |
618 | 0 | }, // Latin Capital letter H with stroke |
619 | 0 | { |
620 | 0 | 0x0127, |
621 | 0 | 'h', |
622 | 0 | 0, |
623 | 0 | }, // Latin Small letter H with stroke |
624 | 0 | { |
625 | 0 | 0x0128, |
626 | 0 | 'I', |
627 | 0 | 0, |
628 | 0 | }, // Latin Capital letter I with tilde |
629 | 0 | { |
630 | 0 | 0x0129, |
631 | 0 | 'i', |
632 | 0 | 0, |
633 | 0 | }, // Latin Small letter I with tilde |
634 | 0 | { |
635 | 0 | 0x012A, |
636 | 0 | 'I', |
637 | 0 | 0, |
638 | 0 | }, // Latin Capital letter I with macron |
639 | 0 | { |
640 | 0 | 0x012B, |
641 | 0 | 'i', |
642 | 0 | 0, |
643 | 0 | }, // Latin Small letter I with macron |
644 | 0 | { |
645 | 0 | 0x012C, |
646 | 0 | 'I', |
647 | 0 | 0, |
648 | 0 | }, // Latin Capital letter I with breve |
649 | 0 | { |
650 | 0 | 0x012D, |
651 | 0 | 'i', |
652 | 0 | 0, |
653 | 0 | }, // Latin Small letter I with breve |
654 | 0 | { |
655 | 0 | 0x012E, |
656 | 0 | 'I', |
657 | 0 | 0, |
658 | 0 | }, // Latin Capital letter I with ogonek |
659 | 0 | { |
660 | 0 | 0x012F, |
661 | 0 | 'i', |
662 | 0 | 0, |
663 | 0 | }, // Latin Small letter I with ogonek |
664 | 0 | { |
665 | 0 | 0x0130, |
666 | 0 | 'I', |
667 | 0 | 0, |
668 | 0 | }, // Latin Capital letter I with dot above |
669 | 0 | { |
670 | 0 | 0x0131, |
671 | 0 | 'i', |
672 | 0 | 0, |
673 | 0 | }, // Latin Small letter dotless I |
674 | 0 | { |
675 | 0 | 0x0132, |
676 | 0 | 'I', |
677 | 0 | 'J', |
678 | 0 | }, // Latin Capital Ligature IJ |
679 | 0 | { |
680 | 0 | 0x0133, |
681 | 0 | 'i', |
682 | 0 | 'j', |
683 | 0 | }, // Latin Small Ligature IJ |
684 | 0 | { |
685 | 0 | 0x0134, |
686 | 0 | 'J', |
687 | 0 | 0, |
688 | 0 | }, // Latin Capital letter J with circumflex |
689 | 0 | { |
690 | 0 | 0x0135, |
691 | 0 | 'j', |
692 | 0 | 0, |
693 | 0 | }, // Latin Small letter J with circumflex |
694 | 0 | { |
695 | 0 | 0x0136, |
696 | 0 | 'K', |
697 | 0 | 0, |
698 | 0 | }, // Latin Capital letter K with cedilla |
699 | 0 | { |
700 | 0 | 0x0137, |
701 | 0 | 'k', |
702 | 0 | 0, |
703 | 0 | }, // Latin Small letter K with cedilla |
704 | 0 | { |
705 | 0 | 0x0138, |
706 | 0 | 'k', |
707 | 0 | 0, |
708 | 0 | }, // Latin Small letter Kra |
709 | 0 | { |
710 | 0 | 0x0139, |
711 | 0 | 'L', |
712 | 0 | 0, |
713 | 0 | }, // Latin Capital letter L with acute |
714 | 0 | { |
715 | 0 | 0x013A, |
716 | 0 | 'l', |
717 | 0 | 0, |
718 | 0 | }, // Latin Small letter L with acute |
719 | 0 | { |
720 | 0 | 0x013B, |
721 | 0 | 'L', |
722 | 0 | 0, |
723 | 0 | }, // Latin Capital letter L with cedilla |
724 | 0 | { |
725 | 0 | 0x013C, |
726 | 0 | 'l', |
727 | 0 | 0, |
728 | 0 | }, // Latin Small letter L with cedilla |
729 | 0 | { |
730 | 0 | 0x013D, |
731 | 0 | 'L', |
732 | 0 | 0, |
733 | 0 | }, // Latin Capital letter L with caron |
734 | 0 | { |
735 | 0 | 0x013E, |
736 | 0 | 'l', |
737 | 0 | 0, |
738 | 0 | }, // Latin Small letter L with caron |
739 | 0 | { |
740 | 0 | 0x013F, |
741 | 0 | 'L', |
742 | 0 | 0, |
743 | 0 | }, // Latin Capital letter L with middle dot |
744 | 0 | { |
745 | 0 | 0x0140, |
746 | 0 | 'l', |
747 | 0 | 0, |
748 | 0 | }, // Latin Small letter L with middle dot |
749 | 0 | { |
750 | 0 | 0x0141, |
751 | 0 | 'L', |
752 | 0 | 0, |
753 | 0 | }, // Latin Capital letter L with stroke |
754 | 0 | { |
755 | 0 | 0x0142, |
756 | 0 | 'l', |
757 | 0 | 0, |
758 | 0 | }, // Latin Small letter L with stroke |
759 | 0 | { |
760 | 0 | 0x0143, |
761 | 0 | 'N', |
762 | 0 | 0, |
763 | 0 | }, // Latin Capital letter N with acute |
764 | 0 | { |
765 | 0 | 0x0144, |
766 | 0 | 'n', |
767 | 0 | 0, |
768 | 0 | }, // Latin Small letter N with acute |
769 | 0 | { |
770 | 0 | 0x0145, |
771 | 0 | 'N', |
772 | 0 | 0, |
773 | 0 | }, // Latin Capital letter N with cedilla |
774 | 0 | { |
775 | 0 | 0x0146, |
776 | 0 | 'n', |
777 | 0 | 0, |
778 | 0 | }, // Latin Small letter N with cedilla |
779 | 0 | { |
780 | 0 | 0x0147, |
781 | 0 | 'N', |
782 | 0 | 0, |
783 | 0 | }, // Latin Capital letter N with caron |
784 | 0 | { |
785 | 0 | 0x0148, |
786 | 0 | 'n', |
787 | 0 | 0, |
788 | 0 | }, // Latin Small letter N with caron |
789 | | // { 0x014A , '?' , 0, }, // Latin Capital letter Eng |
790 | | // { 0x014B , '?' , 0, }, // Latin Small letter Eng |
791 | 0 | { |
792 | 0 | 0x014C, |
793 | 0 | 'O', |
794 | 0 | 0, |
795 | 0 | }, // Latin Capital letter O with macron |
796 | 0 | { |
797 | 0 | 0x014D, |
798 | 0 | 'o', |
799 | 0 | 0, |
800 | 0 | }, // Latin Small letter O with macron |
801 | 0 | { |
802 | 0 | 0x014E, |
803 | 0 | 'O', |
804 | 0 | 0, |
805 | 0 | }, // Latin Capital letter O with breve |
806 | 0 | { |
807 | 0 | 0x014F, |
808 | 0 | 'o', |
809 | 0 | 0, |
810 | 0 | }, // Latin Small letter O with breve |
811 | 0 | { |
812 | 0 | 0x0150, |
813 | 0 | 'O', |
814 | 0 | 0, |
815 | 0 | }, // Latin Capital Letter O with double acute |
816 | 0 | { |
817 | 0 | 0x0151, |
818 | 0 | 'o', |
819 | 0 | 0, |
820 | 0 | }, // Latin Small Letter O with double acute |
821 | 0 | { |
822 | 0 | 0x0152, |
823 | 0 | 'O', |
824 | 0 | 'E', |
825 | 0 | }, // Latin Capital Ligature OE |
826 | 0 | { |
827 | 0 | 0x0153, |
828 | 0 | 'o', |
829 | 0 | 'e', |
830 | 0 | }, // Latin Small Ligature OE |
831 | 0 | { |
832 | 0 | 0x0154, |
833 | 0 | 'R', |
834 | 0 | 0, |
835 | 0 | }, // Latin Capital letter R with acute |
836 | 0 | { |
837 | 0 | 0x0155, |
838 | 0 | 'r', |
839 | 0 | 0, |
840 | 0 | }, // Latin Small letter R with acute |
841 | 0 | { |
842 | 0 | 0x0156, |
843 | 0 | 'R', |
844 | 0 | 0, |
845 | 0 | }, // Latin Capital letter R with cedilla |
846 | 0 | { |
847 | 0 | 0x0157, |
848 | 0 | 'r', |
849 | 0 | 0, |
850 | 0 | }, // Latin Small letter R with cedilla |
851 | 0 | { |
852 | 0 | 0x0158, |
853 | 0 | 'R', |
854 | 0 | 0, |
855 | 0 | }, // Latin Capital letter R with caron |
856 | 0 | { |
857 | 0 | 0x0159, |
858 | 0 | 'r', |
859 | 0 | 0, |
860 | 0 | }, // Latin Small letter R with caron |
861 | 0 | { |
862 | 0 | 0x015A, |
863 | 0 | 'S', |
864 | 0 | 0, |
865 | 0 | }, // Latin Capital letter S with acute |
866 | 0 | { |
867 | 0 | 0x015B, |
868 | 0 | 's', |
869 | 0 | 0, |
870 | 0 | }, // Latin Small letter S with acute |
871 | 0 | { |
872 | 0 | 0x015C, |
873 | 0 | 'S', |
874 | 0 | 0, |
875 | 0 | }, // Latin Capital letter S with circumflex |
876 | 0 | { |
877 | 0 | 0x015D, |
878 | 0 | 's', |
879 | 0 | 0, |
880 | 0 | }, // Latin Small letter S with circumflex |
881 | 0 | { |
882 | 0 | 0x015E, |
883 | 0 | 'S', |
884 | 0 | 0, |
885 | 0 | }, // Latin Capital letter S with cedilla |
886 | 0 | { |
887 | 0 | 0x015F, |
888 | 0 | 's', |
889 | 0 | 0, |
890 | 0 | }, // Latin Small letter S with cedilla |
891 | 0 | { |
892 | 0 | 0x0160, |
893 | 0 | 'S', |
894 | 0 | 0, |
895 | 0 | }, // Latin Capital letter S with caron |
896 | 0 | { |
897 | 0 | 0x0161, |
898 | 0 | 's', |
899 | 0 | 0, |
900 | 0 | }, // Latin Small letter S with caron |
901 | 0 | { |
902 | 0 | 0x0162, |
903 | 0 | 'T', |
904 | 0 | 0, |
905 | 0 | }, // Latin Capital letter T with cedilla |
906 | 0 | { |
907 | 0 | 0x0163, |
908 | 0 | 't', |
909 | 0 | 0, |
910 | 0 | }, // Latin Small letter T with cedilla |
911 | 0 | { |
912 | 0 | 0x0164, |
913 | 0 | 'T', |
914 | 0 | 0, |
915 | 0 | }, // Latin Capital letter T with caron |
916 | 0 | { |
917 | 0 | 0x0165, |
918 | 0 | 't', |
919 | 0 | 0, |
920 | 0 | }, // Latin Small letter T with caron |
921 | 0 | { |
922 | 0 | 0x0166, |
923 | 0 | 'T', |
924 | 0 | 0, |
925 | 0 | }, // Latin Capital letter T with stroke |
926 | 0 | { |
927 | 0 | 0x0167, |
928 | 0 | 't', |
929 | 0 | 0, |
930 | 0 | }, // Latin Small letter T with stroke |
931 | 0 | { |
932 | 0 | 0x0168, |
933 | 0 | 'U', |
934 | 0 | 0, |
935 | 0 | }, // Latin Capital letter U with tilde |
936 | 0 | { |
937 | 0 | 0x0169, |
938 | 0 | 'u', |
939 | 0 | 0, |
940 | 0 | }, // Latin Small letter U with tilde |
941 | 0 | { |
942 | 0 | 0x016A, |
943 | 0 | 'U', |
944 | 0 | 0, |
945 | 0 | }, // Latin Capital letter U with macron |
946 | 0 | { |
947 | 0 | 0x016B, |
948 | 0 | 'u', |
949 | 0 | 0, |
950 | 0 | }, // Latin Small letter U with macron |
951 | 0 | { |
952 | 0 | 0x016C, |
953 | 0 | 'U', |
954 | 0 | 0, |
955 | 0 | }, // Latin Capital letter U with breve |
956 | 0 | { |
957 | 0 | 0x016D, |
958 | 0 | 'u', |
959 | 0 | 0, |
960 | 0 | }, // Latin Small letter U with breve |
961 | 0 | { |
962 | 0 | 0x016E, |
963 | 0 | 'U', |
964 | 0 | 0, |
965 | 0 | }, // Latin Capital letter U with ring above |
966 | 0 | { |
967 | 0 | 0x016F, |
968 | 0 | 'u', |
969 | 0 | 0, |
970 | 0 | }, // Latin Small letter U with ring above |
971 | 0 | { |
972 | 0 | 0x0170, |
973 | 0 | 'U', |
974 | 0 | 0, |
975 | 0 | }, // Latin Capital Letter U with double acute |
976 | 0 | { |
977 | 0 | 0x0171, |
978 | 0 | 'u', |
979 | 0 | 0, |
980 | 0 | }, // Latin Small Letter U with double acute |
981 | 0 | { |
982 | 0 | 0x0172, |
983 | 0 | 'U', |
984 | 0 | 0, |
985 | 0 | }, // Latin Capital letter U with ogonek |
986 | 0 | { |
987 | 0 | 0x0173, |
988 | 0 | 'u', |
989 | 0 | 0, |
990 | 0 | }, // Latin Small letter U with ogonek |
991 | 0 | { |
992 | 0 | 0x0174, |
993 | 0 | 'W', |
994 | 0 | 0, |
995 | 0 | }, // Latin Capital letter W with circumflex |
996 | 0 | { |
997 | 0 | 0x0175, |
998 | 0 | 'w', |
999 | 0 | 0, |
1000 | 0 | }, // Latin Small letter W with circumflex |
1001 | 0 | { |
1002 | 0 | 0x0176, |
1003 | 0 | 'Y', |
1004 | 0 | 0, |
1005 | 0 | }, // Latin Capital letter Y with circumflex |
1006 | 0 | { |
1007 | 0 | 0x0177, |
1008 | 0 | 'y', |
1009 | 0 | 0, |
1010 | 0 | }, // Latin Small letter Y with circumflex |
1011 | 0 | { |
1012 | 0 | 0x0178, |
1013 | 0 | 'Y', |
1014 | 0 | 0, |
1015 | 0 | }, // Latin Capital letter Y with diaeresis |
1016 | 0 | { |
1017 | 0 | 0x0179, |
1018 | 0 | 'Z', |
1019 | 0 | 0, |
1020 | 0 | }, // Latin Capital letter Z with acute |
1021 | 0 | { |
1022 | 0 | 0x017A, |
1023 | 0 | 'z', |
1024 | 0 | 0, |
1025 | 0 | }, // Latin Small letter Z with acute |
1026 | 0 | { |
1027 | 0 | 0x017B, |
1028 | 0 | 'Z', |
1029 | 0 | 0, |
1030 | 0 | }, // Latin Capital letter Z with dot above |
1031 | 0 | { |
1032 | 0 | 0x017C, |
1033 | 0 | 'z', |
1034 | 0 | 0, |
1035 | 0 | }, // Latin Small letter Z with dot above |
1036 | 0 | { |
1037 | 0 | 0x017D, |
1038 | 0 | 'Z', |
1039 | 0 | 0, |
1040 | 0 | }, // Latin Capital letter Z with caron |
1041 | 0 | { |
1042 | 0 | 0x017E, |
1043 | 0 | 'z', |
1044 | 0 | 0, |
1045 | 0 | }, // Latin Small letter Z with caron |
1046 | 0 | }; |
1047 | |
|
1048 | 0 | const size_t nLen = strlen(pszStr); |
1049 | 0 | char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1)); |
1050 | 0 | const char *pszPtr = pszStr; |
1051 | 0 | const char *pszEnd = pszStr + nLen; |
1052 | 0 | size_t i = 0; |
1053 | 0 | while (pszPtr != pszEnd) |
1054 | 0 | { |
1055 | 0 | if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127) |
1056 | 0 | { |
1057 | 0 | utf8_int32_t codepoint; |
1058 | 0 | if (pszPtr + utf8codepointcalcsize( |
1059 | 0 | reinterpret_cast<const utf8_int8_t *>(pszPtr)) > |
1060 | 0 | pszEnd) |
1061 | 0 | break; |
1062 | 0 | auto pszNext = reinterpret_cast<const char *>(utf8codepoint( |
1063 | 0 | reinterpret_cast<const utf8_int8_t *>(pszPtr), &codepoint)); |
1064 | 0 | char ch = chReplacementChar; |
1065 | 0 | for (const auto &latin1char : aLatinCharacters) |
1066 | 0 | { |
1067 | 0 | if (codepoint == latin1char.nCodePoint) |
1068 | 0 | { |
1069 | 0 | pszOutputString[i] = latin1char.chFirst; |
1070 | 0 | ++i; |
1071 | 0 | if (latin1char.chSecond) |
1072 | 0 | { |
1073 | 0 | pszOutputString[i] = latin1char.chSecond; |
1074 | 0 | ++i; |
1075 | 0 | } |
1076 | 0 | ch = 0; |
1077 | 0 | break; |
1078 | 0 | } |
1079 | 0 | } |
1080 | 0 | if (ch) |
1081 | 0 | { |
1082 | 0 | pszOutputString[i] = ch; |
1083 | 0 | ++i; |
1084 | 0 | } |
1085 | 0 | pszPtr = pszNext; |
1086 | 0 | } |
1087 | 0 | else |
1088 | 0 | { |
1089 | 0 | pszOutputString[i] = *pszPtr; |
1090 | 0 | ++pszPtr; |
1091 | 0 | ++i; |
1092 | 0 | } |
1093 | 0 | } |
1094 | 0 | pszOutputString[i] = '\0'; |
1095 | 0 | return pszOutputString; |
1096 | 0 | } |
1097 | | |
1098 | | /************************************************************************/ |
1099 | | /* CPLEncodingCharSize() */ |
1100 | | /************************************************************************/ |
1101 | | |
1102 | | /** |
1103 | | * Return bytes per character for encoding. |
1104 | | * |
1105 | | * This function returns the size in bytes of the smallest character |
1106 | | * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this |
1107 | | * is straight forward. For encodings like UTF8 and UTF16 which represent |
1108 | | * some characters as a sequence of atomic character sizes the function |
1109 | | * still returns the atomic character size (1 for UTF8, 2 for UTF16). |
1110 | | * |
1111 | | * This function will return the correct value for well known encodings |
1112 | | * with corresponding CPL_ENC_ values. It may not return the correct value |
1113 | | * for other encodings even if they are supported by the underlying iconv |
1114 | | * or windows transliteration services. Hopefully it will improve over time. |
1115 | | * |
1116 | | * @param pszEncoding the name of the encoding. |
1117 | | * |
1118 | | * @return the size of a minimal character in bytes or -1 if the size is |
1119 | | * unknown. |
1120 | | */ |
1121 | | |
1122 | | int CPLEncodingCharSize(const char *pszEncoding) |
1123 | | |
1124 | 0 | { |
1125 | 0 | if (EQUAL(pszEncoding, CPL_ENC_UTF8)) |
1126 | 0 | return 1; |
1127 | 0 | else if (EQUAL(pszEncoding, CPL_ENC_UTF16) || |
1128 | 0 | EQUAL(pszEncoding, "UTF-16LE")) |
1129 | 0 | return 2; |
1130 | 0 | else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE")) |
1131 | 0 | return 2; |
1132 | 0 | else if (EQUAL(pszEncoding, CPL_ENC_UCS4)) |
1133 | 0 | return 4; |
1134 | 0 | else if (EQUAL(pszEncoding, CPL_ENC_ASCII)) |
1135 | 0 | return 1; |
1136 | 0 | else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-")) |
1137 | 0 | return 1; |
1138 | | |
1139 | 0 | return -1; |
1140 | 0 | } |
1141 | | |
1142 | | /************************************************************************/ |
1143 | | /* CPLClearRecodeWarningFlags() */ |
1144 | | /************************************************************************/ |
1145 | | |
1146 | | void CPLClearRecodeWarningFlags() |
1147 | 0 | { |
1148 | 0 | #ifdef CPL_RECODE_ICONV |
1149 | 0 | CPLClearRecodeIconvWarningFlags(); |
1150 | 0 | #endif |
1151 | 0 | CPLClearRecodeStubWarningFlags(); |
1152 | 0 | } |
1153 | | |
1154 | | /************************************************************************/ |
1155 | | /* CPLStrlenUTF8() */ |
1156 | | /************************************************************************/ |
1157 | | |
1158 | | /** |
1159 | | * Return the number of UTF-8 characters of a nul-terminated string. |
1160 | | * |
1161 | | * This is different from strlen() which returns the number of bytes. |
1162 | | * |
1163 | | * @param pszUTF8Str a nul-terminated UTF-8 string |
1164 | | * |
1165 | | * @return the number of UTF-8 characters. |
1166 | | */ |
1167 | | |
1168 | | int CPLStrlenUTF8(const char *pszUTF8Str) |
1169 | 0 | { |
1170 | 0 | int nCharacterCount = 0; |
1171 | 0 | for (size_t i = 0; pszUTF8Str[i] != '\0'; ++i) |
1172 | 0 | { |
1173 | 0 | if ((pszUTF8Str[i] & 0xc0) != 0x80) |
1174 | 0 | { |
1175 | 0 | if (nCharacterCount == INT_MAX) |
1176 | 0 | { |
1177 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
1178 | 0 | "CPLStrlenUTF8(): nCharacterCount > INT_MAX. Use " |
1179 | 0 | "CPLStrlenUTF8Ex() instead"); |
1180 | 0 | break; |
1181 | 0 | } |
1182 | 0 | ++nCharacterCount; |
1183 | 0 | } |
1184 | 0 | } |
1185 | 0 | return nCharacterCount; |
1186 | 0 | } |
1187 | | |
/************************************************************************/
/*                          CPLStrlenUTF8Ex()                           */
/************************************************************************/

/**
 * Return the number of UTF-8 characters of a nul-terminated string.
 *
 * This is different from strlen() which returns the number of bytes.
 * Every byte that is not of the form 10xxxxxx is counted as the start of
 * one character; the input is not otherwise validated.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */

size_t CPLStrlenUTF8Ex(const char *pszUTF8Str)
{
    size_t nCount = 0;
    const char *pszIter = pszUTF8Str;
    while (*pszIter != '\0')
    {
        // Bytes of the form 10xxxxxx continue a character already counted.
        if ((*pszIter & 0xc0) != 0x80)
            ++nCount;
        ++pszIter;
    }
    return nCount;
}
1214 | | |
1215 | | /************************************************************************/ |
1216 | | /* CPLCanRecode() */ |
1217 | | /************************************************************************/ |
1218 | | |
1219 | | /** |
1220 | | * Checks if it is possible to recode a string from one encoding to another. |
1221 | | * |
1222 | | * @param pszTestStr a NULL terminated string. |
1223 | | * @param pszSrcEncoding the source encoding. |
1224 | | * @param pszDstEncoding the destination encoding. |
1225 | | * |
1226 | | * @return a TRUE if recode is possible. |
1227 | | * |
1228 | | * @since GDAL 3.1.0 |
1229 | | */ |
1230 | | int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding, |
1231 | | const char *pszDstEncoding) |
1232 | 0 | { |
1233 | 0 | CPLClearRecodeWarningFlags(); |
1234 | 0 | CPLErrorReset(); |
1235 | |
|
1236 | 0 | CPLPushErrorHandler(CPLQuietErrorHandler); |
1237 | 0 | char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding)); |
1238 | 0 | CPLPopErrorHandler(); |
1239 | |
|
1240 | 0 | if (pszRec == nullptr) |
1241 | 0 | { |
1242 | 0 | return FALSE; |
1243 | 0 | } |
1244 | | |
1245 | 0 | CPLFree(pszRec); |
1246 | |
|
1247 | 0 | if (CPLGetLastErrorType() != 0) |
1248 | 0 | { |
1249 | 0 | return FALSE; |
1250 | 0 | } |
1251 | | |
1252 | 0 | return TRUE; |
1253 | 0 | } |