/src/gdal/port/cpl_recode_stub.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * |
3 | | * Name: cpl_recode_stub.cpp |
4 | | * Project: CPL - Common Portability Library |
5 | | * Purpose: Character set recoding and char/wchar_t conversions, stub |
6 | | * implementation to be used if iconv() functionality is not |
7 | | * available. |
8 | | * Author: Frank Warmerdam, warmerdam@pobox.com |
9 | | * |
10 | | * The bulk of this code is derived from the utf.c module from FLTK. It |
11 | | * was originally downloaded from: |
12 | | * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c |
13 | | * |
14 | | ********************************************************************** |
15 | | * Copyright (c) 2008, Frank Warmerdam |
16 | | * Copyright 2006 by Bill Spitzak and others. |
17 | | * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com> |
18 | | * |
19 | | * Permission to use, copy, modify, and distribute this software for any |
20 | | * purpose with or without fee is hereby granted, provided that the above |
21 | | * copyright notice and this permission notice appear in all copies. |
22 | | * |
23 | | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
24 | | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
25 | | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
26 | | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
27 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
28 | | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
29 | | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
30 | | **********************************************************************/ |
31 | | |
32 | | #include "cpl_port.h" |
33 | | #include "cpl_string.h" |
34 | | |
35 | | #include <cstring> |
36 | | |
37 | | #include "cpl_conv.h" |
38 | | #include "cpl_error.h" |
39 | | #include "cpl_character_sets.c" |
40 | | |
41 | | static unsigned utf8decode(const char *p, const char *end, int *len); |
42 | | static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst, |
43 | | unsigned dstlen); |
44 | | static unsigned utf8toa(const char *src, unsigned srclen, char *dst, |
45 | | unsigned dstlen); |
46 | | static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src, |
47 | | unsigned srclen); |
48 | | static unsigned utf8froma(char *dst, unsigned dstlen, const char *src, |
49 | | unsigned srclen); |
50 | | static int utf8test(const char *src, unsigned srclen); |
51 | | |
52 | | #ifdef _WIN32 |
53 | | |
54 | | #include <windows.h> |
55 | | #include <winnls.h> |
56 | | |
57 | | static char *CPLWin32Recode(const char *src, unsigned src_code_page, |
58 | | unsigned dst_code_page) CPL_RETURNS_NONNULL; |
59 | | #endif |
60 | | |
61 | | /* used by cpl_recode.cpp */ |
62 | | extern void CPLClearRecodeStubWarningFlags(); |
63 | | extern char *CPLRecodeStub(const char *, const char *, |
64 | | const char *) CPL_RETURNS_NONNULL; |
65 | | extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *, |
66 | | const char *); |
67 | | extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *); |
68 | | |
69 | | /************************************************************************/ |
70 | | /* ==================================================================== */ |
71 | | /* Stub Implementation not depending on iconv() or WIN32 API. */ |
72 | | /* ==================================================================== */ |
73 | | /************************************************************************/ |
74 | | |
// "Already warned" latches for the one-time warnings emitted by this file.
// A code path sets its flag when it reports the warning and stays silent
// afterwards, until CPLClearRecodeStubWarningFlags() re-arms it.
static bool bHaveWarned1 = false;
static bool bHaveWarned2 = false;
static bool bHaveWarned3 = false;
static bool bHaveWarned4 = false;
static bool bHaveWarned5 = false;
static bool bHaveWarned6 = false;

/************************************************************************/
/*                 CPLClearRecodeStubWarningFlags()                     */
/************************************************************************/

/* Re-arm every one-time warning of the stub recoder. */
void CPLClearRecodeStubWarningFlags()
{
    bHaveWarned1 = bHaveWarned2 = bHaveWarned3 = false;
    bHaveWarned4 = bHaveWarned5 = bHaveWarned6 = false;
}
95 | | |
96 | | /************************************************************************/ |
97 | | /* CPLRecodeStub() */ |
98 | | /************************************************************************/ |
99 | | |
100 | | /** |
101 | | * Convert a string from a source encoding to a destination encoding. |
102 | | * |
103 | | * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII |
104 | | * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported : |
105 | | * <ul> |
106 | | * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in |
107 | | * fact)</li> |
108 | | * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li> |
109 | | * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li> |
110 | | * </ul> |
111 | | * |
112 | | * If an error occurs an error may, or may not be posted with CPLError(). |
113 | | * |
114 | | * @param pszSource a NULL terminated string. |
115 | | * @param pszSrcEncoding the source encoding. |
116 | | * @param pszDstEncoding the destination encoding. |
117 | | * |
118 | | * @return a NULL terminated string which should be freed with CPLFree(). |
119 | | */ |
120 | | |
121 | | char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding, |
122 | | const char *pszDstEncoding) |
123 | | |
124 | 0 | { |
125 | | /* -------------------------------------------------------------------- */ |
126 | | /* If the source or destination is current locale(), we change */ |
127 | | /* it to ISO8859-1 since our stub implementation does not */ |
128 | | /* attempt to address locales properly. */ |
129 | | /* -------------------------------------------------------------------- */ |
130 | |
|
131 | 0 | if (pszSrcEncoding[0] == '\0') |
132 | 0 | pszSrcEncoding = CPL_ENC_ISO8859_1; |
133 | |
|
134 | 0 | if (pszDstEncoding[0] == '\0') |
135 | 0 | pszDstEncoding = CPL_ENC_ISO8859_1; |
136 | | |
137 | | /* -------------------------------------------------------------------- */ |
138 | | /* ISO8859 to UTF8 */ |
139 | | /* -------------------------------------------------------------------- */ |
140 | 0 | if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 && |
141 | 0 | strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0) |
142 | 0 | { |
143 | 0 | const int nCharCount = static_cast<int>(strlen(pszSource)); |
144 | 0 | char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1)); |
145 | |
|
146 | 0 | utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount); |
147 | |
|
148 | 0 | return pszResult; |
149 | 0 | } |
150 | | |
151 | | /* -------------------------------------------------------------------- */ |
152 | | /* UTF8 to ISO8859 */ |
153 | | /* -------------------------------------------------------------------- */ |
154 | 0 | if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 && |
155 | 0 | strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0) |
156 | 0 | { |
157 | 0 | int nCharCount = static_cast<int>(strlen(pszSource)); |
158 | 0 | char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1)); |
159 | |
|
160 | 0 | utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1); |
161 | |
|
162 | 0 | return pszResult; |
163 | 0 | } |
164 | | |
165 | | // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables |
166 | 0 | if (EQUAL(pszDstEncoding, CPL_ENC_UTF8)) |
167 | 0 | { |
168 | 0 | const auto pConvTable = CPLGetConversionTableToUTF8(pszSrcEncoding); |
169 | 0 | if (pConvTable) |
170 | 0 | { |
171 | 0 | const auto convTable = *pConvTable; |
172 | 0 | const size_t nCharCount = strlen(pszSource); |
173 | 0 | char *pszResult = |
174 | 0 | static_cast<char *>(CPLCalloc(1, nCharCount * 3 + 1)); |
175 | 0 | size_t iDst = 0; |
176 | 0 | unsigned char *pabyResult = |
177 | 0 | reinterpret_cast<unsigned char *>(pszResult); |
178 | 0 | for (size_t i = 0; i < nCharCount; ++i) |
179 | 0 | { |
180 | 0 | const unsigned char nChar = |
181 | 0 | static_cast<unsigned char>(pszSource[i]); |
182 | 0 | if (nChar <= 127) |
183 | 0 | { |
184 | 0 | pszResult[iDst] = pszSource[i]; |
185 | 0 | ++iDst; |
186 | 0 | } |
187 | 0 | else |
188 | 0 | { |
189 | 0 | const unsigned char nShiftedChar = nChar - 128; |
190 | 0 | if (convTable[nShiftedChar][0]) |
191 | 0 | { |
192 | 0 | pabyResult[iDst] = convTable[nShiftedChar][0]; |
193 | 0 | ++iDst; |
194 | 0 | CPLAssert(convTable[nShiftedChar][1]); |
195 | 0 | pabyResult[iDst] = convTable[nShiftedChar][1]; |
196 | 0 | ++iDst; |
197 | 0 | if (convTable[nShiftedChar][2]) |
198 | 0 | { |
199 | 0 | pabyResult[iDst] = convTable[nShiftedChar][2]; |
200 | 0 | ++iDst; |
201 | 0 | } |
202 | 0 | } |
203 | 0 | else |
204 | 0 | { |
205 | | // Skip the invalid sequence in the input string. |
206 | 0 | if (!bHaveWarned2) |
207 | 0 | { |
208 | 0 | bHaveWarned2 = true; |
209 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
210 | 0 | "One or several characters couldn't be " |
211 | 0 | "converted correctly from %s to %s. " |
212 | 0 | "This warning will not be emitted anymore", |
213 | 0 | pszSrcEncoding, pszDstEncoding); |
214 | 0 | } |
215 | 0 | } |
216 | 0 | } |
217 | 0 | } |
218 | | |
219 | 0 | pszResult[iDst] = 0; |
220 | 0 | return pszResult; |
221 | 0 | } |
222 | 0 | } |
223 | | |
224 | | #ifdef _WIN32 |
225 | | const auto MapEncodingToWindowsCodePage = [](const char *pszEncoding) |
226 | | { |
227 | | // Cf https://learn.microsoft.com/fr-fr/windows/win32/intl/code-page-identifiers |
228 | | if (STARTS_WITH(pszEncoding, "CP")) |
229 | | { |
230 | | const int nCode = atoi(pszEncoding + strlen("CP")); |
231 | | if (nCode > 0) |
232 | | return nCode; |
233 | | else if (EQUAL(pszEncoding, "CP_OEMCP")) |
234 | | return CP_OEMCP; |
235 | | else if (EQUAL(pszEncoding, "CP_ACP")) |
236 | | return CP_ACP; |
237 | | } |
238 | | else if (STARTS_WITH(pszEncoding, "WINDOWS-")) |
239 | | { |
240 | | const int nCode = atoi(pszEncoding + strlen("WINDOWS-")); |
241 | | if (nCode > 0) |
242 | | return nCode; |
243 | | } |
244 | | else if (STARTS_WITH(pszEncoding, "ISO-8859-")) |
245 | | { |
246 | | const int nCode = atoi(pszEncoding + strlen("ISO-8859-")); |
247 | | if ((nCode >= 1 && nCode <= 9) || nCode == 13 || nCode == 15) |
248 | | return 28590 + nCode; |
249 | | } |
250 | | |
251 | | // Return a negative value, since CP_ACP = 0 |
252 | | return -1; |
253 | | }; |
254 | | |
255 | | /* ---------------------------------------------------------------------*/ |
256 | | /* XXX to UTF8 */ |
257 | | /* ---------------------------------------------------------------------*/ |
258 | | if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0) |
259 | | { |
260 | | const int nCode = MapEncodingToWindowsCodePage(pszSrcEncoding); |
261 | | if (nCode >= 0) |
262 | | { |
263 | | return CPLWin32Recode(pszSource, nCode, CP_UTF8); |
264 | | } |
265 | | } |
266 | | |
267 | | /* ---------------------------------------------------------------------*/ |
268 | | /* UTF8 to XXX */ |
269 | | /* ---------------------------------------------------------------------*/ |
270 | | if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0) |
271 | | { |
272 | | const int nCode = MapEncodingToWindowsCodePage(pszDstEncoding); |
273 | | if (nCode >= 0) |
274 | | { |
275 | | return CPLWin32Recode(pszSource, CP_UTF8, nCode); |
276 | | } |
277 | | } |
278 | | #endif |
279 | | |
280 | | /* -------------------------------------------------------------------- */ |
281 | | /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */ |
282 | | /* a one-time warning. */ |
283 | | /* -------------------------------------------------------------------- */ |
284 | 0 | if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0) |
285 | 0 | { |
286 | 0 | const int nCharCount = static_cast<int>(strlen(pszSource)); |
287 | 0 | char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1)); |
288 | |
|
289 | 0 | if (!bHaveWarned1) |
290 | 0 | { |
291 | 0 | bHaveWarned1 = true; |
292 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
293 | 0 | "Recode from %s to UTF-8 not supported, " |
294 | 0 | "treated as ISO-8859-1 to UTF-8.", |
295 | 0 | pszSrcEncoding); |
296 | 0 | } |
297 | |
|
298 | 0 | utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount); |
299 | |
|
300 | 0 | return pszResult; |
301 | 0 | } |
302 | | |
303 | | /* -------------------------------------------------------------------- */ |
304 | | /* Everything else is treated as a no-op with a warning. */ |
305 | | /* -------------------------------------------------------------------- */ |
306 | 0 | { |
307 | 0 | if (!bHaveWarned3) |
308 | 0 | { |
309 | 0 | bHaveWarned3 = true; |
310 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
311 | 0 | "Recode from %s to %s not supported, no change applied.", |
312 | 0 | pszSrcEncoding, pszDstEncoding); |
313 | 0 | } |
314 | |
|
315 | 0 | return CPLStrdup(pszSource); |
316 | 0 | } |
317 | 0 | } |
318 | | |
319 | | /************************************************************************/ |
320 | | /* CPLRecodeFromWCharStub() */ |
321 | | /************************************************************************/ |
322 | | |
323 | | /** |
324 | | * Convert wchar_t string to UTF-8. |
325 | | * |
326 | | * Convert a wchar_t string into a multibyte utf-8 string. The only |
327 | | * guaranteed supported source encoding is CPL_ENC_UCS2, and the only |
328 | | * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII |
329 | | * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings |
330 | | * may also be supported. |
331 | | * |
332 | | * Note that the wchar_t type varies in size on different systems. On |
333 | | * win32 it is normally 2 bytes, and on unix 4 bytes. |
334 | | * |
335 | | * If an error occurs an error may, or may not be posted with CPLError(). |
336 | | * |
337 | | * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t. |
338 | | * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2. |
339 | | * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8. |
340 | | * |
341 | | * @return a zero terminated multi-byte string which should be freed with |
342 | | * CPLFree(), or NULL if an error occurs. |
343 | | */ |
344 | | |
345 | | char *CPLRecodeFromWCharStub(const wchar_t *pwszSource, |
346 | | const char *pszSrcEncoding, |
347 | | const char *pszDstEncoding) |
348 | | |
349 | 0 | { |
350 | | /* -------------------------------------------------------------------- */ |
351 | | /* We try to avoid changes of character set. We are just */ |
352 | | /* providing for unicode to unicode. */ |
353 | | /* -------------------------------------------------------------------- */ |
354 | 0 | if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 && |
355 | 0 | strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 && |
356 | 0 | strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 && |
357 | 0 | strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 && |
358 | 0 | strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0) |
359 | 0 | { |
360 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
361 | 0 | "Stub recoding implementation does not support " |
362 | 0 | "CPLRecodeFromWCharStub(...,%s,%s)", |
363 | 0 | pszSrcEncoding, pszDstEncoding); |
364 | 0 | return nullptr; |
365 | 0 | } |
366 | | |
367 | | /* -------------------------------------------------------------------- */ |
368 | | /* What is the source length. */ |
369 | | /* -------------------------------------------------------------------- */ |
370 | 0 | int nSrcLen = 0; |
371 | |
|
372 | 0 | while (pwszSource[nSrcLen] != 0) |
373 | 0 | nSrcLen++; |
374 | | |
375 | | /* -------------------------------------------------------------------- */ |
376 | | /* Allocate destination buffer plenty big. */ |
377 | | /* -------------------------------------------------------------------- */ |
378 | 0 | const int nDstBufSize = nSrcLen * 4 + 1; |
379 | | // Nearly worst case. |
380 | 0 | char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize)); |
381 | |
|
382 | 0 | if (nSrcLen == 0) |
383 | 0 | { |
384 | 0 | pszResult[0] = '\0'; |
385 | 0 | return pszResult; |
386 | 0 | } |
387 | | |
388 | | /* -------------------------------------------------------------------- */ |
389 | | /* Convert, and confirm we had enough space. */ |
390 | | /* -------------------------------------------------------------------- */ |
391 | 0 | const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen); |
392 | 0 | if (nDstLen >= nDstBufSize) |
393 | 0 | { |
394 | 0 | CPLAssert(false); // too small! |
395 | 0 | return nullptr; |
396 | 0 | } |
397 | | |
398 | | /* -------------------------------------------------------------------- */ |
399 | | /* If something other than UTF-8 was requested, recode now. */ |
400 | | /* -------------------------------------------------------------------- */ |
401 | 0 | if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0) |
402 | 0 | return pszResult; |
403 | | |
404 | 0 | char *pszFinalResult = |
405 | 0 | CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding); |
406 | |
|
407 | 0 | CPLFree(pszResult); |
408 | |
|
409 | 0 | return pszFinalResult; |
410 | 0 | } |
411 | | |
412 | | /************************************************************************/ |
413 | | /* CPLRecodeToWCharStub() */ |
414 | | /************************************************************************/ |
415 | | |
416 | | /** |
417 | | * Convert UTF-8 string to a wchar_t string. |
418 | | * |
419 | | * Convert a 8bit, multi-byte per character input string into a wide |
420 | | * character (wchar_t) string. The only guaranteed supported source encodings |
421 | | * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only |
422 | | * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source |
423 | | * and destination encodings may be supported depending on the underlying |
424 | | * implementation. |
425 | | * |
426 | | * Note that the wchar_t type varies in size on different systems. On |
427 | | * win32 it is normally 2 bytes, and on unix 4 bytes. |
428 | | * |
429 | | * If an error occurs an error may, or may not be posted with CPLError(). |
430 | | * |
431 | | * @param pszSource input multi-byte character string. |
432 | | * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8. |
433 | | * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. |
434 | | * |
435 | | * @return the zero terminated wchar_t string (to be freed with CPLFree()) or |
436 | | * NULL on error. |
437 | | * |
438 | | * @since GDAL 1.6.0 |
439 | | */ |
440 | | |
441 | | wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding, |
442 | | const char *pszDstEncoding) |
443 | | |
444 | 0 | { |
445 | 0 | char *pszUTF8Source = const_cast<char *>(pszSource); |
446 | |
|
447 | 0 | if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 && |
448 | 0 | strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0) |
449 | 0 | { |
450 | 0 | pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8); |
451 | 0 | if (pszUTF8Source == nullptr) |
452 | 0 | return nullptr; |
453 | 0 | } |
454 | | |
455 | | /* -------------------------------------------------------------------- */ |
456 | | /* We try to avoid changes of character set. We are just */ |
457 | | /* providing for unicode to unicode. */ |
458 | | /* -------------------------------------------------------------------- */ |
459 | 0 | if (strcmp(pszDstEncoding, "WCHAR_T") != 0 && |
460 | 0 | strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 && |
461 | 0 | strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 && |
462 | 0 | strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0) |
463 | 0 | { |
464 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
465 | 0 | "Stub recoding implementation does not support " |
466 | 0 | "CPLRecodeToWCharStub(...,%s,%s)", |
467 | 0 | pszSrcEncoding, pszDstEncoding); |
468 | 0 | if (pszUTF8Source != pszSource) |
469 | 0 | CPLFree(pszUTF8Source); |
470 | 0 | return nullptr; |
471 | 0 | } |
472 | | |
473 | | /* -------------------------------------------------------------------- */ |
474 | | /* Do the UTF-8 to UCS-2 recoding. */ |
475 | | /* -------------------------------------------------------------------- */ |
476 | 0 | int nSrcLen = static_cast<int>(strlen(pszUTF8Source)); |
477 | 0 | wchar_t *pwszResult = |
478 | 0 | static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1)); |
479 | |
|
480 | 0 | utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1); |
481 | |
|
482 | 0 | if (pszUTF8Source != pszSource) |
483 | 0 | CPLFree(pszUTF8Source); |
484 | |
|
485 | 0 | return pwszResult; |
486 | 0 | } |
487 | | |
488 | | /************************************************************************/ |
489 | | /* CPLIsUTF8() */ |
490 | | /************************************************************************/ |
491 | | |
492 | | /** |
493 | | * Test if a string is encoded as UTF-8. |
494 | | * |
495 | | * @param pabyData input string to test |
496 | | * @param nLen length of the input string, or -1 if the function must compute |
497 | | * the string length. In which case it must be null terminated. |
498 | | * @return TRUE if the string is encoded as UTF-8. FALSE otherwise |
499 | | * |
500 | | * @since GDAL 1.7.0 |
501 | | */ |
502 | | int CPLIsUTF8(const char *pabyData, int nLen) |
503 | 0 | { |
504 | 0 | if (nLen < 0) |
505 | 0 | nLen = static_cast<int>(strlen(pabyData)); |
506 | 0 | return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0; |
507 | 0 | } |
508 | | |
509 | | /************************************************************************/ |
510 | | /* ==================================================================== */ |
511 | | /* UTF.C code from FLTK with some modifications. */ |
512 | | /* ==================================================================== */ |
513 | | /************************************************************************/ |
514 | | |
515 | | /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero |
516 | | they are instead turned into the Unicode REPLACEMENT CHARACTER, of |
517 | | value 0xfffd. |
518 | | If this is on utf8decode will correctly map most (perhaps all) |
519 | | human-readable text that is in ISO-8859-1. This may allow you |
520 | | to completely ignore character sets in your code because virtually |
521 | | everything is either ISO-8859-1 or UTF-8. |
522 | | */ |
523 | | #define ERRORS_TO_ISO8859_1 1 |
524 | | |
525 | | /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the |
526 | | Unicode index for Microsoft's CP1252 character set. You should |
527 | | also set ERRORS_TO_ISO8859_1. With this a huge amount of more |
528 | | available text (such as all web pages) are correctly converted |
529 | | to Unicode. |
530 | | */ |
531 | | #define ERRORS_TO_CP1252 1 |
532 | | |
533 | | /* A number of Unicode code points are in fact illegal and should not |
534 | | be produced by a UTF-8 converter. Turning this on will replace the |
535 | | bytes in those encodings with errors. If you do this then converting |
536 | | arbitrary 16-bit data to UTF-8 and then back is not an identity, |
537 | | which will probably break a lot of software. |
538 | | */ |
539 | | #define STRICT_RFC3629 0 |
540 | | |
541 | | #if ERRORS_TO_CP1252 |
542 | | // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated |
543 | | // to Unicode: |
// Indexed by (byte - 0x80); each row below covers four consecutive bytes.
constexpr unsigned short cp1252[32] = {
    0x20ac, 0x0081, 0x201a, 0x0192,  // 0x80 - 0x83
    0x201e, 0x2026, 0x2020, 0x2021,  // 0x84 - 0x87
    0x02c6, 0x2030, 0x0160, 0x2039,  // 0x88 - 0x8b
    0x0152, 0x008d, 0x017d, 0x008f,  // 0x8c - 0x8f
    0x0090, 0x2018, 0x2019, 0x201c,  // 0x90 - 0x93
    0x201d, 0x2022, 0x2013, 0x2014,  // 0x94 - 0x97
    0x02dc, 0x2122, 0x0161, 0x203a,  // 0x98 - 0x9b
    0x0153, 0x009d, 0x017e, 0x0178   // 0x9c - 0x9f
};
549 | | #endif |
550 | | |
551 | | /************************************************************************/ |
552 | | /* utf8decode() */ |
553 | | /************************************************************************/ |
554 | | |
555 | | /* |
556 | | Decode a single UTF-8 encoded character starting at \e p. The |
557 | | resulting Unicode value (in the range 0-0x10ffff) is returned, |
558 | | and \e len is set to the number of bytes in the UTF-8 encoding |
559 | | (adding \e len to \e p will point at the next character). |
560 | | |
561 | | If \a p points at an illegal UTF-8 encoding, including one that |
562 | | would go past \e end, or where a code uses more bytes than |
563 | | necessary, then *reinterpret_cast<const unsigned char*>(p) is translated as |
564 | | though it is in the Microsoft CP1252 character set and \e len is set to 1. |
565 | | Treating errors this way allows this to decode almost any |
566 | | ISO-8859-1 or CP1252 text that has been mistakenly placed where |
567 | | UTF-8 is expected, and has proven very useful. |
568 | | |
569 | | If you want errors to be converted to error characters (as the |
570 | | standards recommend), adding a test to see if the length is |
571 | | unexpectedly 1 will work: |
572 | | |
573 | | \code |
574 | | if( *p & 0x80 ) |
575 | | { // What should be a multibyte encoding. |
576 | | code = utf8decode(p, end, &len); |
577 | | if( len<2 ) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER. |
578 | | } |
579 | | else |
580 | | { // Handle the 1-byte utf8 encoding: |
581 | | code = *p; |
582 | | len = 1; |
583 | | } |
584 | | \endcode |
585 | | |
586 | | Direct testing for the 1-byte case (as shown above) will also |
587 | | speed up the scanning of strings where the majority of characters |
588 | | are ASCII. |
589 | | */ |
static unsigned utf8decode(const char *p, const char *end, int *len)
{
    // Unsigned view of the input so byte values compare as 0x00..0xff.
    const unsigned char *const pabyIn =
        reinterpret_cast<const unsigned char *>(p);
    const unsigned char nLead = pabyIn[0];

    // Value returned when the lead byte does not start a valid sequence.
#if ERRORS_TO_ISO8859_1
    const unsigned nInvalid = nLead;
#else
    const unsigned nInvalid = 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif

    // Every early exit below consumes a single byte; the multi-byte
    // successes overwrite this just before returning.
    *len = 1;

    // Plain ASCII.
    if (nLead < 0x80)
        return nLead;

#if ERRORS_TO_CP1252
    // A lone byte in 0x80..0x9f is read as CP1252.
    if (nLead < 0xa0)
        return cp1252[nLead - 0x80];
#endif

    // Bytes below 0xc2 cannot start a sequence (continuation bytes and the
    // overlong leads 0xc0/0xc1); leads above 0xf4 would exceed 0x10ffff.
    if (nLead < 0xc2 || nLead > 0xf4)
        return nInvalid;

    // All remaining leads need a continuation byte right after them.
    if (p + 1 >= end || (pabyIn[1] & 0xc0) != 0x80)
        return nInvalid;

    // Two-byte sequence: 0xc2..0xdf.
    if (nLead < 0xe0)
    {
        *len = 2;
        return ((nLead & 0x1f) << 6) + (pabyIn[1] & 0x3f);
    }

    // Three-byte sequence: 0xe0..0xef.
    if (nLead < 0xf0)
    {
        // 0xe0 followed by a byte below 0xa0 is an overlong encoding.
        if (nLead == 0xe0 && pabyIn[1] < 0xa0)
            return nInvalid;
#if STRICT_RFC3629
        // RFC 3629 says surrogate chars are illegal.
        if (nLead == 0xed && pabyIn[1] >= 0xa0)
            return nInvalid;
        // 0xfffe and 0xffff are also illegal characters.
        if (nLead == 0xef && pabyIn[1] == 0xbf && pabyIn[2] >= 0xbe)
            return nInvalid;
#endif
        if (p + 2 >= end || (pabyIn[2] & 0xc0) != 0x80)
            return nInvalid;

        *len = 3;
        return ((nLead & 0x0f) << 12) + ((pabyIn[1] & 0x3f) << 6) +
               (pabyIn[2] & 0x3f);
    }

    // Four-byte sequence: 0xf0..0xf4.
    // 0xf0 followed by a byte below 0x90 is an overlong encoding.
    if (nLead == 0xf0 && pabyIn[1] < 0x90)
        return nInvalid;
    // 0xf4 followed by a byte above 0x8f lies after 0x10ffff.
    if (nLead == 0xf4 && pabyIn[1] > 0x8f)
        return nInvalid;
    if (p + 3 >= end || (pabyIn[2] & 0xc0) != 0x80 ||
        (pabyIn[3] & 0xc0) != 0x80)
        return nInvalid;
#if STRICT_RFC3629
    // RFC 3629 says all codes ending in fffe or ffff are illegal:
    if ((pabyIn[1] & 0xf) == 0xf && pabyIn[2] == 0xbf && pabyIn[3] >= 0xbe)
        return nInvalid;
#endif

    *len = 4;
    return ((nLead & 0x07) << 18) + ((pabyIn[1] & 0x3f) << 12) +
           ((pabyIn[2] & 0x3f) << 6) + (pabyIn[3] & 0x3f);
}
686 | | |
687 | | /************************************************************************/ |
688 | | /* utf8towc() */ |
689 | | /************************************************************************/ |
690 | | |
691 | | /* Convert a UTF-8 sequence into an array of wchar_t. These |
692 | | are used by some system calls, especially on Windows. |
693 | | |
694 | | \a src points at the UTF-8, and \a srclen is the number of bytes to |
695 | | convert. |
696 | | |
697 | | \a dst points at an array to write, and \a dstlen is the number of |
698 | | locations in this array. At most \a dstlen-1 words will be |
699 | | written there, plus a 0 terminating word. Thus this function |
700 | | will never overwrite the buffer and will always return a |
701 | | zero-terminated string. If \a dstlen is zero then \a dst can be |
702 | | null and no data is written, but the length is returned. |
703 | | |
704 | | The return value is the number of words that \e would be written |
705 | | to \a dst if it were long enough, not counting the terminating |
706 | | zero. If the return value is greater or equal to \a dstlen it |
707 | | indicates truncation, you can then allocate a new array of size |
708 | | return+1 and call this again. |
709 | | |
710 | | Errors in the UTF-8 are converted as though each byte in the |
711 | | erroneous string is in the Microsoft CP1252 encoding. This allows |
712 | | ISO-8859-1 text mistakenly identified as UTF-8 to be printed |
713 | | correctly. |
714 | | |
715 | | Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux |
716 | | and most other systems. Where wchar_t is 16 bits, Unicode |
717 | | characters in the range 0x10000 to 0x10ffff are converted to |
718 | | "surrogate pairs" which take two words each (this is called UTF-16 |
719 | | encoding). If wchar_t is 32 bits this rather nasty problem is |
720 | | avoided. |
721 | | */ |
722 | | static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst, |
723 | | unsigned dstlen) |
724 | 0 | { |
725 | 0 | const char *p = src; |
726 | 0 | const char *e = src + srclen; |
727 | 0 | unsigned count = 0; |
728 | 0 | if (dstlen) |
729 | 0 | while (true) |
730 | 0 | { |
731 | 0 | if (p >= e) |
732 | 0 | { |
733 | 0 | dst[count] = 0; |
734 | 0 | return count; |
735 | 0 | } |
736 | 0 | if (!(*p & 0x80)) |
737 | 0 | { |
738 | | // ASCII |
739 | 0 | dst[count] = *p++; |
740 | 0 | } |
741 | 0 | else |
742 | 0 | { |
743 | 0 | int len = 0; |
744 | 0 | unsigned ucs = utf8decode(p, e, &len); |
745 | 0 | p += len; |
746 | | #ifdef _WIN32 |
747 | | if (ucs < 0x10000) |
748 | | { |
749 | | dst[count] = static_cast<wchar_t>(ucs); |
750 | | } |
751 | | else |
752 | | { |
753 | | // Make a surrogate pair: |
754 | | if (count + 2 >= dstlen) |
755 | | { |
756 | | dst[count] = 0; |
757 | | count += 2; |
758 | | break; |
759 | | } |
760 | | dst[count] = static_cast<wchar_t>( |
761 | | (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800); |
762 | | dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00); |
763 | | } |
764 | | #else |
765 | 0 | dst[count] = static_cast<wchar_t>(ucs); |
766 | 0 | #endif |
767 | 0 | } |
768 | 0 | if (++count == dstlen) |
769 | 0 | { |
770 | 0 | dst[count - 1] = 0; |
771 | 0 | break; |
772 | 0 | } |
773 | 0 | } |
774 | | // We filled dst, measure the rest: |
775 | 0 | while (p < e) |
776 | 0 | { |
777 | 0 | if (!(*p & 0x80)) |
778 | 0 | { |
779 | 0 | p++; |
780 | 0 | } |
781 | 0 | else |
782 | 0 | { |
783 | 0 | int len = 0; |
784 | | #ifdef _WIN32 |
785 | | const unsigned ucs = utf8decode(p, e, &len); |
786 | | p += len; |
787 | | if (ucs >= 0x10000) |
788 | | ++count; |
789 | | #else |
790 | 0 | utf8decode(p, e, &len); |
791 | 0 | p += len; |
792 | 0 | #endif |
793 | 0 | } |
794 | 0 | ++count; |
795 | 0 | } |
796 | |
|
797 | 0 | return count; |
798 | 0 | } |
799 | | |
800 | | /************************************************************************/ |
801 | | /* utf8toa() */ |
802 | | /************************************************************************/ |
803 | | /* Convert a UTF-8 sequence into an array of 1-byte characters. |
804 | | |
805 | | If the UTF-8 decodes to a character greater than 0xff then it is |
806 | | replaced with '?'. |
807 | | |
808 | | Errors in the UTF-8 are converted as individual bytes, same as |
809 | | utf8decode() does. This allows ISO-8859-1 text mistakenly identified |
810 | | as UTF-8 to be printed correctly (and possibly CP1512 on Windows). |
811 | | |
812 | | \a src points at the UTF-8, and \a srclen is the number of bytes to |
813 | | convert. |
814 | | |
815 | | Up to \a dstlen bytes are written to \a dst, including a null |
816 | | terminator. The return value is the number of bytes that would be |
817 | | written, not counting the null terminator. If greater or equal to |
818 | | \a dstlen then if you malloc a new array of size n+1 you will have |
819 | | the space needed for the entire string. If \a dstlen is zero then |
820 | | nothing is written and this call just measures the storage space |
821 | | needed. |
822 | | */ |
823 | | static unsigned int utf8toa(const char *src, unsigned srclen, char *dst, |
824 | | unsigned dstlen) |
825 | 0 | { |
826 | 0 | const char *p = src; |
827 | 0 | const char *e = src + srclen; |
828 | 0 | unsigned int count = 0; |
829 | 0 | if (dstlen) |
830 | 0 | while (true) |
831 | 0 | { |
832 | 0 | if (p >= e) |
833 | 0 | { |
834 | 0 | dst[count] = 0; |
835 | 0 | return count; |
836 | 0 | } |
837 | 0 | unsigned char c = *reinterpret_cast<const unsigned char *>(p); |
838 | 0 | if (c < 0xC2) |
839 | 0 | { |
840 | | // ASCII or bad code. |
841 | 0 | dst[count] = c; |
842 | 0 | p++; |
843 | 0 | } |
844 | 0 | else |
845 | 0 | { |
846 | 0 | int len = 0; |
847 | 0 | const unsigned int ucs = utf8decode(p, e, &len); |
848 | 0 | p += len; |
849 | 0 | if (ucs < 0x100) |
850 | 0 | { |
851 | 0 | dst[count] = static_cast<char>(ucs); |
852 | 0 | } |
853 | 0 | else |
854 | 0 | { |
855 | 0 | if (!bHaveWarned4) |
856 | 0 | { |
857 | 0 | bHaveWarned4 = true; |
858 | 0 | CPLError( |
859 | 0 | CE_Warning, CPLE_AppDefined, |
860 | 0 | "One or several characters couldn't be converted " |
861 | 0 | "correctly from UTF-8 to ISO-8859-1. " |
862 | 0 | "This warning will not be emitted anymore."); |
863 | 0 | } |
864 | 0 | dst[count] = '?'; |
865 | 0 | } |
866 | 0 | } |
867 | 0 | if (++count >= dstlen) |
868 | 0 | { |
869 | 0 | dst[count - 1] = 0; |
870 | 0 | break; |
871 | 0 | } |
872 | 0 | } |
873 | | // We filled dst, measure the rest: |
874 | 0 | while (p < e) |
875 | 0 | { |
876 | 0 | if (!(*p & 0x80)) |
877 | 0 | { |
878 | 0 | p++; |
879 | 0 | } |
880 | 0 | else |
881 | 0 | { |
882 | 0 | int len = 0; |
883 | 0 | utf8decode(p, e, &len); |
884 | 0 | p += len; |
885 | 0 | } |
886 | 0 | ++count; |
887 | 0 | } |
888 | 0 | return count; |
889 | 0 | } |
890 | | |
/************************************************************************/
/*                          utf8fromwcNext()                            */
/************************************************************************/
/* Fetch the next character from \a src for utf8fromwc().

    Reads the word at index \a *pi and advances \a *pi past everything it
    consumed. The code point to encode is stored in \a *pucs and the
    number of UTF-8 bytes it needs (1 to 4) is returned.

    On Windows a high surrogate immediately followed by a low surrogate is
    combined into a single code point (4 bytes); a mismatched half is
    returned as is (3 bytes). Elsewhere, values above 0x10ffff are
    replaced by 0xFFFD (3 bytes).

    The caller must guarantee *pi < srclen on entry.
*/
static unsigned int utf8fromwcNext(const wchar_t *src, unsigned srclen,
                                   unsigned int *pi, unsigned int *pucs)
{
    unsigned int ucs = src[(*pi)++];
    unsigned int nBytes = 3;  // Default: everything not matched below.
    if (ucs < 0x80U)
    {
        nBytes = 1;
    }
    else if (ucs < 0x800U)
    {
        nBytes = 2;
    }
#ifdef _WIN32
    else if (ucs >= 0xd800 && ucs <= 0xdbff && *pi < srclen &&
             src[*pi] >= 0xdc00 && src[*pi] <= 0xdfff)
    {
        // Surrogate pair: *pi already points at the low half.
        const unsigned int ucs2 = src[(*pi)++];
        ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
        nBytes = 4;
    }
#else
    else if (ucs >= 0x10000)
    {
        if (ucs > 0x10ffff)
            ucs = 0xfffd;  // Out of range: REPLACEMENT CHARACTER, 3 bytes.
        else
            nBytes = 4;
    }
    (void)srclen;
#endif
    *pucs = ucs;
    return nBytes;
}

/************************************************************************/
/*                             utf8fromwc()                             */
/************************************************************************/
/* Turn "wide characters" as returned by some system calls
    (especially on Windows) into UTF-8.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If greater or equal to
    \a dstlen then if you malloc a new array of size n+1 you will have
    the space needed for the entire string. If \a dstlen is zero then
    nothing is written and this call just measures the storage space
    needed.

    \a srclen is the number of words in \a src to convert. On Windows
    this is not necessarily the number of characters, because "surrogate
    pairs" are merged into one character encoded as 4 bytes; mismatched
    halves are encoded as individual 3-byte characters.

    Where surrogates are not combined (non-Windows builds), a word greater
    than 0x10ffff is encoded as 0xFFFD (REPLACEMENT CHARACTER). Values in
    the range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff, are
    encoded as though they were legal so that utf8towc() gives back the
    original data.

    A character is only stored when it fits completely together with the
    terminator; the writing and the measuring phases both go through
    utf8fromwcNext(), so the measured size always matches what a large
    enough buffer would receive.
*/
static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
                               unsigned srclen)
{
    unsigned int i = 0;
    unsigned int count = 0;
    // Cleared as soon as dst is full; from then on we only measure.
    bool bWriting = dstlen != 0;

    while (i < srclen)
    {
        unsigned int ucs = 0;
        const unsigned int nBytes = utf8fromwcNext(src, srclen, &i, &ucs);

        if (bWriting && nBytes == 1)
        {
            dst[count] = static_cast<char>(ucs);
            if (count + 1 >= dstlen)
            {
                // That byte took the terminator's slot: give it back.
                dst[count] = 0;
                bWriting = false;
            }
        }
        else if (bWriting)
        {
            if (count + nBytes >= dstlen)
            {
                // No room for the whole sequence plus the terminator.
                dst[count] = 0;
                bWriting = false;
            }
            else
            {
                char *out = dst + count;
                if (nBytes == 2)
                {
                    out[0] = static_cast<char>(0xc0 | (ucs >> 6));
                }
                else if (nBytes == 3)
                {
                    out[0] = static_cast<char>(0xe0 | (ucs >> 12));
                    out[1] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                }
                else
                {
                    out[0] = static_cast<char>(0xf0 | (ucs >> 18));
                    out[1] = static_cast<char>(0x80 | ((ucs >> 12) & 0x3F));
                    out[2] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                }
                // The final continuation byte is common to all lengths.
                out[nBytes - 1] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
        }

        // Counted whether or not it was stored.
        count += nBytes;
    }

    if (bWriting)
        dst[count] = 0;
    return count;
}
1036 | | |
/************************************************************************/
/*                             utf8froma()                              */
/************************************************************************/

/* Expand an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.

    Bytes below 0x80 are copied unchanged; every other byte becomes a
    two-byte UTF-8 sequence. Microsoft's CP1252 is not handled: the codes
    in the range 0x80-0x9f are not remapped to the characters CP1252
    assigns to them.

    \a srclen is the number of bytes in \a src to convert.

    At most \a dstlen bytes, terminating NUL included, are stored in
    \a dst, and a two-byte sequence is only stored when it fits entirely.
    The return value is the number of bytes the complete result needs,
    not counting the terminator; when it is greater than or equal to
    \a dstlen, a buffer of return+1 bytes will hold the whole string. If
    \a dstlen is zero nothing is written and the call only measures.

    A return value equal to \a srclen means the input was pure ASCII and
    needed no conversion.
*/
static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
                          unsigned srclen)
{
    const unsigned char *pabyIn =
        reinterpret_cast<const unsigned char *>(src);
    unsigned iIn = 0;
    unsigned nOut = 0;

    // Storing pass (skipped when dstlen is zero).
    bool bStoring = dstlen != 0;
    while (bStoring)
    {
        if (iIn >= srclen)
        {
            // Everything fit: terminate and report the length.
            dst[nOut] = 0;
            return nOut;
        }

        const unsigned char ch = pabyIn[iIn];
        ++iIn;
        if (ch >= 0x80U)
        {
            // 2 bytes (note that CP1252 translate could make 3 bytes!)
            if (nOut + 2 >= dstlen)
            {
                // No room for both bytes plus the terminator.
                dst[nOut] = 0;
                bStoring = false;
            }
            else
            {
                dst[nOut] = static_cast<char>(0xc0 | (ch >> 6));
                dst[nOut + 1] = static_cast<char>(0x80 | (ch & 0x3F));
            }
            nOut += 2;
        }
        else
        {
            dst[nOut] = static_cast<char>(ch);
            ++nOut;
            if (nOut >= dstlen)
            {
                // That byte took the terminator's slot: give it back.
                dst[nOut - 1] = 0;
                bStoring = false;
            }
        }
    }

    // Measuring pass over whatever input is left.
    for (; iIn < srclen; ++iIn)
        nOut += (pabyIn[iIn] < 0x80U) ? 1 : 2;

    return nOut;
}
1117 | | |
1118 | | #ifdef _WIN32 |
1119 | | |
1120 | | /************************************************************************/ |
1121 | | /* CPLWin32Recode() */ |
1122 | | /************************************************************************/ |
1123 | | |
1124 | | /* Convert an CODEPAGE (i.e. normal c-string) byte stream |
1125 | | to another CODEPAGE (i.e. normal c-string) byte stream. |
1126 | | |
1127 | | \a src is target c-string byte stream (including a null terminator). |
1128 | | \a src_code_page is target c-string byte code page. |
1129 | | \a dst_code_page is destination c-string byte code page. |
1130 | | |
1131 | | UTF7 65000 |
1132 | | UTF8 65001 |
1133 | | OEM-US 437 |
1134 | | OEM-ALABIC 720 |
1135 | | OEM-GREEK 737 |
1136 | | OEM-BALTIC 775 |
1137 | | OEM-MLATIN1 850 |
1138 | | OEM-LATIN2 852 |
1139 | | OEM-CYRILLIC 855 |
1140 | | OEM-TURKISH 857 |
1141 | | OEM-MLATIN1P 858 |
1142 | | OEM-HEBREW 862 |
1143 | | OEM-RUSSIAN 866 |
1144 | | |
1145 | | THAI 874 |
1146 | | SJIS 932 |
1147 | | GBK 936 |
1148 | | KOREA 949 |
1149 | | BIG5 950 |
1150 | | |
1151 | | EUROPE 1250 |
1152 | | CYRILLIC 1251 |
1153 | | LATIN1 1252 |
1154 | | GREEK 1253 |
1155 | | TURKISH 1254 |
1156 | | HEBREW 1255 |
1157 | | ARABIC 1256 |
1158 | | BALTIC 1257 |
1159 | | VIETNAM 1258 |
1160 | | |
1161 | | ISO-LATIN1 28591 |
1162 | | ISO-LATIN2 28592 |
1163 | | ISO-LATIN3 28593 |
1164 | | ISO-BALTIC 28594 |
1165 | | ISO-CYRILLIC 28595 |
1166 | | ISO-ARABIC 28596 |
1167 | | ISO-HEBREW 28598 |
1168 | | ISO-TURKISH 28599 |
1169 | | ISO-LATIN9 28605 |
1170 | | |
1171 | | ISO-2022-JP 50220 |
1172 | | |
1173 | | */ |
1174 | | |
1175 | | char *CPLWin32Recode(const char *src, unsigned src_code_page, |
1176 | | unsigned dst_code_page) |
1177 | | { |
1178 | | // Convert from source code page to Unicode. |
1179 | | |
1180 | | // Compute the length in wide characters. |
1181 | | int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1, |
1182 | | nullptr, 0); |
1183 | | if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION) |
1184 | | { |
1185 | | if (!bHaveWarned5) |
1186 | | { |
1187 | | bHaveWarned5 = true; |
1188 | | CPLError( |
1189 | | CE_Warning, CPLE_AppDefined, |
1190 | | "One or several characters could not be translated from CP%d. " |
1191 | | "This warning will not be emitted anymore.", |
1192 | | src_code_page); |
1193 | | } |
1194 | | |
1195 | | // Retry now without MB_ERR_INVALID_CHARS flag. |
1196 | | wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0); |
1197 | | } |
1198 | | |
1199 | | // Do the actual conversion. |
1200 | | wchar_t *tbuf = |
1201 | | static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1)); |
1202 | | tbuf[wlen] = 0; |
1203 | | MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1); |
1204 | | |
1205 | | // Convert from Unicode to destination code page. |
1206 | | |
1207 | | // Compute the length in chars. |
1208 | | BOOL bUsedDefaultChar = FALSE; |
1209 | | int len = 0; |
1210 | | if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8) |
1211 | | len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0, |
1212 | | nullptr, nullptr); |
1213 | | else |
1214 | | len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0, |
1215 | | nullptr, &bUsedDefaultChar); |
1216 | | if (bUsedDefaultChar) |
1217 | | { |
1218 | | if (!bHaveWarned6) |
1219 | | { |
1220 | | bHaveWarned6 = true; |
1221 | | CPLError( |
1222 | | CE_Warning, CPLE_AppDefined, |
1223 | | "One or several characters could not be translated to CP%d. " |
1224 | | "This warning will not be emitted anymore.", |
1225 | | dst_code_page); |
1226 | | } |
1227 | | } |
1228 | | |
1229 | | // Do the actual conversion. |
1230 | | char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1)); |
1231 | | WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr, |
1232 | | nullptr); |
1233 | | pszResult[len] = 0; |
1234 | | |
1235 | | CPLFree(tbuf); |
1236 | | |
1237 | | return pszResult; |
1238 | | } |
1239 | | |
1240 | | #endif |
1241 | | |
1242 | | /* |
1243 | | ** For now we disable the rest which is locale() related. We may need |
1244 | | ** parts of it later. |
1245 | | */ |
1246 | | |
1247 | | #ifdef notdef |
1248 | | |
1249 | | #ifdef _WIN32 |
1250 | | #include <windows.h> |
1251 | | #endif |
1252 | | |
/*! Tell whether the "locale" seems to indicate that UTF-8 encoding is
    used. When it does, utf8tomb and utf8frommb don't do anything useful.

    On Windows the answer is whether the active code page (GetACP()) is
    CP_UTF8. Elsewhere the first non-empty variable among $LC_CTYPE,
    $LC_ALL and $LANG is examined, and the answer is true if it contains
    the letters "utf" or "UTF"; if none of them is set, UTF-8 is assumed.

    The answer is computed on the first call only and remembered in a
    function-local static for all later calls.

    <i>It is highly recommended that you change your system so this
    does return true.</i>
*/
int utf8locale(void)
{
    // 2 means "not determined yet"; afterwards holds 0 or 1.
    static int nCached = 2;
    if (nCached != 2)
        return nCached;

#ifdef _WIN32
    nCached = GetACP() == CP_UTF8;
#else
    static const char *const apszVars[] = {"LC_CTYPE", "LC_ALL", "LANG"};

    nCached = 1;  // assume UTF-8 if no locale
    for (unsigned iVar = 0; iVar < sizeof(apszVars) / sizeof(apszVars[0]);
         ++iVar)
    {
        const char *pszValue = getenv(apszVars[iVar]);
        if (pszValue && *pszValue)
        {
            // Only the first non-empty variable is taken into account.
            nCached = strstr(pszValue, "utf") || strstr(pszValue, "UTF");
            break;
        }
    }
#endif

    return nCached;
}
1285 | | |
1286 | | /*! Convert the UTF-8 used by FLTK to the locale-specific encoding |
1287 | | used for filenames (and sometimes used for data in files). |
1288 | | Unfortunately due to stupid design you will have to do this as |
1289 | | needed for filenames. This is a bug on both Unix and Windows. |
1290 | | |
1291 | | Up to \a dstlen bytes are written to \a dst, including a null |
1292 | | terminator. The return value is the number of bytes that would be |
1293 | | written, not counting the null terminator. If greater or equal to |
1294 | | \a dstlen then if you malloc a new array of size n+1 you will have |
1295 | | the space needed for the entire string. If \a dstlen is zero then |
1296 | | nothing is written and this call just measures the storage space |
1297 | | needed. |
1298 | | |
1299 | | If utf8locale() returns true then this does not change the data. |
1300 | | It is copied and truncated as necessary to |
1301 | | the destination buffer and \a srclen is always returned. */ |
1302 | | unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen) |
1303 | | { |
1304 | | if (!utf8locale()) |
1305 | | { |
1306 | | #ifdef _WIN32 |
1307 | | wchar_t lbuf[1024] = {}; |
1308 | | wchar_t *buf = lbuf; |
1309 | | unsigned length = utf8towc(src, srclen, buf, 1024); |
1310 | | unsigned ret; |
1311 | | if (length >= 1024) |
1312 | | { |
1313 | | buf = |
1314 | | static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t))); |
1315 | | utf8towc(src, srclen, buf, length + 1); |
1316 | | } |
1317 | | if (dstlen) |
1318 | | { |
1319 | | // apparently this does not null-terminate, even though msdn |
1320 | | // documentation claims it does: |
1321 | | ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, |
1322 | | 0); |
1323 | | dst[ret] = 0; |
1324 | | } |
1325 | | // if it overflows or measuring length, get the actual length: |
1326 | | if (dstlen == 0 || ret >= dstlen - 1) |
1327 | | ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0); |
1328 | | if (buf != lbuf) |
1329 | | free((void *)buf); |
1330 | | return ret; |
1331 | | #else |
1332 | | wchar_t lbuf[1024] = {}; |
1333 | | wchar_t *buf = lbuf; |
1334 | | unsigned length = utf8towc(src, srclen, buf, 1024); |
1335 | | if (length >= 1024) |
1336 | | { |
1337 | | buf = |
1338 | | static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t))); |
1339 | | utf8towc(src, srclen, buf, length + 1); |
1340 | | } |
1341 | | int ret = 0; |
1342 | | if (dstlen) |
1343 | | { |
1344 | | ret = wcstombs(dst, buf, dstlen); |
1345 | | if (ret >= dstlen - 1) |
1346 | | ret = wcstombs(0, buf, 0); |
1347 | | } |
1348 | | else |
1349 | | { |
1350 | | ret = wcstombs(0, buf, 0); |
1351 | | } |
1352 | | if (buf != lbuf) |
1353 | | free((void *)buf); |
1354 | | if (ret >= 0) |
1355 | | return (unsigned)ret; |
1356 | | // On any errors we return the UTF-8 as raw text... |
1357 | | #endif |
1358 | | } |
1359 | | // Identity transform: |
1360 | | if (srclen < dstlen) |
1361 | | { |
1362 | | memcpy(dst, src, srclen); |
1363 | | dst[srclen] = 0; |
1364 | | } |
1365 | | else |
1366 | | { |
1367 | | memcpy(dst, src, dstlen - 1); |
1368 | | dst[dstlen - 1] = 0; |
1369 | | } |
1370 | | return srclen; |
1371 | | } |
1372 | | |
1373 | | /*! Convert a filename from the locale-specific multibyte encoding |
1374 | | used by Windows to UTF-8 as used by FLTK. |
1375 | | |
1376 | | Up to \a dstlen bytes are written to \a dst, including a null |
1377 | | terminator. The return value is the number of bytes that would be |
1378 | | written, not counting the null terminator. If greater or equal to |
1379 | | \a dstlen then if you malloc a new array of size n+1 you will have |
1380 | | the space needed for the entire string. If \a dstlen is zero then |
1381 | | nothing is written and this call just measures the storage space |
1382 | | needed. |
1383 | | |
1384 | | On Unix or on Windows when a UTF-8 locale is in effect, this |
1385 | | does not change the data. It is copied and truncated as necessary to |
1386 | | the destination buffer and \a srclen is always returned. |
1387 | | You may also want to check if utf8test() returns non-zero, so that |
1388 | | the filesystem can store filenames in UTF-8 encoding regardless of |
1389 | | the locale. |
1390 | | */ |
1391 | | unsigned utf8frommb(char *dst, unsigned dstlen, const char *src, |
1392 | | unsigned srclen) |
1393 | | { |
1394 | | if (!utf8locale()) |
1395 | | { |
1396 | | #ifdef _WIN32 |
1397 | | wchar_t lbuf[1024] = {}; |
1398 | | wchar_t *buf = lbuf; |
1399 | | unsigned ret; |
1400 | | const unsigned length = |
1401 | | MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024); |
1402 | | if (length >= 1024) |
1403 | | { |
1404 | | length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0); |
1405 | | buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t))); |
1406 | | MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length); |
1407 | | } |
1408 | | ret = utf8fromwc(dst, dstlen, buf, length); |
1409 | | if (buf != lbuf) |
1410 | | free(buf); |
1411 | | return ret; |
1412 | | #else |
1413 | | wchar_t lbuf[1024] = {}; |
1414 | | wchar_t *buf = lbuf; |
1415 | | const int length = mbstowcs(buf, src, 1024); |
1416 | | if (length >= 1024) |
1417 | | { |
1418 | | length = mbstowcs(0, src, 0) + 1; |
1419 | | buf = |
1420 | | static_cast<wchar_t *>(malloc(length * sizeof(unsigned short))); |
1421 | | mbstowcs(buf, src, length); |
1422 | | } |
1423 | | if (length >= 0) |
1424 | | { |
1425 | | const unsigned ret = utf8fromwc(dst, dstlen, buf, length); |
1426 | | if (buf != lbuf) |
1427 | | free(buf); |
1428 | | return ret; |
1429 | | } |
1430 | | // Errors in conversion return the UTF-8 unchanged. |
1431 | | #endif |
1432 | | } |
1433 | | // Identity transform: |
1434 | | if (srclen < dstlen) |
1435 | | { |
1436 | | memcpy(dst, src, srclen); |
1437 | | dst[srclen] = 0; |
1438 | | } |
1439 | | else |
1440 | | { |
1441 | | memcpy(dst, src, dstlen - 1); |
1442 | | dst[dstlen - 1] = 0; |
1443 | | } |
1444 | | return srclen; |
1445 | | } |
1446 | | |
1447 | | #endif // def notdef - disabled locale specific stuff. |
1448 | | |
1449 | | /*! Examines the first \a srclen bytes in \a src and return a verdict |
1450 | | on whether it is UTF-8 or not. |
1451 | | - Returns 0 if there is any illegal UTF-8 sequences, using the |
1452 | | same rules as utf8decode(). Note that some UCS values considered |
1453 | | illegal by RFC 3629, such as 0xffff, are considered legal by this. |
1454 | | - Returns 1 if there are only single-byte characters (i.e. no bytes |
1455 | | have the high bit set). This is legal UTF-8, but also indicates |
1456 | | plain ASCII. It also returns 1 if \a srclen is zero. |
1457 | | - Returns 2 if there are only characters less than 0x800. |
1458 | | - Returns 3 if there are only characters less than 0x10000. |
1459 | | - Returns 4 if there are characters in the 0x10000 to 0x10ffff range. |
1460 | | |
1461 | | Because there are many illegal sequences in UTF-8, it is almost |
1462 | | impossible for a string in another encoding to be confused with |
1463 | | UTF-8. This is very useful for transitioning Unix to UTF-8 |
1464 | | filenames, you can simply test each filename with this to decide |
1465 | | if it is UTF-8 or in the locale encoding. My hope is that if |
1466 | | this is done we will be able to cleanly transition to a locale-less |
1467 | | encoding. |
1468 | | */ |
1469 | | |
1470 | | static int utf8test(const char *src, unsigned srclen) |
1471 | 0 | { |
1472 | 0 | int ret = 1; |
1473 | 0 | const char *p = src; |
1474 | 0 | const char *e = src + srclen; |
1475 | 0 | while (p < e) |
1476 | 0 | { |
1477 | 0 | if (*p == 0) |
1478 | 0 | return 0; |
1479 | 0 | if (*p & 0x80) |
1480 | 0 | { |
1481 | 0 | int len = 0; |
1482 | 0 | utf8decode(p, e, &len); |
1483 | 0 | if (len < 2) |
1484 | 0 | return 0; |
1485 | 0 | if (len > ret) |
1486 | 0 | ret = len; |
1487 | 0 | p += len; |
1488 | 0 | } |
1489 | 0 | else |
1490 | 0 | { |
1491 | 0 | p++; |
1492 | 0 | } |
1493 | 0 | } |
1494 | 0 | return ret; |
1495 | 0 | } |