Coverage Report

Created: 2025-06-13 06:29

/src/gdal/port/cpl_recode_stub.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 *
3
 * Name:     cpl_recode_stub.cpp
4
 * Project:  CPL - Common Portability Library
5
 * Purpose:  Character set recoding and char/wchar_t conversions, stub
6
 *           implementation to be used if iconv() functionality is not
7
 *           available.
8
 * Author:   Frank Warmerdam, warmerdam@pobox.com
9
 *
10
 * The bulk of this code is derived from the utf.c module from FLTK. It
11
 * was originally downloaded from:
12
 *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
13
 *
14
 **********************************************************************
15
 * Copyright (c) 2008, Frank Warmerdam
16
 * Copyright 2006 by Bill Spitzak and others.
17
 * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
18
 *
19
 * Permission to use, copy, modify, and distribute this software for any
20
 * purpose with or without fee is hereby granted, provided that the above
21
 * copyright notice and this permission notice appear in all copies.
22
 *
23
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30
 **********************************************************************/
31
32
#include "cpl_port.h"
33
#include "cpl_string.h"
34
35
#include <cstring>
36
37
#include "cpl_conv.h"
38
#include "cpl_error.h"
39
#include "cpl_character_sets.c"
40
41
static unsigned utf8decode(const char *p, const char *end, int *len);
42
static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
43
                         unsigned dstlen);
44
static unsigned utf8toa(const char *src, unsigned srclen, char *dst,
45
                        unsigned dstlen);
46
static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
47
                           unsigned srclen);
48
static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
49
                          unsigned srclen);
50
static int utf8test(const char *src, unsigned srclen);
51
52
#ifdef _WIN32
53
54
#include <windows.h>
55
#include <winnls.h>
56
57
static char *CPLWin32Recode(const char *src, unsigned src_code_page,
58
                            unsigned dst_code_page) CPL_RETURNS_NONNULL;
59
#endif
60
61
/* used by cpl_recode.cpp */
62
extern void CPLClearRecodeStubWarningFlags();
63
extern char *CPLRecodeStub(const char *, const char *,
64
                           const char *) CPL_RETURNS_NONNULL;
65
extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
66
                                    const char *);
67
extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
68
69
/************************************************************************/
70
/* ==================================================================== */
71
/*      Stub Implementation not depending on iconv() or WIN32 API.      */
72
/* ==================================================================== */
73
/************************************************************************/
74
75
static bool bHaveWarned1 = false;
76
static bool bHaveWarned2 = false;
77
static bool bHaveWarned3 = false;
78
static bool bHaveWarned4 = false;
79
static bool bHaveWarned5 = false;
80
static bool bHaveWarned6 = false;
81
82
/************************************************************************/
83
/*                 CPLClearRecodeStubWarningFlags()                     */
84
/************************************************************************/
85
86
void CPLClearRecodeStubWarningFlags()
87
0
{
88
0
    bHaveWarned1 = false;
89
0
    bHaveWarned2 = false;
90
0
    bHaveWarned3 = false;
91
0
    bHaveWarned4 = false;
92
0
    bHaveWarned5 = false;
93
0
    bHaveWarned6 = false;
94
0
}
95
96
/************************************************************************/
97
/*                           CPLRecodeStub()                            */
98
/************************************************************************/
99
100
/**
101
 * Convert a string from a source encoding to a destination encoding.
102
 *
103
 * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
104
 * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
105
 * <ul>
106
 *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
107
 *  fact)</li>
108
 *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
109
 *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
110
 * </ul>
111
 *
112
 * If an error occurs an error may, or may not be posted with CPLError().
113
 *
114
 * @param pszSource a NULL terminated string.
115
 * @param pszSrcEncoding the source encoding.
116
 * @param pszDstEncoding the destination encoding.
117
 *
118
 * @return a NULL terminated string which should be freed with CPLFree().
119
 */
120
121
char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding,
122
                    const char *pszDstEncoding)
123
124
0
{
125
    /* -------------------------------------------------------------------- */
126
    /*      If the source or destination is current locale(), we change     */
127
    /*      it to ISO8859-1 since our stub implementation does not          */
128
    /*      attempt to address locales properly.                            */
129
    /* -------------------------------------------------------------------- */
130
131
0
    if (pszSrcEncoding[0] == '\0')
132
0
        pszSrcEncoding = CPL_ENC_ISO8859_1;
133
134
0
    if (pszDstEncoding[0] == '\0')
135
0
        pszDstEncoding = CPL_ENC_ISO8859_1;
136
137
    /* -------------------------------------------------------------------- */
138
    /*      ISO8859 to UTF8                                                 */
139
    /* -------------------------------------------------------------------- */
140
0
    if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 &&
141
0
        strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
142
0
    {
143
0
        const int nCharCount = static_cast<int>(strlen(pszSource));
144
0
        char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
145
146
0
        utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
147
148
0
        return pszResult;
149
0
    }
150
151
    /* -------------------------------------------------------------------- */
152
    /*      UTF8 to ISO8859                                                 */
153
    /* -------------------------------------------------------------------- */
154
0
    if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
155
0
        strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
156
0
    {
157
0
        int nCharCount = static_cast<int>(strlen(pszSource));
158
0
        char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
159
160
0
        utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
161
162
0
        return pszResult;
163
0
    }
164
165
    // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
166
0
    if (EQUAL(pszDstEncoding, CPL_ENC_UTF8))
167
0
    {
168
0
        const auto pConvTable = CPLGetConversionTableToUTF8(pszSrcEncoding);
169
0
        if (pConvTable)
170
0
        {
171
0
            const auto convTable = *pConvTable;
172
0
            const size_t nCharCount = strlen(pszSource);
173
0
            char *pszResult =
174
0
                static_cast<char *>(CPLCalloc(1, nCharCount * 3 + 1));
175
0
            size_t iDst = 0;
176
0
            unsigned char *pabyResult =
177
0
                reinterpret_cast<unsigned char *>(pszResult);
178
0
            for (size_t i = 0; i < nCharCount; ++i)
179
0
            {
180
0
                const unsigned char nChar =
181
0
                    static_cast<unsigned char>(pszSource[i]);
182
0
                if (nChar <= 127)
183
0
                {
184
0
                    pszResult[iDst] = pszSource[i];
185
0
                    ++iDst;
186
0
                }
187
0
                else
188
0
                {
189
0
                    const unsigned char nShiftedChar = nChar - 128;
190
0
                    if (convTable[nShiftedChar][0])
191
0
                    {
192
0
                        pabyResult[iDst] = convTable[nShiftedChar][0];
193
0
                        ++iDst;
194
0
                        CPLAssert(convTable[nShiftedChar][1]);
195
0
                        pabyResult[iDst] = convTable[nShiftedChar][1];
196
0
                        ++iDst;
197
0
                        if (convTable[nShiftedChar][2])
198
0
                        {
199
0
                            pabyResult[iDst] = convTable[nShiftedChar][2];
200
0
                            ++iDst;
201
0
                        }
202
0
                    }
203
0
                    else
204
0
                    {
205
                        // Skip the invalid sequence in the input string.
206
0
                        if (!bHaveWarned2)
207
0
                        {
208
0
                            bHaveWarned2 = true;
209
0
                            CPLError(CE_Warning, CPLE_AppDefined,
210
0
                                     "One or several characters couldn't be "
211
0
                                     "converted correctly from %s to %s. "
212
0
                                     "This warning will not be emitted anymore",
213
0
                                     pszSrcEncoding, pszDstEncoding);
214
0
                        }
215
0
                    }
216
0
                }
217
0
            }
218
219
0
            pszResult[iDst] = 0;
220
0
            return pszResult;
221
0
        }
222
0
    }
223
224
#ifdef _WIN32
225
    const auto MapEncodingToWindowsCodePage = [](const char *pszEncoding)
226
    {
227
        // Cf https://learn.microsoft.com/fr-fr/windows/win32/intl/code-page-identifiers
228
        if (STARTS_WITH(pszEncoding, "CP"))
229
        {
230
            const int nCode = atoi(pszEncoding + strlen("CP"));
231
            if (nCode > 0)
232
                return nCode;
233
            else if (EQUAL(pszEncoding, "CP_OEMCP"))
234
                return CP_OEMCP;
235
            else if (EQUAL(pszEncoding, "CP_ACP"))
236
                return CP_ACP;
237
        }
238
        else if (STARTS_WITH(pszEncoding, "WINDOWS-"))
239
        {
240
            const int nCode = atoi(pszEncoding + strlen("WINDOWS-"));
241
            if (nCode > 0)
242
                return nCode;
243
        }
244
        else if (STARTS_WITH(pszEncoding, "ISO-8859-"))
245
        {
246
            const int nCode = atoi(pszEncoding + strlen("ISO-8859-"));
247
            if ((nCode >= 1 && nCode <= 9) || nCode == 13 || nCode == 15)
248
                return 28590 + nCode;
249
        }
250
251
        // Return a negative value, since CP_ACP = 0
252
        return -1;
253
    };
254
255
    /* ---------------------------------------------------------------------*/
256
    /*     XXX to UTF8                                                      */
257
    /* ---------------------------------------------------------------------*/
258
    if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
259
    {
260
        const int nCode = MapEncodingToWindowsCodePage(pszSrcEncoding);
261
        if (nCode >= 0)
262
        {
263
            return CPLWin32Recode(pszSource, nCode, CP_UTF8);
264
        }
265
    }
266
267
    /* ---------------------------------------------------------------------*/
268
    /*      UTF8 to XXX                                                     */
269
    /* ---------------------------------------------------------------------*/
270
    if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0)
271
    {
272
        const int nCode = MapEncodingToWindowsCodePage(pszDstEncoding);
273
        if (nCode >= 0)
274
        {
275
            return CPLWin32Recode(pszSource, CP_UTF8, nCode);
276
        }
277
    }
278
#endif
279
280
    /* -------------------------------------------------------------------- */
281
    /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
282
    /*      a one-time warning.                                             */
283
    /* -------------------------------------------------------------------- */
284
0
    if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
285
0
    {
286
0
        const int nCharCount = static_cast<int>(strlen(pszSource));
287
0
        char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
288
289
0
        if (!bHaveWarned1)
290
0
        {
291
0
            bHaveWarned1 = true;
292
0
            CPLError(CE_Warning, CPLE_AppDefined,
293
0
                     "Recode from %s to UTF-8 not supported, "
294
0
                     "treated as ISO-8859-1 to UTF-8.",
295
0
                     pszSrcEncoding);
296
0
        }
297
298
0
        utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
299
300
0
        return pszResult;
301
0
    }
302
303
    /* -------------------------------------------------------------------- */
304
    /*      Everything else is treated as a no-op with a warning.           */
305
    /* -------------------------------------------------------------------- */
306
0
    {
307
0
        if (!bHaveWarned3)
308
0
        {
309
0
            bHaveWarned3 = true;
310
0
            CPLError(CE_Warning, CPLE_AppDefined,
311
0
                     "Recode from %s to %s not supported, no change applied.",
312
0
                     pszSrcEncoding, pszDstEncoding);
313
0
        }
314
315
0
        return CPLStrdup(pszSource);
316
0
    }
317
0
}
318
319
/************************************************************************/
320
/*                       CPLRecodeFromWCharStub()                       */
321
/************************************************************************/
322
323
/**
324
 * Convert wchar_t string to UTF-8.
325
 *
326
 * Convert a wchar_t string into a multibyte utf-8 string.  The only
327
 * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
328
 * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
329
 * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
330
 * may also be supported.
331
 *
332
 * Note that the wchar_t type varies in size on different systems. On
333
 * win32 it is normally 2 bytes, and on unix 4 bytes.
334
 *
335
 * If an error occurs an error may, or may not be posted with CPLError().
336
 *
337
 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
338
 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
339
 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
340
 *
341
 * @return a zero terminated multi-byte string which should be freed with
342
 * CPLFree(), or NULL if an error occurs.
343
 */
344
345
char *CPLRecodeFromWCharStub(const wchar_t *pwszSource,
346
                             const char *pszSrcEncoding,
347
                             const char *pszDstEncoding)
348
349
0
{
350
    /* -------------------------------------------------------------------- */
351
    /*      We try to avoid changes of character set.  We are just          */
352
    /*      providing for unicode to unicode.                               */
353
    /* -------------------------------------------------------------------- */
354
0
    if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
355
0
        strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
356
0
        strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 &&
357
0
        strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 &&
358
0
        strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0)
359
0
    {
360
0
        CPLError(CE_Failure, CPLE_AppDefined,
361
0
                 "Stub recoding implementation does not support "
362
0
                 "CPLRecodeFromWCharStub(...,%s,%s)",
363
0
                 pszSrcEncoding, pszDstEncoding);
364
0
        return nullptr;
365
0
    }
366
367
    /* -------------------------------------------------------------------- */
368
    /*      What is the source length.                                      */
369
    /* -------------------------------------------------------------------- */
370
0
    int nSrcLen = 0;
371
372
0
    while (pwszSource[nSrcLen] != 0)
373
0
        nSrcLen++;
374
375
    /* -------------------------------------------------------------------- */
376
    /*      Allocate destination buffer plenty big.                         */
377
    /* -------------------------------------------------------------------- */
378
0
    const int nDstBufSize = nSrcLen * 4 + 1;
379
    // Nearly worst case.
380
0
    char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
381
382
0
    if (nSrcLen == 0)
383
0
    {
384
0
        pszResult[0] = '\0';
385
0
        return pszResult;
386
0
    }
387
388
    /* -------------------------------------------------------------------- */
389
    /*      Convert, and confirm we had enough space.                       */
390
    /* -------------------------------------------------------------------- */
391
0
    const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen);
392
0
    if (nDstLen >= nDstBufSize)
393
0
    {
394
0
        CPLAssert(false);  // too small!
395
0
        return nullptr;
396
0
    }
397
398
    /* -------------------------------------------------------------------- */
399
    /*      If something other than UTF-8 was requested, recode now.        */
400
    /* -------------------------------------------------------------------- */
401
0
    if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
402
0
        return pszResult;
403
404
0
    char *pszFinalResult =
405
0
        CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding);
406
407
0
    CPLFree(pszResult);
408
409
0
    return pszFinalResult;
410
0
}
411
412
/************************************************************************/
413
/*                        CPLRecodeToWCharStub()                        */
414
/************************************************************************/
415
416
/**
417
 * Convert UTF-8 string to a wchar_t string.
418
 *
419
 * Convert a 8bit, multi-byte per character input string into a wide
420
 * character (wchar_t) string.  The only guaranteed supported source encodings
421
 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
422
 * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
423
 * and destination encodings may be supported depending on the underlying
424
 * implementation.
425
 *
426
 * Note that the wchar_t type varies in size on different systems. On
427
 * win32 it is normally 2 bytes, and on unix 4 bytes.
428
 *
429
 * If an error occurs an error may, or may not be posted with CPLError().
430
 *
431
 * @param pszSource input multi-byte character string.
432
 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
433
 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
434
 *
435
 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
436
 * NULL on error.
437
 *
438
 * @since GDAL 1.6.0
439
 */
440
441
wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding,
442
                              const char *pszDstEncoding)
443
444
0
{
445
0
    char *pszUTF8Source = const_cast<char *>(pszSource);
446
447
0
    if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
448
0
        strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0)
449
0
    {
450
0
        pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
451
0
        if (pszUTF8Source == nullptr)
452
0
            return nullptr;
453
0
    }
454
455
    /* -------------------------------------------------------------------- */
456
    /*      We try to avoid changes of character set.  We are just          */
457
    /*      providing for unicode to unicode.                               */
458
    /* -------------------------------------------------------------------- */
459
0
    if (strcmp(pszDstEncoding, "WCHAR_T") != 0 &&
460
0
        strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 &&
461
0
        strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 &&
462
0
        strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0)
463
0
    {
464
0
        CPLError(CE_Failure, CPLE_AppDefined,
465
0
                 "Stub recoding implementation does not support "
466
0
                 "CPLRecodeToWCharStub(...,%s,%s)",
467
0
                 pszSrcEncoding, pszDstEncoding);
468
0
        if (pszUTF8Source != pszSource)
469
0
            CPLFree(pszUTF8Source);
470
0
        return nullptr;
471
0
    }
472
473
    /* -------------------------------------------------------------------- */
474
    /*      Do the UTF-8 to UCS-2 recoding.                                 */
475
    /* -------------------------------------------------------------------- */
476
0
    int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
477
0
    wchar_t *pwszResult =
478
0
        static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
479
480
0
    utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1);
481
482
0
    if (pszUTF8Source != pszSource)
483
0
        CPLFree(pszUTF8Source);
484
485
0
    return pwszResult;
486
0
}
487
488
/************************************************************************/
489
/*                                 CPLIsUTF8()                          */
490
/************************************************************************/
491
492
/**
493
 * Test if a string is encoded as UTF-8.
494
 *
495
 * @param pabyData input string to test
496
 * @param nLen length of the input string, or -1 if the function must compute
497
 *             the string length. In which case it must be null terminated.
498
 * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
499
 *
500
 * @since GDAL 1.7.0
501
 */
502
int CPLIsUTF8(const char *pabyData, int nLen)
503
0
{
504
0
    if (nLen < 0)
505
0
        nLen = static_cast<int>(strlen(pabyData));
506
0
    return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
507
0
}
508
509
/************************************************************************/
510
/* ==================================================================== */
511
/*      UTF.C code from FLTK with some modifications.                   */
512
/* ==================================================================== */
513
/************************************************************************/
514
515
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
516
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
517
   value 0xfffd.
518
   If this is on utf8decode will correctly map most (perhaps all)
519
   human-readable text that is in ISO-8859-1. This may allow you
520
   to completely ignore character sets in your code because virtually
521
   everything is either ISO-8859-1 or UTF-8.
522
*/
523
#define ERRORS_TO_ISO8859_1 1
524
525
/* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
526
   Unicode index for Microsoft's CP1252 character set. You should
527
   also set ERRORS_TO_ISO8859_1. With this a huge amount of more
528
   available text (such as all web pages) are correctly converted
529
   to Unicode.
530
*/
531
#define ERRORS_TO_CP1252 1
532
533
/* A number of Unicode code points are in fact illegal and should not
534
   be produced by a UTF-8 converter. Turning this on will replace the
535
   bytes in those encodings with errors. If you do this then converting
536
   arbitrary 16-bit data to UTF-8 and then back is not an identity,
537
   which will probably break a lot of software.
538
*/
539
#define STRICT_RFC3629 0
540
541
#if ERRORS_TO_CP1252
542
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
543
// to Unicode:
544
constexpr unsigned short cp1252[32] = {
545
    0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
546
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
547
    0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
548
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178};
549
#endif
550
551
/************************************************************************/
552
/*                             utf8decode()                             */
553
/************************************************************************/
554
555
/*
556
    Decode a single UTF-8 encoded character starting at \e p. The
557
    resulting Unicode value (in the range 0-0x10ffff) is returned,
558
    and \e len is set to the number of bytes in the UTF-8 encoding
559
    (adding \e len to \e p will point at the next character).
560
561
    If \a p points at an illegal UTF-8 encoding, including one that
562
    would go past \e end, or where a code uses more bytes than
563
    necessary, then *reinterpret_cast<const unsigned char*>(p) is translated as
564
though it is in the Microsoft CP1252 character set and \e len is set to 1.
565
    Treating errors this way allows this to decode almost any
566
    ISO-8859-1 or CP1252 text that has been mistakenly placed where
567
    UTF-8 is expected, and has proven very useful.
568
569
    If you want errors to be converted to error characters (as the
570
    standards recommend), adding a test to see if the length is
571
    unexpectedly 1 will work:
572
573
\code
574
    if( *p & 0x80 )
575
    {  // What should be a multibyte encoding.
576
      code = utf8decode(p, end, &len);
577
      if( len<2 ) code = 0xFFFD;  // Turn errors into REPLACEMENT CHARACTER.
578
    }
579
    else
580
    {  // Handle the 1-byte utf8 encoding:
581
      code = *p;
582
      len = 1;
583
    }
584
\endcode
585
586
    Direct testing for the 1-byte case (as shown above) will also
587
    speed up the scanning of strings where the majority of characters
588
    are ASCII.
589
*/
590
static unsigned utf8decode(const char *p, const char *end, int *len)
{
    const unsigned char *pabyIn = reinterpret_cast<const unsigned char *>(p);
    const unsigned char byLead = pabyIn[0];

    // Single-byte (ASCII) encoding.
    if (byLead < 0x80)
    {
        *len = 1;
        return byLead;
    }
#if ERRORS_TO_CP1252
    // Stray 0x80..0x9f bytes are interpreted through the CP1252 table.
    if (byLead < 0xa0)
    {
        *len = 1;
        return cp1252[byLead - 0x80];
    }
#endif

    // Step 1: derive the sequence length from the lead byte and the second
    // byte. 0 means "not a valid sequence". Lead bytes below 0xc2 or above
    // 0xf4 never start a valid sequence, and the second byte must be a
    // continuation byte (10xxxxxx) located before 'end'.
    int nSeqLen = 0;
    if (byLead >= 0xc2 && p + 1 < end && (pabyIn[1] & 0xc0) == 0x80)
    {
        const unsigned char bySecond = pabyIn[1];
        if (byLead < 0xe0)
            nSeqLen = 2;
        else if (byLead == 0xe0)
            nSeqLen = (bySecond < 0xa0) ? 0 : 3;  // Overlong 3-byte form.
#if STRICT_RFC3629
        else if (byLead == 0xed)
            // RFC 3629 says surrogate chars are illegal.
            nSeqLen = (bySecond >= 0xa0) ? 0 : 3;
        else if (byLead == 0xef)
            // 0xfffe and 0xffff are also illegal characters.
            nSeqLen = (bySecond == 0xbf && pabyIn[2] >= 0xbe) ? 0 : 3;
#endif
        else if (byLead < 0xf0)
            nSeqLen = 3;
        else if (byLead == 0xf0)
            nSeqLen = (bySecond < 0x90) ? 0 : 4;  // Overlong 4-byte form.
        else if (byLead < 0xf4)
            nSeqLen = 4;
        else if (byLead == 0xf4)
            nSeqLen = (bySecond > 0x8f) ? 0 : 4;  // After 0x10ffff.
    }

    // Step 2: the remaining trailing bytes must exist before 'end' and be
    // continuation bytes as well.
    if (nSeqLen == 3)
    {
        if (p + 2 >= end || (pabyIn[2] & 0xc0) != 0x80)
            nSeqLen = 0;
    }
    else if (nSeqLen == 4)
    {
        if (p + 3 >= end || (pabyIn[2] & 0xc0) != 0x80 ||
            (pabyIn[3] & 0xc0) != 0x80)
            nSeqLen = 0;
#if STRICT_RFC3629
        // RFC 3629 says all codes ending in fffe or ffff are illegal:
        else if ((pabyIn[1] & 0xf) == 0xf && pabyIn[2] == 0xbf &&
                 pabyIn[3] >= 0xbe)
            nSeqLen = 0;
#endif
    }

    // Step 3: assemble the code point from the payload bits.
    switch (nSeqLen)
    {
        case 2:
            *len = 2;
            return ((byLead & 0x1f) << 6) + (pabyIn[1] & 0x3f);
        case 3:
            *len = 3;
            return ((byLead & 0x0f) << 12) + ((pabyIn[1] & 0x3f) << 6) +
                   (pabyIn[2] & 0x3f);
        case 4:
            *len = 4;
            return ((byLead & 0x07) << 18) + ((pabyIn[1] & 0x3f) << 12) +
                   ((pabyIn[2] & 0x3f) << 6) + (pabyIn[3] & 0x3f);
        default:
            break;
    }

    // Invalid encoding: consume exactly one byte.
    *len = 1;
#if ERRORS_TO_ISO8859_1
    return byLead;
#else
    return 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif
}
686
687
/************************************************************************/
688
/*                              utf8towc()                              */
689
/************************************************************************/
690
691
/*  Convert a UTF-8 sequence into an array of wchar_t. These
692
    are used by some system calls, especially on Windows.
693
694
    \a src points at the UTF-8, and \a srclen is the number of bytes to
695
    convert.
696
697
    \a dst points at an array to write, and \a dstlen is the number of
698
    locations in this array. At most \a dstlen-1 words will be
699
    written there, plus a 0 terminating word. Thus this function
700
    will never overwrite the buffer and will always return a
701
    zero-terminated string. If \a dstlen is zero then \a dst can be
702
    null and no data is written, but the length is returned.
703
704
    The return value is the number of words that \e would be written
705
    to \a dst if it were long enough, not counting the terminating
706
    zero. If the return value is greater or equal to \a dstlen it
707
    indicates truncation, you can then allocate a new array of size
708
    return+1 and call this again.
709
710
    Errors in the UTF-8 are converted as though each byte in the
711
    erroneous string is in the Microsoft CP1252 encoding. This allows
712
    ISO-8859-1 text mistakenly identified as UTF-8 to be printed
713
    correctly.
714
715
    Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
716
    and most other systems. Where wchar_t is 16 bits, Unicode
717
    characters in the range 0x10000 to 0x10ffff are converted to
718
    "surrogate pairs" which take two words each (this is called UTF-16
719
    encoding). If wchar_t is 32 bits this rather nasty problem is
720
    avoided.
721
*/
722
static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
723
                         unsigned dstlen)
724
0
{
725
0
    const char *p = src;
726
0
    const char *e = src + srclen;
727
0
    unsigned count = 0;
728
0
    if (dstlen)
729
0
        while (true)
730
0
        {
731
0
            if (p >= e)
732
0
            {
733
0
                dst[count] = 0;
734
0
                return count;
735
0
            }
736
0
            if (!(*p & 0x80))
737
0
            {
738
                // ASCII
739
0
                dst[count] = *p++;
740
0
            }
741
0
            else
742
0
            {
743
0
                int len = 0;
744
0
                unsigned ucs = utf8decode(p, e, &len);
745
0
                p += len;
746
#ifdef _WIN32
747
                if (ucs < 0x10000)
748
                {
749
                    dst[count] = static_cast<wchar_t>(ucs);
750
                }
751
                else
752
                {
753
                    // Make a surrogate pair:
754
                    if (count + 2 >= dstlen)
755
                    {
756
                        dst[count] = 0;
757
                        count += 2;
758
                        break;
759
                    }
760
                    dst[count] = static_cast<wchar_t>(
761
                        (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800);
762
                    dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00);
763
                }
764
#else
765
0
                dst[count] = static_cast<wchar_t>(ucs);
766
0
#endif
767
0
            }
768
0
            if (++count == dstlen)
769
0
            {
770
0
                dst[count - 1] = 0;
771
0
                break;
772
0
            }
773
0
        }
774
    // We filled dst, measure the rest:
775
0
    while (p < e)
776
0
    {
777
0
        if (!(*p & 0x80))
778
0
        {
779
0
            p++;
780
0
        }
781
0
        else
782
0
        {
783
0
            int len = 0;
784
#ifdef _WIN32
785
            const unsigned ucs = utf8decode(p, e, &len);
786
            p += len;
787
            if (ucs >= 0x10000)
788
                ++count;
789
#else
790
0
            utf8decode(p, e, &len);
791
0
            p += len;
792
0
#endif
793
0
        }
794
0
        ++count;
795
0
    }
796
797
0
    return count;
798
0
}
799
800
/************************************************************************/
801
/*                              utf8toa()                               */
802
/************************************************************************/
803
/* Convert a UTF-8 sequence into an array of 1-byte characters.
804
805
    If the UTF-8 decodes to a character greater than 0xff then it is
806
    replaced with '?'.
807
808
    Errors in the UTF-8 are converted as individual bytes, same as
809
    utf8decode() does. This allows ISO-8859-1 text mistakenly identified
810
    as UTF-8 to be printed correctly (and possibly CP1512 on Windows).
811
812
    \a src points at the UTF-8, and \a srclen is the number of bytes to
813
    convert.
814
815
    Up to \a dstlen bytes are written to \a dst, including a null
816
    terminator. The return value is the number of bytes that would be
817
    written, not counting the null terminator. If greater or equal to
818
    \a dstlen then if you malloc a new array of size n+1 you will have
819
    the space needed for the entire string. If \a dstlen is zero then
820
    nothing is written and this call just measures the storage space
821
    needed.
822
*/
823
static unsigned int utf8toa(const char *src, unsigned srclen, char *dst,
824
                            unsigned dstlen)
825
0
{
826
0
    const char *p = src;
827
0
    const char *e = src + srclen;
828
0
    unsigned int count = 0;
829
0
    if (dstlen)
830
0
        while (true)
831
0
        {
832
0
            if (p >= e)
833
0
            {
834
0
                dst[count] = 0;
835
0
                return count;
836
0
            }
837
0
            unsigned char c = *reinterpret_cast<const unsigned char *>(p);
838
0
            if (c < 0xC2)
839
0
            {
840
                // ASCII or bad code.
841
0
                dst[count] = c;
842
0
                p++;
843
0
            }
844
0
            else
845
0
            {
846
0
                int len = 0;
847
0
                const unsigned int ucs = utf8decode(p, e, &len);
848
0
                p += len;
849
0
                if (ucs < 0x100)
850
0
                {
851
0
                    dst[count] = static_cast<char>(ucs);
852
0
                }
853
0
                else
854
0
                {
855
0
                    if (!bHaveWarned4)
856
0
                    {
857
0
                        bHaveWarned4 = true;
858
0
                        CPLError(
859
0
                            CE_Warning, CPLE_AppDefined,
860
0
                            "One or several characters couldn't be converted "
861
0
                            "correctly from UTF-8 to ISO-8859-1.  "
862
0
                            "This warning will not be emitted anymore.");
863
0
                    }
864
0
                    dst[count] = '?';
865
0
                }
866
0
            }
867
0
            if (++count >= dstlen)
868
0
            {
869
0
                dst[count - 1] = 0;
870
0
                break;
871
0
            }
872
0
        }
873
    // We filled dst, measure the rest:
874
0
    while (p < e)
875
0
    {
876
0
        if (!(*p & 0x80))
877
0
        {
878
0
            p++;
879
0
        }
880
0
        else
881
0
        {
882
0
            int len = 0;
883
0
            utf8decode(p, e, &len);
884
0
            p += len;
885
0
        }
886
0
        ++count;
887
0
    }
888
0
    return count;
889
0
}
890
891
/************************************************************************/
892
/*                             utf8fromwc()                             */
893
/************************************************************************/
894
/* Fetch the next code point from \a src for utf8fromwc().

    Reads the word at index \a *pi and advances \a *pi past everything
    consumed. The caller must guarantee \a *pi < \a srclen on entry.

    On Windows a high surrogate immediately followed by a low surrogate
    is combined into a single code point >= 0x10000 and both words are
    consumed; a mismatched half is returned unchanged. On other platforms
    a value greater than 0x10ffff is replaced by 0xfffd (REPLACEMENT
    CHARACTER).

    Both the writing and the measuring loop of utf8fromwc() go through
    this helper so that they always agree on how many words make up a
    character.
*/
static unsigned int utf8fromwcNextCodePoint(const wchar_t *src,
                                            unsigned srclen, unsigned int *pi)
{
    unsigned int ucs = static_cast<unsigned int>(src[(*pi)++]);
#ifdef _WIN32
    if (ucs >= 0xd800 && ucs <= 0xdbff && *pi < srclen &&
        src[*pi] >= 0xdc00 && src[*pi] <= 0xdfff)
    {
        // Surrogate pair: fold both halves into one code point.
        const unsigned int ucs2 = src[(*pi)++];
        ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
    }
#else
    (void)srclen;
    if (ucs > 0x10ffff)
    {
        // Illegal according to RFC 3629.
        ucs = 0xfffd;
    }
#endif
    return ucs;
}

/* Turn "wide characters" as returned by some system calls
    (especially on Windows) into UTF-8.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If greater or equal to
    \a dstlen then if you malloc a new array of size n+1 you will have
    the space needed for the entire string. If \a dstlen is zero then
    nothing is written and this call just measures the storage space
    needed.

    \a srclen is the number of words in \a src to convert. On Windows
    this is not necessarily the number of characters, due to there
    possibly being "surrogate pairs" in the UTF-16 encoding used.
    On Unix wchar_t is 32 bits and each location is a character.

    On Unix if a src word is greater than 0x10ffff then this is an
    illegal character according to RFC 3629. These are converted as
    though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
    range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
    illegal according to RFC 3629. However they are encoded as though
    they are legal, so that utf8towc will return the original data.

    On Windows "surrogate pairs" are converted to a single character
    and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
    pairs are converted as though they are individual characters.
*/
static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
                               unsigned srclen)
{
    unsigned int i = 0;
    unsigned int count = 0;

    // Phase 1: encode into dst while a whole character plus the
    // terminator still fits.
    if (dstlen)
    {
        while (true)
        {
            if (i >= srclen)
            {
                dst[count] = 0;
                return count;
            }
            const unsigned int ucs = utf8fromwcNextCodePoint(src, srclen, &i);
            if (ucs < 0x80U)
            {
                dst[count++] = static_cast<char>(ucs);
                if (count >= dstlen)
                {
                    // Buffer full: the last byte gives way to the
                    // terminator but is still counted.
                    dst[count - 1] = 0;
                    break;
                }
            }
            else if (ucs < 0x800U)
            {
                // 2 bytes.
                if (count + 2 >= dstlen)
                {
                    dst[count] = 0;
                    count += 2;
                    break;
                }
                dst[count++] = static_cast<char>(0xc0 | (ucs >> 6));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
            else if (ucs < 0x10000U)
            {
                // 3 bytes (includes lone surrogates and 0xfffd).
                if (count + 3 >= dstlen)
                {
                    dst[count] = 0;
                    count += 3;
                    break;
                }
                dst[count++] = static_cast<char>(0xe0 | (ucs >> 12));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
            else
            {
                // 4 bytes.
                if (count + 4 >= dstlen)
                {
                    dst[count] = 0;
                    count += 4;
                    break;
                }
                dst[count++] = static_cast<char>(0xf0 | (ucs >> 18));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 12) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
        }
    }

    // Phase 2: dst is full (or dstlen was zero), measure the rest. The
    // same helper as above is used, so surrogate pairs are recognised
    // exactly as in the writing loop.
    while (i < srclen)
    {
        const unsigned int ucs = utf8fromwcNextCodePoint(src, srclen, &i);
        if (ucs < 0x80U)
            count += 1;
        else if (ucs < 0x800U)
            count += 2;
        else if (ucs < 0x10000U)
            count += 3;
        else
            count += 4;
    }
    return count;
}
1036
1037
/************************************************************************/
1038
/*                             utf8froma()                              */
1039
/************************************************************************/
1040
1041
/* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.

    Bytes below 0x80 are copied unchanged; every other byte becomes a
    2-byte UTF-8 sequence. It is possible this should convert Microsoft's
    CP1252 to UTF-8 instead, which would translate the codes in the range
    0x80-0x9f to different characters. Currently it does not do this.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If greater or equal to
    \a dstlen then if you malloc a new array of size n+1 you will have
    the space needed for the entire string. If \a dstlen is zero then
    nothing is written and this call just measures the storage space
    needed.

    \a srclen is the number of bytes in \a src to convert.

    If the return value equals \a srclen then this indicates that
    no conversion is necessary, as only ASCII characters are in the
    string.
*/
static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
                          unsigned srclen)
{
    const unsigned char *pabyCur =
        reinterpret_cast<const unsigned char *>(src);
    const unsigned char *const pabyEnd = pabyCur + srclen;
    unsigned nOut = 0;

    // Phase 1: encode into dst while a whole character plus the
    // terminator still fits.
    if (dstlen != 0)
    {
        for (;;)
        {
            if (pabyCur >= pabyEnd)
            {
                dst[nOut] = 0;
                return nOut;
            }

            const unsigned char byLatin1 = *pabyCur;
            ++pabyCur;

            if (byLatin1 >= 0x80U)
            {
                // 2 bytes (note that CP1252 translate could make 3 bytes!)
                if (nOut + 2 >= dstlen)
                {
                    dst[nOut] = 0;
                    nOut += 2;
                    break;
                }
                dst[nOut] = static_cast<char>(0xc0 | (byLatin1 >> 6));
                dst[nOut + 1] = static_cast<char>(0x80 | (byLatin1 & 0x3F));
                nOut += 2;
            }
            else
            {
                dst[nOut] = static_cast<char>(byLatin1);
                ++nOut;
                if (nOut >= dstlen)
                {
                    // Buffer full: the last byte gives way to the
                    // terminator but is still counted.
                    dst[nOut - 1] = 0;
                    break;
                }
            }
        }
    }

    // Phase 2: dst is full (or dstlen was zero), measure the rest.
    for (; pabyCur < pabyEnd; ++pabyCur)
    {
        nOut += (*pabyCur < 0x80U) ? 1 : 2;
    }

    return nOut;
}
1117
1118
#ifdef _WIN32
1119
1120
/************************************************************************/
1121
/*                            CPLWin32Recode()                          */
1122
/************************************************************************/
1123
1124
/* Convert a CODEPAGE (i.e. normal c-string) byte stream
1125
     to another CODEPAGE (i.e. normal c-string) byte stream.
1126
1127
    \a src is the source c-string byte stream (including a null terminator).
1128
    \a src_code_page is the code page of the source c-string.
1129
    \a dst_code_page is destination c-string byte code page.
1130
1131
   UTF7          65000
1132
   UTF8          65001
1133
   OEM-US          437
1134
   OEM-ARABIC      720
1135
   OEM-GREEK       737
1136
   OEM-BALTIC      775
1137
   OEM-MLATIN1     850
1138
   OEM-LATIN2      852
1139
   OEM-CYRILLIC    855
1140
   OEM-TURKISH     857
1141
   OEM-MLATIN1P    858
1142
   OEM-HEBREW      862
1143
   OEM-RUSSIAN     866
1144
1145
   THAI            874
1146
   SJIS            932
1147
   GBK             936
1148
   KOREA           949
1149
   BIG5            950
1150
1151
   EUROPE         1250
1152
   CYRILLIC       1251
1153
   LATIN1         1252
1154
   GREEK          1253
1155
   TURKISH        1254
1156
   HEBREW         1255
1157
   ARABIC         1256
1158
   BALTIC         1257
1159
   VIETNAM        1258
1160
1161
   ISO-LATIN1    28591
1162
   ISO-LATIN2    28592
1163
   ISO-LATIN3    28593
1164
   ISO-BALTIC    28594
1165
   ISO-CYRILLIC  28595
1166
   ISO-ARABIC    28596
1167
   ISO-HEBREW    28598
1168
   ISO-TURKISH   28599
1169
   ISO-LATIN9    28605
1170
1171
   ISO-2022-JP   50220
1172
1173
*/
1174
1175
char *CPLWin32Recode(const char *src, unsigned src_code_page,
1176
                     unsigned dst_code_page)
1177
{
1178
    // Convert from source code page to Unicode.
1179
1180
    // Compute the length in wide characters.
1181
    int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1,
1182
                                   nullptr, 0);
1183
    if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
1184
    {
1185
        if (!bHaveWarned5)
1186
        {
1187
            bHaveWarned5 = true;
1188
            CPLError(
1189
                CE_Warning, CPLE_AppDefined,
1190
                "One or several characters could not be translated from CP%d. "
1191
                "This warning will not be emitted anymore.",
1192
                src_code_page);
1193
        }
1194
1195
        // Retry now without MB_ERR_INVALID_CHARS flag.
1196
        wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0);
1197
    }
1198
1199
    // Do the actual conversion.
1200
    wchar_t *tbuf =
1201
        static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
1202
    tbuf[wlen] = 0;
1203
    MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1);
1204
1205
    // Convert from Unicode to destination code page.
1206
1207
    // Compute the length in chars.
1208
    BOOL bUsedDefaultChar = FALSE;
1209
    int len = 0;
1210
    if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8)
1211
        len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1212
                                  nullptr, nullptr);
1213
    else
1214
        len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1215
                                  nullptr, &bUsedDefaultChar);
1216
    if (bUsedDefaultChar)
1217
    {
1218
        if (!bHaveWarned6)
1219
        {
1220
            bHaveWarned6 = true;
1221
            CPLError(
1222
                CE_Warning, CPLE_AppDefined,
1223
                "One or several characters could not be translated to CP%d. "
1224
                "This warning will not be emitted anymore.",
1225
                dst_code_page);
1226
        }
1227
    }
1228
1229
    // Do the actual conversion.
1230
    char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
1231
    WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr,
1232
                        nullptr);
1233
    pszResult[len] = 0;
1234
1235
    CPLFree(tbuf);
1236
1237
    return pszResult;
1238
}
1239
1240
#endif
1241
1242
/*
1243
** For now we disable the rest which is locale() related.  We may need
1244
** parts of it later.
1245
*/
1246
1247
#ifdef notdef
1248
1249
#ifdef _WIN32
1250
#include <windows.h>
1251
#endif
1252
1253
/*! Return true if the "locale" seems to indicate that UTF-8 encoding
    is used. If true the utf8tomb and utf8frommb don't do anything
    useful.

    <i>It is highly recommended that you change your system so this
    does return true.</i> On Windows this is done by setting the
    "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
    to a string containing the letters "utf" or "UTF" in it, or by
    deleting all $LC* and $LANG environment variables. In the future
    it is likely that all non-Asian Unix systems will return true,
    due to the compatibility of UTF-8 with ISO-8859-1.

    The answer is computed on the first call and cached in a function
    static, so later changes to the environment are not noticed.
*/
int utf8locale(void)
{
    // 2 means "not computed yet"; afterwards the value is 0 or 1.
    static int nCached = 2;
    if (nCached != 2)
        return nCached;

#ifdef _WIN32
    nCached = (GetACP() == CP_UTF8) ? 1 : 0;
#else
    // The first of these variables that is set and non-empty decides.
    static const char *const apszVars[] = {"LC_CTYPE", "LC_ALL", "LANG"};

    nCached = 1;  // assume UTF-8 if no locale
    for (const char *pszVar : apszVars)
    {
        const char *pszValue = getenv(pszVar);
        if (pszValue != nullptr && *pszValue != '\0')
        {
            const bool bHasUTF = strstr(pszValue, "utf") != nullptr ||
                                 strstr(pszValue, "UTF") != nullptr;
            nCached = bHasUTF ? 1 : 0;
            break;
        }
    }
#endif

    return nCached;
}
1285
1286
/*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1287
    used for filenames (and sometimes used for data in files).
1288
    Unfortunately due to stupid design you will have to do this as
1289
    needed for filenames. This is a bug on both Unix and Windows.
1290
1291
    Up to \a dstlen bytes are written to \a dst, including a null
1292
    terminator. The return value is the number of bytes that would be
1293
    written, not counting the null terminator. If greater or equal to
1294
    \a dstlen then if you malloc a new array of size n+1 you will have
1295
    the space needed for the entire string. If \a dstlen is zero then
1296
    nothing is written and this call just measures the storage space
1297
    needed.
1298
1299
    If utf8locale() returns true then this does not change the data.
1300
    It is copied and truncated as necessary to
1301
    the destination buffer and \a srclen is always returned.  */
1302
unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen)
1303
{
1304
    if (!utf8locale())
1305
    {
1306
#ifdef _WIN32
1307
        wchar_t lbuf[1024] = {};
1308
        wchar_t *buf = lbuf;
1309
        unsigned length = utf8towc(src, srclen, buf, 1024);
1310
        unsigned ret;
1311
        if (length >= 1024)
1312
        {
1313
            buf =
1314
                static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1315
            utf8towc(src, srclen, buf, length + 1);
1316
        }
1317
        if (dstlen)
1318
        {
1319
            // apparently this does not null-terminate, even though msdn
1320
            // documentation claims it does:
1321
            ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0,
1322
                                      0);
1323
            dst[ret] = 0;
1324
        }
1325
        // if it overflows or measuring length, get the actual length:
1326
        if (dstlen == 0 || ret >= dstlen - 1)
1327
            ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1328
        if (buf != lbuf)
1329
            free((void *)buf);
1330
        return ret;
1331
#else
1332
        wchar_t lbuf[1024] = {};
1333
        wchar_t *buf = lbuf;
1334
        unsigned length = utf8towc(src, srclen, buf, 1024);
1335
        if (length >= 1024)
1336
        {
1337
            buf =
1338
                static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1339
            utf8towc(src, srclen, buf, length + 1);
1340
        }
1341
        int ret = 0;
1342
        if (dstlen)
1343
        {
1344
            ret = wcstombs(dst, buf, dstlen);
1345
            if (ret >= dstlen - 1)
1346
                ret = wcstombs(0, buf, 0);
1347
        }
1348
        else
1349
        {
1350
            ret = wcstombs(0, buf, 0);
1351
        }
1352
        if (buf != lbuf)
1353
            free((void *)buf);
1354
        if (ret >= 0)
1355
            return (unsigned)ret;
1356
            // On any errors we return the UTF-8 as raw text...
1357
#endif
1358
    }
1359
    // Identity transform:
1360
    if (srclen < dstlen)
1361
    {
1362
        memcpy(dst, src, srclen);
1363
        dst[srclen] = 0;
1364
    }
1365
    else
1366
    {
1367
        memcpy(dst, src, dstlen - 1);
1368
        dst[dstlen - 1] = 0;
1369
    }
1370
    return srclen;
1371
}
1372
1373
/*! Convert a filename from the locale-specific multibyte encoding
1374
    used by Windows to UTF-8 as used by FLTK.
1375
1376
    Up to \a dstlen bytes are written to \a dst, including a null
1377
    terminator. The return value is the number of bytes that would be
1378
    written, not counting the null terminator. If greater or equal to
1379
    \a dstlen then if you malloc a new array of size n+1 you will have
1380
    the space needed for the entire string. If \a dstlen is zero then
1381
    nothing is written and this call just measures the storage space
1382
    needed.
1383
1384
    On Unix or on Windows when a UTF-8 locale is in effect, this
1385
    does not change the data. It is copied and truncated as necessary to
1386
    the destination buffer and \a srclen is always returned.
1387
    You may also want to check if utf8test() returns non-zero, so that
1388
    the filesystem can store filenames in UTF-8 encoding regardless of
1389
    the locale.
1390
*/
1391
unsigned utf8frommb(char *dst, unsigned dstlen, const char *src,
1392
                    unsigned srclen)
1393
{
1394
    if (!utf8locale())
1395
    {
1396
#ifdef _WIN32
1397
        wchar_t lbuf[1024] = {};
1398
        wchar_t *buf = lbuf;
1399
        unsigned ret;
1400
        const unsigned length =
1401
            MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1402
        if (length >= 1024)
1403
        {
1404
            length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1405
            buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
1406
            MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1407
        }
1408
        ret = utf8fromwc(dst, dstlen, buf, length);
1409
        if (buf != lbuf)
1410
            free(buf);
1411
        return ret;
1412
#else
1413
        wchar_t lbuf[1024] = {};
1414
        wchar_t *buf = lbuf;
1415
        const int length = mbstowcs(buf, src, 1024);
1416
        if (length >= 1024)
1417
        {
1418
            length = mbstowcs(0, src, 0) + 1;
1419
            buf =
1420
                static_cast<wchar_t *>(malloc(length * sizeof(unsigned short)));
1421
            mbstowcs(buf, src, length);
1422
        }
1423
        if (length >= 0)
1424
        {
1425
            const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
1426
            if (buf != lbuf)
1427
                free(buf);
1428
            return ret;
1429
        }
1430
        // Errors in conversion return the UTF-8 unchanged.
1431
#endif
1432
    }
1433
    // Identity transform:
1434
    if (srclen < dstlen)
1435
    {
1436
        memcpy(dst, src, srclen);
1437
        dst[srclen] = 0;
1438
    }
1439
    else
1440
    {
1441
        memcpy(dst, src, dstlen - 1);
1442
        dst[dstlen - 1] = 0;
1443
    }
1444
    return srclen;
1445
}
1446
1447
#endif  // def notdef - disabled locale specific stuff.
1448
1449
/*! Examines the first \a srclen bytes in \a src and return a verdict
1450
    on whether it is UTF-8 or not.
1451
    - Returns 0 if there is any illegal UTF-8 sequences, using the
1452
      same rules as utf8decode(). Note that some UCS values considered
1453
      illegal by RFC 3629, such as 0xffff, are considered legal by this.
1454
    - Returns 1 if there are only single-byte characters (i.e. no bytes
1455
      have the high bit set). This is legal UTF-8, but also indicates
1456
      plain ASCII. It also returns 1 if \a srclen is zero.
1457
    - Returns 2 if there are only characters less than 0x800.
1458
    - Returns 3 if there are only characters less than 0x10000.
1459
    - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1460
1461
    Because there are many illegal sequences in UTF-8, it is almost
1462
    impossible for a string in another encoding to be confused with
1463
    UTF-8. This is very useful for transitioning Unix to UTF-8
1464
    filenames, you can simply test each filename with this to decide
1465
    if it is UTF-8 or in the locale encoding. My hope is that if
1466
    this is done we will be able to cleanly transition to a locale-less
1467
    encoding.
1468
*/
1469
1470
static int utf8test(const char *src, unsigned srclen)
1471
0
{
1472
0
    int ret = 1;
1473
0
    const char *p = src;
1474
0
    const char *e = src + srclen;
1475
0
    while (p < e)
1476
0
    {
1477
0
        if (*p == 0)
1478
0
            return 0;
1479
0
        if (*p & 0x80)
1480
0
        {
1481
0
            int len = 0;
1482
0
            utf8decode(p, e, &len);
1483
0
            if (len < 2)
1484
0
                return 0;
1485
0
            if (len > ret)
1486
0
                ret = len;
1487
0
            p += len;
1488
0
        }
1489
0
        else
1490
0
        {
1491
0
            p++;
1492
0
        }
1493
0
    }
1494
0
    return ret;
1495
0
}