Coverage Report

Created: 2025-11-16 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gdal/port/cpl_recode_stub.cpp
Line
Count
Source
1
/**********************************************************************
2
 *
3
 * Name:     cpl_recode_stub.cpp
4
 * Project:  CPL - Common Portability Library
5
 * Purpose:  Character set recoding and char/wchar_t conversions, stub
6
 *           implementation to be used if iconv() functionality is not
7
 *           available.
8
 * Author:   Frank Warmerdam, warmerdam@pobox.com
9
 *
10
 * The bulk of this code is derived from the utf.c module from FLTK. It
11
 * was originally downloaded from:
12
 *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
13
 *
14
 **********************************************************************
15
 * Copyright (c) 2008, Frank Warmerdam
16
 * Copyright 2006 by Bill Spitzak and others.
17
 * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
18
 *
19
 * Permission to use, copy, modify, and distribute this software for any
20
 * purpose with or without fee is hereby granted, provided that the above
21
 * copyright notice and this permission notice appear in all copies.
22
 *
23
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30
 **********************************************************************/
31
32
#include "cpl_port.h"
33
#include "cpl_string.h"
34
35
#include <cstring>
36
37
#include "cpl_conv.h"
38
#include "cpl_error.h"
39
#include "cpl_character_sets.c"
40
41
static unsigned utf8decode(const char *p, const char *end, int *len);
42
static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
43
                         unsigned dstlen);
44
static unsigned utf8toa(const char *src, unsigned srclen, char *dst,
45
                        unsigned dstlen);
46
static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
47
                           unsigned srclen);
48
static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
49
                          unsigned srclen);
50
static int utf8test(const char *src, unsigned srclen);
51
52
#ifdef _WIN32
53
54
#include <windows.h>
55
#include <winnls.h>
56
57
static char *CPLWin32Recode(const char *src, unsigned src_code_page,
58
                            unsigned dst_code_page) CPL_RETURNS_NONNULL;
59
#endif
60
61
/* used by cpl_recode.cpp */
62
extern void CPLClearRecodeStubWarningFlags();
63
extern char *CPLRecodeStub(const char *, const char *,
64
                           const char *) CPL_RETURNS_NONNULL;
65
extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
66
                                    const char *);
67
extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
68
69
/************************************************************************/
70
/* ==================================================================== */
71
/*      Stub Implementation not depending on iconv() or WIN32 API.      */
72
/* ==================================================================== */
73
/************************************************************************/
74
75
// One-shot latches: each is set the first time the matching warning is
// emitted so that the warning is not repeated.
static bool bHaveWarned1 = false;
static bool bHaveWarned2 = false;
static bool bHaveWarned3 = false;
static bool bHaveWarned4 = false;
static bool bHaveWarned5 = false;
static bool bHaveWarned6 = false;

/************************************************************************/
/*                 CPLClearRecodeStubWarningFlags()                     */
/************************************************************************/

/** Re-arm every one-shot recode warning by resetting all latches. */
void CPLClearRecodeStubWarningFlags()
{
    bool *const apbLatches[] = {&bHaveWarned1, &bHaveWarned2, &bHaveWarned3,
                                &bHaveWarned4, &bHaveWarned5, &bHaveWarned6};
    for (bool *pbLatch : apbLatches)
        *pbLatch = false;
}
95
96
/************************************************************************/
97
/*                           CPLRecodeStub()                            */
98
/************************************************************************/
99
100
/**
101
 * Convert a string from a source encoding to a destination encoding.
102
 *
103
 * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
104
 * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
105
 * <ul>
106
 *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
107
 *  fact)</li>
108
 *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
109
 *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
110
 * </ul>
111
 *
112
 * If an error occurs an error may, or may not be posted with CPLError().
113
 *
114
 * @param pszSource a NULL terminated string.
115
 * @param pszSrcEncoding the source encoding.
116
 * @param pszDstEncoding the destination encoding.
117
 *
118
 * @return a NULL terminated string which should be freed with CPLFree().
119
 */
120
121
char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding,
122
                    const char *pszDstEncoding)
123
124
0
{
125
    /* -------------------------------------------------------------------- */
126
    /*      If the source or destination is current locale(), we change     */
127
    /*      it to ISO8859-1 since our stub implementation does not          */
128
    /*      attempt to address locales properly.                            */
129
    /* -------------------------------------------------------------------- */
130
131
0
    if (pszSrcEncoding[0] == '\0')
132
0
        pszSrcEncoding = CPL_ENC_ISO8859_1;
133
134
0
    if (pszDstEncoding[0] == '\0')
135
0
        pszDstEncoding = CPL_ENC_ISO8859_1;
136
137
    /* -------------------------------------------------------------------- */
138
    /*      ISO8859 to UTF8                                                 */
139
    /* -------------------------------------------------------------------- */
140
0
    if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 &&
141
0
        strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
142
0
    {
143
0
        const int nCharCount = static_cast<int>(strlen(pszSource));
144
0
        char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
145
146
0
        utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
147
148
0
        return pszResult;
149
0
    }
150
151
    /* -------------------------------------------------------------------- */
152
    /*      UTF8 to ISO8859                                                 */
153
    /* -------------------------------------------------------------------- */
154
0
    if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
155
0
        strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
156
0
    {
157
0
        int nCharCount = static_cast<int>(strlen(pszSource));
158
0
        char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
159
160
0
        utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
161
162
0
        return pszResult;
163
0
    }
164
165
    // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
166
0
    if (EQUAL(pszDstEncoding, CPL_ENC_UTF8))
167
0
    {
168
0
        const auto pConvTable = CPLGetConversionTableToUTF8(pszSrcEncoding);
169
0
        if (pConvTable)
170
0
        {
171
0
            const auto convTable = *pConvTable;
172
0
            const size_t nCharCount = strlen(pszSource);
173
0
            char *pszResult =
174
0
                static_cast<char *>(CPLCalloc(1, nCharCount * 3 + 1));
175
0
            size_t iDst = 0;
176
0
            unsigned char *pabyResult =
177
0
                reinterpret_cast<unsigned char *>(pszResult);
178
0
            for (size_t i = 0; i < nCharCount; ++i)
179
0
            {
180
0
                const unsigned char nChar =
181
0
                    static_cast<unsigned char>(pszSource[i]);
182
0
                if (nChar <= 127)
183
0
                {
184
0
                    pszResult[iDst] = pszSource[i];
185
0
                    ++iDst;
186
0
                }
187
0
                else
188
0
                {
189
0
                    const unsigned char nShiftedChar = nChar - 128;
190
0
                    if (convTable[nShiftedChar][0])
191
0
                    {
192
0
                        pabyResult[iDst] = convTable[nShiftedChar][0];
193
0
                        ++iDst;
194
0
                        CPLAssert(convTable[nShiftedChar][1]);
195
0
                        pabyResult[iDst] = convTable[nShiftedChar][1];
196
0
                        ++iDst;
197
0
                        if (convTable[nShiftedChar][2])
198
0
                        {
199
0
                            pabyResult[iDst] = convTable[nShiftedChar][2];
200
0
                            ++iDst;
201
0
                        }
202
0
                    }
203
0
                    else
204
0
                    {
205
                        // Skip the invalid sequence in the input string.
206
0
                        if (!bHaveWarned2)
207
0
                        {
208
0
                            bHaveWarned2 = true;
209
0
                            CPLError(CE_Warning, CPLE_AppDefined,
210
0
                                     "One or several characters couldn't be "
211
0
                                     "converted correctly from %s to %s. "
212
0
                                     "This warning will not be emitted anymore",
213
0
                                     pszSrcEncoding, pszDstEncoding);
214
0
                        }
215
0
                    }
216
0
                }
217
0
            }
218
219
0
            pszResult[iDst] = 0;
220
0
            return pszResult;
221
0
        }
222
0
    }
223
224
#ifdef _WIN32
225
    const auto MapEncodingToWindowsCodePage = [](const char *pszEncoding)
226
    {
227
        // Cf https://learn.microsoft.com/fr-fr/windows/win32/intl/code-page-identifiers
228
        if (STARTS_WITH(pszEncoding, "CP"))
229
        {
230
            const int nCode = atoi(pszEncoding + strlen("CP"));
231
            if (nCode > 0)
232
                return nCode;
233
            else if (EQUAL(pszEncoding, "CP_OEMCP"))
234
                return CP_OEMCP;
235
            else if (EQUAL(pszEncoding, "CP_ACP"))
236
                return CP_ACP;
237
        }
238
        else if (STARTS_WITH(pszEncoding, "WINDOWS-"))
239
        {
240
            const int nCode = atoi(pszEncoding + strlen("WINDOWS-"));
241
            if (nCode > 0)
242
                return nCode;
243
        }
244
        else if (STARTS_WITH(pszEncoding, "ISO-8859-"))
245
        {
246
            const int nCode = atoi(pszEncoding + strlen("ISO-8859-"));
247
            if ((nCode >= 1 && nCode <= 9) || nCode == 13 || nCode == 15)
248
                return 28590 + nCode;
249
        }
250
251
        // Return a negative value, since CP_ACP = 0
252
        return -1;
253
    };
254
255
    /* ---------------------------------------------------------------------*/
256
    /*     XXX to UTF8                                                      */
257
    /* ---------------------------------------------------------------------*/
258
    if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
259
    {
260
        const int nCode = MapEncodingToWindowsCodePage(pszSrcEncoding);
261
        if (nCode >= 0)
262
        {
263
            return CPLWin32Recode(pszSource, nCode, CP_UTF8);
264
        }
265
    }
266
267
    /* ---------------------------------------------------------------------*/
268
    /*      UTF8 to XXX                                                     */
269
    /* ---------------------------------------------------------------------*/
270
    if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0)
271
    {
272
        const int nCode = MapEncodingToWindowsCodePage(pszDstEncoding);
273
        if (nCode >= 0)
274
        {
275
            return CPLWin32Recode(pszSource, CP_UTF8, nCode);
276
        }
277
    }
278
#endif
279
280
    /* -------------------------------------------------------------------- */
281
    /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
282
    /*      a one-time warning.                                             */
283
    /* -------------------------------------------------------------------- */
284
0
    if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
285
0
    {
286
0
        const int nCharCount = static_cast<int>(strlen(pszSource));
287
0
        char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
288
289
0
        if (!bHaveWarned1)
290
0
        {
291
0
            bHaveWarned1 = true;
292
0
            CPLError(CE_Warning, CPLE_AppDefined,
293
0
                     "Recode from %s to UTF-8 not supported, "
294
0
                     "treated as ISO-8859-1 to UTF-8.",
295
0
                     pszSrcEncoding);
296
0
        }
297
298
0
        utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
299
300
0
        return pszResult;
301
0
    }
302
303
    /* -------------------------------------------------------------------- */
304
    /*      Everything else is treated as a no-op with a warning.           */
305
    /* -------------------------------------------------------------------- */
306
0
    {
307
0
        if (!bHaveWarned3)
308
0
        {
309
0
            bHaveWarned3 = true;
310
0
            CPLError(CE_Warning, CPLE_AppDefined,
311
0
                     "Recode from %s to %s not supported, no change applied.",
312
0
                     pszSrcEncoding, pszDstEncoding);
313
0
        }
314
315
0
        return CPLStrdup(pszSource);
316
0
    }
317
0
}
318
319
/************************************************************************/
320
/*                       CPLRecodeFromWCharStub()                       */
321
/************************************************************************/
322
323
/**
324
 * Convert wchar_t string to UTF-8.
325
 *
326
 * Convert a wchar_t string into a multibyte utf-8 string.  The only
327
 * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
328
 * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
329
 * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
330
 * may also be supported.
331
 *
332
 * Note that the wchar_t type varies in size on different systems. On
333
 * win32 it is normally 2 bytes, and on unix 4 bytes.
334
 *
335
 * If an error occurs an error may, or may not be posted with CPLError().
336
 *
337
 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
338
 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
339
 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
340
 *
341
 * @return a zero terminated multi-byte string which should be freed with
342
 * CPLFree(), or NULL if an error occurs.
343
 */
344
345
char *CPLRecodeFromWCharStub(const wchar_t *pwszSource,
346
                             const char *pszSrcEncoding,
347
                             const char *pszDstEncoding)
348
349
0
{
350
    /* -------------------------------------------------------------------- */
351
    /*      We try to avoid changes of character set.  We are just          */
352
    /*      providing for unicode to unicode.                               */
353
    /* -------------------------------------------------------------------- */
354
0
    if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
355
0
        strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
356
0
        strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 &&
357
0
        strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 &&
358
0
        strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0)
359
0
    {
360
0
        CPLError(CE_Failure, CPLE_AppDefined,
361
0
                 "Stub recoding implementation does not support "
362
0
                 "CPLRecodeFromWCharStub(...,%s,%s)",
363
0
                 pszSrcEncoding, pszDstEncoding);
364
0
        return nullptr;
365
0
    }
366
367
    /* -------------------------------------------------------------------- */
368
    /*      What is the source length.                                      */
369
    /* -------------------------------------------------------------------- */
370
0
    int nSrcLen = 0;
371
372
0
    while (pwszSource[nSrcLen] != 0)
373
0
        nSrcLen++;
374
375
    /* -------------------------------------------------------------------- */
376
    /*      Allocate destination buffer plenty big.                         */
377
    /* -------------------------------------------------------------------- */
378
0
    const int nDstBufSize = nSrcLen * 4 + 1;
379
    // Nearly worst case.
380
0
    char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
381
382
0
    if (nSrcLen == 0)
383
0
    {
384
0
        pszResult[0] = '\0';
385
0
        return pszResult;
386
0
    }
387
388
    /* -------------------------------------------------------------------- */
389
    /*      Convert, and confirm we had enough space.                       */
390
    /* -------------------------------------------------------------------- */
391
0
    const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen);
392
0
    if (nDstLen >= nDstBufSize)
393
0
    {
394
0
        CPLAssert(false);  // too small!
395
0
        return nullptr;
396
0
    }
397
398
    /* -------------------------------------------------------------------- */
399
    /*      If something other than UTF-8 was requested, recode now.        */
400
    /* -------------------------------------------------------------------- */
401
0
    if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
402
0
        return pszResult;
403
404
0
    char *pszFinalResult =
405
0
        CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding);
406
407
0
    CPLFree(pszResult);
408
409
0
    return pszFinalResult;
410
0
}
411
412
/************************************************************************/
413
/*                        CPLRecodeToWCharStub()                        */
414
/************************************************************************/
415
416
/**
417
 * Convert UTF-8 string to a wchar_t string.
418
 *
419
 * Convert a 8bit, multi-byte per character input string into a wide
420
 * character (wchar_t) string.  The only guaranteed supported source encodings
421
 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1).  The only
422
 * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
423
 * and destination encodings may be supported depending on the underlying
424
 * implementation.
425
 *
426
 * Note that the wchar_t type varies in size on different systems. On
427
 * win32 it is normally 2 bytes, and on unix 4 bytes.
428
 *
429
 * If an error occurs an error may, or may not be posted with CPLError().
430
 *
431
 * @param pszSource input multi-byte character string.
432
 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
433
 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
434
 *
435
 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
436
 * NULL on error.
437
 *
438
 */
439
440
wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding,
441
                              const char *pszDstEncoding)
442
443
0
{
444
0
    char *pszUTF8Source = const_cast<char *>(pszSource);
445
446
0
    if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
447
0
        strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0)
448
0
    {
449
0
        pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
450
0
        if (pszUTF8Source == nullptr)
451
0
            return nullptr;
452
0
    }
453
454
    /* -------------------------------------------------------------------- */
455
    /*      We try to avoid changes of character set.  We are just          */
456
    /*      providing for unicode to unicode.                               */
457
    /* -------------------------------------------------------------------- */
458
0
    if (strcmp(pszDstEncoding, "WCHAR_T") != 0 &&
459
0
        strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 &&
460
0
        strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 &&
461
0
        strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0)
462
0
    {
463
0
        CPLError(CE_Failure, CPLE_AppDefined,
464
0
                 "Stub recoding implementation does not support "
465
0
                 "CPLRecodeToWCharStub(...,%s,%s)",
466
0
                 pszSrcEncoding, pszDstEncoding);
467
0
        if (pszUTF8Source != pszSource)
468
0
            CPLFree(pszUTF8Source);
469
0
        return nullptr;
470
0
    }
471
472
    /* -------------------------------------------------------------------- */
473
    /*      Do the UTF-8 to UCS-2 recoding.                                 */
474
    /* -------------------------------------------------------------------- */
475
0
    int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
476
0
    wchar_t *pwszResult =
477
0
        static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
478
479
0
    utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1);
480
481
0
    if (pszUTF8Source != pszSource)
482
0
        CPLFree(pszUTF8Source);
483
484
0
    return pwszResult;
485
0
}
486
487
/************************************************************************/
488
/*                                 CPLIsUTF8()                          */
489
/************************************************************************/
490
491
/**
492
 * Test if a string is encoded as UTF-8.
493
 *
494
 * @param pabyData input string to test
495
 * @param nLen length of the input string, or -1 if the function must compute
496
 *             the string length. In which case it must be null terminated.
497
 * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
498
 *
499
 */
500
int CPLIsUTF8(const char *pabyData, int nLen)
501
0
{
502
0
    if (nLen < 0)
503
0
        nLen = static_cast<int>(strlen(pabyData));
504
0
    return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
505
0
}
506
507
/************************************************************************/
508
/* ==================================================================== */
509
/*      UTF.C code from FLTK with some modifications.                   */
510
/* ==================================================================== */
511
/************************************************************************/
512
513
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
   If this is on utf8decode will correctly map most (perhaps all)
   human-readable text that is in ISO-8859-1. This may allow you
   to completely ignore character sets in your code because virtually
   everything is either ISO-8859-1 or UTF-8.
*/
#define ERRORS_TO_ISO8859_1 1

/* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
   Unicode index for Microsoft's CP1252 character set. You should
   also set ERRORS_TO_ISO8859_1. With this a huge amount of more
   available text (such as all web pages) are correctly converted
   to Unicode.
*/
#define ERRORS_TO_CP1252 1

/* A number of Unicode code points are in fact illegal and should not
   be produced by a UTF-8 converter. Turning this on will replace the
   bytes in those encodings with errors. If you do this then converting
   arbitrary 16-bit data to UTF-8 and then back is not an identity,
   which will probably break a lot of software.
*/
#define STRICT_RFC3629 0

#if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode:
constexpr unsigned short cp1252[32] = {
    0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
    0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178};
#endif

/************************************************************************/
/*                             utf8decode()                             */
/************************************************************************/

/*
    Decode a single UTF-8 encoded character starting at \e p. The
    resulting Unicode value (in the range 0-0x10ffff) is returned,
    and \e len is set to the number of bytes in the UTF-8 encoding
    (adding \e len to \e p will point at the next character).

    \e p must point at a readable byte (p < end); bytes at or after
    \e end are never read.

    If \a p points at an illegal UTF-8 encoding, including one that
    would go past \e end, or where a code uses more bytes than
    necessary, then *reinterpret_cast<const unsigned char*>(p) is
    returned as is (or as 0xfffd if ERRORS_TO_ISO8859_1 is 0) and \e len
    is set to 1. When ERRORS_TO_CP1252 is set, a first byte in the
    0x80-0x9f range is translated as though it is in the Microsoft
    CP1252 character set.
    Treating errors this way allows this to decode almost any
    ISO-8859-1 or CP1252 text that has been mistakenly placed where
    UTF-8 is expected, and has proven very useful.

    If you want errors to be converted to error characters (as the
    standards recommend), adding a test to see if the length is
    unexpectedly 1 will work:

\code
    if( *p & 0x80 )
    {  // What should be a multibyte encoding.
      code = utf8decode(p, end, &len);
      if( len<2 ) code = 0xFFFD;  // Turn errors into REPLACEMENT CHARACTER.
    }
    else
    {  // Handle the 1-byte utf8 encoding:
      code = *p;
      len = 1;
    }
\endcode

    Direct testing for the 1-byte case (as shown above) will also
    speed up the scanning of strings where the majority of characters
    are ASCII.
*/
static unsigned utf8decode(const char *p, const char *end, int *len)
{
    const unsigned char *const s = reinterpret_cast<const unsigned char *>(p);
    const unsigned char c = s[0];

    // Number of bytes available from p. Comparing this count, instead of
    // forming p + k, avoids pointer arithmetic past one-past-the-end.
    const auto nAvail = end - p;

    // Every path except a successful multi-byte decode consumes one byte.
    *len = 1;

    if (c < 0x80)
        return c;  // Plain ASCII.

#if ERRORS_TO_CP1252
    if (c < 0xa0)
        return cp1252[c - 0x80];
#endif

    // Value reported for any malformed sequence.
#if ERRORS_TO_ISO8859_1
    const unsigned nBadByte = c;
#else
    const unsigned nBadByte = 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif

    // Stray continuation bytes, overlong 2-byte leads (0xc0, 0xc1) and
    // leads beyond 0xf4 can never start a valid sequence.
    if (c < 0xc2 || c > 0xf4)
        return nBadByte;

    if (nAvail < 2 || (s[1] & 0xc0) != 0x80)
        return nBadByte;

    if (c < 0xe0)
    {
        *len = 2;
        return ((c & 0x1fu) << 6) + (s[1] & 0x3fu);
    }

    if (c < 0xf0)
    {
        // Three-byte sequence.
        if (c == 0xe0 && s[1] < 0xa0)
            return nBadByte;  // Overlong encoding.
#if STRICT_RFC3629
        // RFC 3629 says surrogate chars are illegal.
        if (c == 0xed && s[1] >= 0xa0)
            return nBadByte;
#endif
        if (nAvail < 3 || (s[2] & 0xc0) != 0x80)
            return nBadByte;
#if STRICT_RFC3629
        // 0xfffe and 0xffff are also illegal characters. Checked only once
        // s[2] is known to be readable.
        if (c == 0xef && s[1] == 0xbf && s[2] >= 0xbe)
            return nBadByte;
#endif
        *len = 3;
        return ((c & 0x0fu) << 12) + ((s[1] & 0x3fu) << 6) + (s[2] & 0x3fu);
    }

    // Four-byte sequence: c is in 0xf0..0xf4.
    if (c == 0xf0 && s[1] < 0x90)
        return nBadByte;  // Overlong encoding.
    if (c == 0xf4 && s[1] > 0x8f)
        return nBadByte;  // After 0x10ffff.
    if (nAvail < 4 || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80)
        return nBadByte;
#if STRICT_RFC3629
    // RFC 3629 says all codes ending in fffe or ffff are illegal:
    if ((s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe)
        return nBadByte;
#endif
    *len = 4;
    return ((c & 0x07u) << 18) + ((s[1] & 0x3fu) << 12) +
           ((s[2] & 0x3fu) << 6) + (s[3] & 0x3fu);
}
684
685
/************************************************************************/
686
/*                              utf8towc()                              */
687
/************************************************************************/
688
689
/*  Convert a UTF-8 sequence into an array of wchar_t. These
690
    are used by some system calls, especially on Windows.
691
692
    \a src points at the UTF-8, and \a srclen is the number of bytes to
693
    convert.
694
695
    \a dst points at an array to write, and \a dstlen is the number of
696
    locations in this array. At most \a dstlen-1 words will be
697
    written there, plus a 0 terminating word. Thus this function
698
    will never overwrite the buffer and will always return a
699
    zero-terminated string. If \a dstlen is zero then \a dst can be
700
    null and no data is written, but the length is returned.
701
702
    The return value is the number of words that \e would be written
703
    to \a dst if it were long enough, not counting the terminating
704
    zero. If the return value is greater or equal to \a dstlen it
705
    indicates truncation, you can then allocate a new array of size
706
    return+1 and call this again.
707
708
    Errors in the UTF-8 are converted as though each byte in the
709
    erroneous string is in the Microsoft CP1252 encoding. This allows
710
    ISO-8859-1 text mistakenly identified as UTF-8 to be printed
711
    correctly.
712
713
    Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
714
    and most other systems. Where wchar_t is 16 bits, Unicode
715
    characters in the range 0x10000 to 0x10ffff are converted to
716
    "surrogate pairs" which take two words each (this is called UTF-16
717
    encoding). If wchar_t is 32 bits this rather nasty problem is
718
    avoided.
719
*/
720
static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
721
                         unsigned dstlen)
722
0
{
723
0
    const char *p = src;
724
0
    const char *e = src + srclen;
725
0
    unsigned count = 0;
726
0
    if (dstlen)
727
0
        while (true)
728
0
        {
729
0
            if (p >= e)
730
0
            {
731
0
                dst[count] = 0;
732
0
                return count;
733
0
            }
734
0
            if (!(*p & 0x80))
735
0
            {
736
                // ASCII
737
0
                dst[count] = *p++;
738
0
            }
739
0
            else
740
0
            {
741
0
                int len = 0;
742
0
                unsigned ucs = utf8decode(p, e, &len);
743
0
                p += len;
744
#ifdef _WIN32
745
                if (ucs < 0x10000)
746
                {
747
                    dst[count] = static_cast<wchar_t>(ucs);
748
                }
749
                else
750
                {
751
                    // Make a surrogate pair:
752
                    if (count + 2 >= dstlen)
753
                    {
754
                        dst[count] = 0;
755
                        count += 2;
756
                        break;
757
                    }
758
                    dst[count] = static_cast<wchar_t>(
759
                        (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800);
760
                    dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00);
761
                }
762
#else
763
0
                dst[count] = static_cast<wchar_t>(ucs);
764
0
#endif
765
0
            }
766
0
            if (++count == dstlen)
767
0
            {
768
0
                dst[count - 1] = 0;
769
0
                break;
770
0
            }
771
0
        }
772
    // We filled dst, measure the rest:
773
0
    while (p < e)
774
0
    {
775
0
        if (!(*p & 0x80))
776
0
        {
777
0
            p++;
778
0
        }
779
0
        else
780
0
        {
781
0
            int len = 0;
782
#ifdef _WIN32
783
            const unsigned ucs = utf8decode(p, e, &len);
784
            p += len;
785
            if (ucs >= 0x10000)
786
                ++count;
787
#else
788
0
            utf8decode(p, e, &len);
789
0
            p += len;
790
0
#endif
791
0
        }
792
0
        ++count;
793
0
    }
794
795
0
    return count;
796
0
}
797
798
/************************************************************************/
799
/*                              utf8toa()                               */
800
/************************************************************************/
801
/* Convert a UTF-8 sequence into an array of 1-byte characters.
802
803
    If the UTF-8 decodes to a character greater than 0xff then it is
804
    replaced with '?'.
805
806
    Errors in the UTF-8 are converted as individual bytes, same as
807
    utf8decode() does. This allows ISO-8859-1 text mistakenly identified
808
    as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
809
810
    \a src points at the UTF-8, and \a srclen is the number of bytes to
811
    convert.
812
813
    Up to \a dstlen bytes are written to \a dst, including a null
814
    terminator. The return value is the number of bytes that would be
815
    written, not counting the null terminator. If greater or equal to
816
    \a dstlen then if you malloc a new array of size n+1 you will have
817
    the space needed for the entire string. If \a dstlen is zero then
818
    nothing is written and this call just measures the storage space
819
    needed.
820
*/
821
static unsigned int utf8toa(const char *src, unsigned srclen, char *dst,
822
                            unsigned dstlen)
823
0
{
824
0
    const char *p = src;
825
0
    const char *e = src + srclen;
826
0
    unsigned int count = 0;
827
0
    if (dstlen)
828
0
        while (true)
829
0
        {
830
0
            if (p >= e)
831
0
            {
832
0
                dst[count] = 0;
833
0
                return count;
834
0
            }
835
0
            unsigned char c = *reinterpret_cast<const unsigned char *>(p);
836
0
            if (c < 0xC2)
837
0
            {
838
                // ASCII or bad code.
839
0
                dst[count] = c;
840
0
                p++;
841
0
            }
842
0
            else
843
0
            {
844
0
                int len = 0;
845
0
                const unsigned int ucs = utf8decode(p, e, &len);
846
0
                p += len;
847
0
                if (ucs < 0x100)
848
0
                {
849
0
                    dst[count] = static_cast<char>(ucs);
850
0
                }
851
0
                else
852
0
                {
853
0
                    if (!bHaveWarned4)
854
0
                    {
855
0
                        bHaveWarned4 = true;
856
0
                        CPLError(
857
0
                            CE_Warning, CPLE_AppDefined,
858
0
                            "One or several characters couldn't be converted "
859
0
                            "correctly from UTF-8 to ISO-8859-1.  "
860
0
                            "This warning will not be emitted anymore.");
861
0
                    }
862
0
                    dst[count] = '?';
863
0
                }
864
0
            }
865
0
            if (++count >= dstlen)
866
0
            {
867
0
                dst[count - 1] = 0;
868
0
                break;
869
0
            }
870
0
        }
871
    // We filled dst, measure the rest:
872
0
    while (p < e)
873
0
    {
874
0
        if (!(*p & 0x80))
875
0
        {
876
0
            p++;
877
0
        }
878
0
        else
879
0
        {
880
0
            int len = 0;
881
0
            utf8decode(p, e, &len);
882
0
            p += len;
883
0
        }
884
0
        ++count;
885
0
    }
886
0
    return count;
887
0
}
888
889
/************************************************************************/
890
/*                             utf8fromwc()                             */
891
/************************************************************************/
892
/* Turn "wide characters" as returned by some system calls
893
    (especially on Windows) into UTF-8.
894
895
    Up to \a dstlen bytes are written to \a dst, including a null
896
    terminator. The return value is the number of bytes that would be
897
    written, not counting the null terminator. If greater or equal to
898
    \a dstlen then if you malloc a new array of size n+1 you will have
899
    the space needed for the entire string. If \a dstlen is zero then
900
    nothing is written and this call just measures the storage space
901
    needed.
902
903
    \a srclen is the number of words in \a src to convert. On Windows
904
    this is not necessarily the number of characters, due to there
905
    possibly being "surrogate pairs" in the UTF-16 encoding used.
906
    On Unix wchar_t is 32 bits and each location is a character.
907
908
    On Unix if a src word is greater than 0x10ffff then this is an
909
    illegal character according to RFC 3629. These are converted as
910
    though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
911
    range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
912
    illegal according to RFC 3629. However I encode these as though
913
    they are legal, so that utf8towc will return the original data.
914
915
    On Windows "surrogate pairs" are converted to a single character
916
    and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
917
    pairs are converted as though they are individual characters.
918
*/
919
// Reads the next code point from src[*pi], advancing *pi past every word
// consumed. The caller guarantees *pi < srclen on entry.
// - Windows (16-bit wchar_t): a high surrogate immediately followed by a low
//   surrogate is combined into one code point >= 0x10000; unpaired halves
//   are returned unchanged.
// - Elsewhere: values above 0x10ffff are replaced by U+FFFD.
static unsigned int utf8fromwc_next_ucs(const wchar_t *src, unsigned srclen,
                                        unsigned int *pi)
{
    unsigned int ucs = src[(*pi)++];
#ifdef _WIN32
    // *pi already designates the word after the high surrogate, so the low
    // half is src[*pi] (not src[*pi + 1]).
    if (ucs >= 0xd800 && ucs <= 0xdbff && *pi < srclen &&
        src[*pi] >= 0xdc00 && src[*pi] <= 0xdfff)
    {
        const unsigned int ucs2 = src[(*pi)++];
        ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
    }
#else
    (void)srclen;
    if (ucs > 0x10ffff)
        ucs = 0xfffd;
#endif
    return ucs;
}

// Number of UTF-8 bytes needed to encode ucs (ucs <= 0x10ffff).
static unsigned int utf8fromwc_enc_len(unsigned int ucs)
{
    if (ucs < 0x80U)
        return 1;
    if (ucs < 0x800U)
        return 2;
    if (ucs < 0x10000U)
        return 3;
    return 4;
}

static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
                               unsigned srclen)
{
    // Encodes srclen wide words as UTF-8. At most dstlen bytes (terminator
    // included) are written to dst; the return value is the number of bytes
    // the complete conversion needs, terminator excluded.
    //
    // Both the converting loop and the measuring loop go through
    // utf8fromwc_next_ucs(), so they can no longer disagree about where a
    // surrogate pair starts (the measuring loop used to look one word too
    // far on Windows).
    unsigned int i = 0;
    unsigned int count = 0;
    if (dstlen)
    {
        while (true)
        {
            if (i >= srclen)
            {
                // Everything fitted.
                dst[count] = 0;
                return count;
            }
            const unsigned int ucs = utf8fromwc_next_ucs(src, srclen, &i);
            const unsigned int nBytes = utf8fromwc_enc_len(ucs);
            if (count + nBytes >= dstlen)
            {
                // No room for this character plus the terminator: stop
                // writing but keep counting it.
                dst[count] = 0;
                count += nBytes;
                break;
            }
            if (nBytes == 1)
            {
                dst[count++] = static_cast<char>(ucs);
            }
            else if (nBytes == 2)
            {
                dst[count++] = static_cast<char>(0xc0 | (ucs >> 6));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
            else if (nBytes == 3)
            {
                dst[count++] = static_cast<char>(0xe0 | (ucs >> 12));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
            else
            {
                dst[count++] = static_cast<char>(0xf0 | (ucs >> 18));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 12) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
        }
    }

    // We filled dst (or had none): measure the rest.
    while (i < srclen)
        count += utf8fromwc_enc_len(utf8fromwc_next_ucs(src, srclen, &i));
    return count;
}
1034
1035
/************************************************************************/
1036
/*                             utf8froma()                              */
1037
/************************************************************************/
1038
1039
/* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.
1040
1041
    It is possible this should convert Microsoft's CP1252 to UTF-8
1042
    instead. This would translate the codes in the range 0x80-0x9f
1043
    to different characters. Currently it does not do this.
1044
1045
    Up to \a dstlen bytes are written to \a dst, including a null
1046
    terminator. The return value is the number of bytes that would be
1047
    written, not counting the null terminator. If greater or equal to
1048
    \a dstlen then if you malloc a new array of size n+1 you will have
1049
    the space needed for the entire string. If \a dstlen is zero then
1050
    nothing is written and this call just measures the storage space
1051
    needed.
1052
1053
    \a srclen is the number of bytes in \a src to convert.
1054
1055
    If the return value equals \a srclen then this indicates that
1056
    no conversion is necessary, as only ASCII characters are in the
1057
    string.
1058
*/
1059
static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
                          unsigned srclen)
{
    // Encodes srclen ISO-8859-1 bytes as UTF-8. At most dstlen bytes
    // (terminator included) are written; the return value is the size the
    // full conversion needs, terminator excluded.
    const unsigned char *pabyCur =
        reinterpret_cast<const unsigned char *>(src);
    const unsigned char *const pabyEnd = pabyCur + srclen;
    unsigned nOut = 0;

    // Conversion pass: skipped when dst has no room at all.
    if (dstlen != 0)
    {
        for (;;)
        {
            if (pabyCur >= pabyEnd)
            {
                // Input exhausted and everything fitted.
                dst[nOut] = 0;
                return nOut;
            }

            const unsigned char byCh = *pabyCur++;
            // Bytes below 0x80 stay single-byte; all others take two bytes.
            const unsigned nNeeded = (byCh < 0x80U) ? 1U : 2U;
            if (nOut + nNeeded >= dstlen)
            {
                // The character plus the terminator would not fit: stop
                // writing here but keep the character in the count.
                dst[nOut] = 0;
                nOut += nNeeded;
                break;
            }

            if (nNeeded == 1U)
            {
                dst[nOut++] = static_cast<char>(byCh);
            }
            else
            {
                dst[nOut++] = static_cast<char>(0xc0 | (byCh >> 6));
                dst[nOut++] = static_cast<char>(0x80 | (byCh & 0x3F));
            }
        }
    }

    // Measuring pass over whatever input is left.
    for (; pabyCur < pabyEnd; ++pabyCur)
        nOut += (*pabyCur < 0x80U) ? 1U : 2U;

    return nOut;
}
1115
1116
#ifdef _WIN32
1117
1118
/************************************************************************/
1119
/*                            CPLWin32Recode()                          */
1120
/************************************************************************/
1121
1122
/* Convert a CODEPAGE (i.e. normal c-string) byte stream
1123
     to another CODEPAGE (i.e. normal c-string) byte stream.
1124
1125
    \a src is source c-string byte stream (including a null terminator).
1126
    \a src_code_page is source c-string byte code page.
1127
    \a dst_code_page is destination c-string byte code page.
1128
1129
   UTF7          65000
1130
   UTF8          65001
1131
   OEM-US          437
1132
   OEM-ARABIC      720
1133
   OEM-GREEK       737
1134
   OEM-BALTIC      775
1135
   OEM-MLATIN1     850
1136
   OEM-LATIN2      852
1137
   OEM-CYRILLIC    855
1138
   OEM-TURKISH     857
1139
   OEM-MLATIN1P    858
1140
   OEM-HEBREW      862
1141
   OEM-RUSSIAN     866
1142
1143
   THAI            874
1144
   SJIS            932
1145
   GBK             936
1146
   KOREA           949
1147
   BIG5            950
1148
1149
   EUROPE         1250
1150
   CYRILLIC       1251
1151
   LATIN1         1252
1152
   GREEK          1253
1153
   TURKISH        1254
1154
   HEBREW         1255
1155
   ARABIC         1256
1156
   BALTIC         1257
1157
   VIETNAM        1258
1158
1159
   ISO-LATIN1    28591
1160
   ISO-LATIN2    28592
1161
   ISO-LATIN3    28593
1162
   ISO-BALTIC    28594
1163
   ISO-CYRILLIC  28595
1164
   ISO-ARABIC    28596
1165
   ISO-HEBREW    28598
1166
   ISO-TURKISH   28599
1167
   ISO-LATIN9    28605
1168
1169
   ISO-2022-JP   50220
1170
1171
*/
1172
1173
char *CPLWin32Recode(const char *src, unsigned src_code_page,
1174
                     unsigned dst_code_page)
1175
{
1176
    // Convert from source code page to Unicode.
1177
1178
    // Compute the length in wide characters.
1179
    int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1,
1180
                                   nullptr, 0);
1181
    if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
1182
    {
1183
        if (!bHaveWarned5)
1184
        {
1185
            bHaveWarned5 = true;
1186
            CPLError(
1187
                CE_Warning, CPLE_AppDefined,
1188
                "One or several characters could not be translated from CP%d. "
1189
                "This warning will not be emitted anymore.",
1190
                src_code_page);
1191
        }
1192
1193
        // Retry now without MB_ERR_INVALID_CHARS flag.
1194
        wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0);
1195
    }
1196
1197
    // Do the actual conversion.
1198
    wchar_t *tbuf =
1199
        static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
1200
    tbuf[wlen] = 0;
1201
    MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1);
1202
1203
    // Convert from Unicode to destination code page.
1204
1205
    // Compute the length in chars.
1206
    BOOL bUsedDefaultChar = FALSE;
1207
    int len = 0;
1208
    if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8)
1209
        len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1210
                                  nullptr, nullptr);
1211
    else
1212
        len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1213
                                  nullptr, &bUsedDefaultChar);
1214
    if (bUsedDefaultChar)
1215
    {
1216
        if (!bHaveWarned6)
1217
        {
1218
            bHaveWarned6 = true;
1219
            CPLError(
1220
                CE_Warning, CPLE_AppDefined,
1221
                "One or several characters could not be translated to CP%d. "
1222
                "This warning will not be emitted anymore.",
1223
                dst_code_page);
1224
        }
1225
    }
1226
1227
    // Do the actual conversion.
1228
    char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
1229
    WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr,
1230
                        nullptr);
1231
    pszResult[len] = 0;
1232
1233
    CPLFree(tbuf);
1234
1235
    return pszResult;
1236
}
1237
1238
#endif
1239
1240
/*
1241
** For now we disable the rest which is locale() related.  We may need
1242
** parts of it later.
1243
*/
1244
1245
#ifdef notdef
1246
1247
#ifdef _WIN32
1248
#include <windows.h>
1249
#endif
1250
1251
/*! Return true if the "locale" seems to indicate that UTF-8 encoding
1252
    is used. If true the utf8tomb and utf8frommb don't do anything
1253
    useful.
1254
1255
    <i>It is highly recommended that you change your system so this
1256
    does return true.</i> On Windows this is done by setting the
1257
    "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
1258
    to a string containing the letters "utf" or "UTF" in it, or by
1259
    deleting all $LC* and $LANG environment variables. In the future
1260
    it is likely that all non-Asian Unix systems will return true,
1261
    due to the compatibility of UTF-8 with ISO-8859-1.
1262
*/
1263
int utf8locale(void)
{
    // 2 means "not evaluated yet"; afterwards the verdict (0 or 1) is cached
    // for the lifetime of the process.
    static int nCachedVerdict = 2;
    if (nCachedVerdict != 2)
        return nCachedVerdict;

#ifdef _WIN32
    nCachedVerdict = (GetACP() == CP_UTF8) ? 1 : 0;
#else
    // With no locale variable set at all, UTF-8 is assumed.
    nCachedVerdict = 1;

    // The first non-empty variable, in this order, decides.
    static const char *const apszVars[] = {"LC_CTYPE", "LC_ALL", "LANG"};
    for (const char *pszVar : apszVars)
    {
        const char *pszValue = getenv(pszVar);
        if (pszValue == nullptr || *pszValue == '\0')
            continue;

        const bool bMentionsUtf = strstr(pszValue, "utf") != nullptr ||
                                  strstr(pszValue, "UTF") != nullptr;
        nCachedVerdict = bMentionsUtf ? 1 : 0;
        break;
    }
#endif

    return nCachedVerdict;
}
1283
1284
/*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1285
    used for filenames (and sometimes used for data in files).
1286
    Unfortunately due to stupid design you will have to do this as
1287
    needed for filenames. This is a bug on both Unix and Windows.
1288
1289
    Up to \a dstlen bytes are written to \a dst, including a null
1290
    terminator. The return value is the number of bytes that would be
1291
    written, not counting the null terminator. If greater or equal to
1292
    \a dstlen then if you malloc a new array of size n+1 you will have
1293
    the space needed for the entire string. If \a dstlen is zero then
1294
    nothing is written and this call just measures the storage space
1295
    needed.
1296
1297
    If utf8locale() returns true then this does not change the data.
1298
    It is copied and truncated as necessary to
1299
    the destination buffer and \a srclen is always returned.  */
1300
unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen)
1301
{
1302
    if (!utf8locale())
1303
    {
1304
#ifdef _WIN32
1305
        wchar_t lbuf[1024] = {};
1306
        wchar_t *buf = lbuf;
1307
        unsigned length = utf8towc(src, srclen, buf, 1024);
1308
        unsigned ret;
1309
        if (length >= 1024)
1310
        {
1311
            buf =
1312
                static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1313
            utf8towc(src, srclen, buf, length + 1);
1314
        }
1315
        if (dstlen)
1316
        {
1317
            // apparently this does not null-terminate, even though msdn
1318
            // documentation claims it does:
1319
            ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0,
1320
                                      0);
1321
            dst[ret] = 0;
1322
        }
1323
        // if it overflows or measuring length, get the actual length:
1324
        if (dstlen == 0 || ret >= dstlen - 1)
1325
            ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1326
        if (buf != lbuf)
1327
            free((void *)buf);
1328
        return ret;
1329
#else
1330
        wchar_t lbuf[1024] = {};
1331
        wchar_t *buf = lbuf;
1332
        unsigned length = utf8towc(src, srclen, buf, 1024);
1333
        if (length >= 1024)
1334
        {
1335
            buf =
1336
                static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1337
            utf8towc(src, srclen, buf, length + 1);
1338
        }
1339
        int ret = 0;
1340
        if (dstlen)
1341
        {
1342
            ret = wcstombs(dst, buf, dstlen);
1343
            if (ret >= dstlen - 1)
1344
                ret = wcstombs(0, buf, 0);
1345
        }
1346
        else
1347
        {
1348
            ret = wcstombs(0, buf, 0);
1349
        }
1350
        if (buf != lbuf)
1351
            free((void *)buf);
1352
        if (ret >= 0)
1353
            return (unsigned)ret;
1354
            // On any errors we return the UTF-8 as raw text...
1355
#endif
1356
    }
1357
    // Identity transform:
1358
    if (srclen < dstlen)
1359
    {
1360
        memcpy(dst, src, srclen);
1361
        dst[srclen] = 0;
1362
    }
1363
    else
1364
    {
1365
        memcpy(dst, src, dstlen - 1);
1366
        dst[dstlen - 1] = 0;
1367
    }
1368
    return srclen;
1369
}
1370
1371
/*! Convert a filename from the locale-specific multibyte encoding
1372
    used by Windows to UTF-8 as used by FLTK.
1373
1374
    Up to \a dstlen bytes are written to \a dst, including a null
1375
    terminator. The return value is the number of bytes that would be
1376
    written, not counting the null terminator. If greater or equal to
1377
    \a dstlen then if you malloc a new array of size n+1 you will have
1378
    the space needed for the entire string. If \a dstlen is zero then
1379
    nothing is written and this call just measures the storage space
1380
    needed.
1381
1382
    On Unix or on Windows when a UTF-8 locale is in effect, this
1383
    does not change the data. It is copied and truncated as necessary to
1384
    the destination buffer and \a srclen is always returned.
1385
    You may also want to check if utf8test() returns non-zero, so that
1386
    the filesystem can store filenames in UTF-8 encoding regardless of
1387
    the locale.
1388
*/
1389
unsigned utf8frommb(char *dst, unsigned dstlen, const char *src,
1390
                    unsigned srclen)
1391
{
1392
    if (!utf8locale())
1393
    {
1394
#ifdef _WIN32
1395
        wchar_t lbuf[1024] = {};
1396
        wchar_t *buf = lbuf;
1397
        unsigned ret;
1398
        const unsigned length =
1399
            MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1400
        if (length >= 1024)
1401
        {
1402
            length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1403
            buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
1404
            MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1405
        }
1406
        ret = utf8fromwc(dst, dstlen, buf, length);
1407
        if (buf != lbuf)
1408
            free(buf);
1409
        return ret;
1410
#else
1411
        wchar_t lbuf[1024] = {};
1412
        wchar_t *buf = lbuf;
1413
        const int length = mbstowcs(buf, src, 1024);
1414
        if (length >= 1024)
1415
        {
1416
            length = mbstowcs(0, src, 0) + 1;
1417
            buf =
1418
                static_cast<wchar_t *>(malloc(length * sizeof(unsigned short)));
1419
            mbstowcs(buf, src, length);
1420
        }
1421
        if (length >= 0)
1422
        {
1423
            const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
1424
            if (buf != lbuf)
1425
                free(buf);
1426
            return ret;
1427
        }
1428
        // Errors in conversion return the UTF-8 unchanged.
1429
#endif
1430
    }
1431
    // Identity transform:
1432
    if (srclen < dstlen)
1433
    {
1434
        memcpy(dst, src, srclen);
1435
        dst[srclen] = 0;
1436
    }
1437
    else
1438
    {
1439
        memcpy(dst, src, dstlen - 1);
1440
        dst[dstlen - 1] = 0;
1441
    }
1442
    return srclen;
1443
}
1444
1445
#endif  // def notdef - disabled locale specific stuff.
1446
1447
/*! Examines the first \a srclen bytes in \a src and return a verdict
1448
    on whether it is UTF-8 or not.
1449
    - Returns 0 if there is any illegal UTF-8 sequences, using the
1450
      same rules as utf8decode(). Note that some UCS values considered
1451
      illegal by RFC 3629, such as 0xffff, are considered legal by this.
1452
    - Returns 1 if there are only single-byte characters (i.e. no bytes
1453
      have the high bit set). This is legal UTF-8, but also indicates
1454
      plain ASCII. It also returns 1 if \a srclen is zero.
1455
    - Returns 2 if there are only characters less than 0x800.
1456
    - Returns 3 if there are only characters less than 0x10000.
1457
    - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1458
1459
    Because there are many illegal sequences in UTF-8, it is almost
1460
    impossible for a string in another encoding to be confused with
1461
    UTF-8. This is very useful for transitioning Unix to UTF-8
1462
    filenames, you can simply test each filename with this to decide
1463
    if it is UTF-8 or in the locale encoding. My hope is that if
1464
    this is done we will be able to cleanly transition to a locale-less
1465
    encoding.
1466
*/
1467
1468
static int utf8test(const char *src, unsigned srclen)
1469
0
{
1470
0
    int ret = 1;
1471
0
    const char *p = src;
1472
0
    const char *e = src + srclen;
1473
0
    while (p < e)
1474
0
    {
1475
0
        if (*p == 0)
1476
0
            return 0;
1477
0
        if (*p & 0x80)
1478
0
        {
1479
0
            int len = 0;
1480
0
            utf8decode(p, e, &len);
1481
0
            if (len < 2)
1482
0
                return 0;
1483
0
            if (len > ret)
1484
0
                ret = len;
1485
0
            p += len;
1486
0
        }
1487
0
        else
1488
0
        {
1489
0
            p++;
1490
0
        }
1491
0
    }
1492
0
    return ret;
1493
0
}