Coverage Report

Created: 2025-07-07 10:01

/src/libreoffice/sal/textenc/tcvtutf8.cxx
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/*
3
 * This file is part of the LibreOffice project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 *
9
 * This file incorporates work covered by the following license notice:
10
 *
11
 *   Licensed to the Apache Software Foundation (ASF) under one or more
12
 *   contributor license agreements. See the NOTICE file distributed
13
 *   with this work for additional information regarding copyright
14
 *   ownership. The ASF licenses this file to you under the Apache
15
 *   License, Version 2.0 (the "License"); you may not use this file
16
 *   except in compliance with the License. You may obtain a copy of
17
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18
 */
19
20
#include <sal/config.h>
21
22
#include <cassert>
23
24
#include <sal/types.h>
25
#include <rtl/character.hxx>
26
#include <rtl/textcvt.h>
27
28
#include "converter.hxx"
29
#include "tcvtutf8.hxx"
30
31
// File-local conversion state. The Impl*Context functions in this file hand
// these objects out and take them back as opaque void * values.
namespace {
32
33
// Decoder state that ImplConvertUtf8ToUnicode loads on entry and stores back
// on exit whenever it is given a non-null context.
struct ImplUtf8ToUnicodeContext
34
{
35
    sal_uInt32 nUtf32; // bits of the code point gathered so far for the current sequence
36
    int nBytes; // sequence length (1..6) implied by the lead byte of the current sequence
37
    int nShift; // shift for the next continuation byte; negative: no sequence in progress
38
    bool bCheckBom; // true until the first character or the first error has been handled
39
};
40
41
// Encoder state that ImplConvertUnicodeToUtf8 loads on entry and stores back
// on exit whenever it is given a non-null context.
struct ImplUnicodeToUtf8Context
42
{
43
    sal_Unicode nHighSurrogate; /* 0xFFFF: start state, BOM not yet handled; 0: nothing pending; otherwise: high surrogate waiting for its low surrogate */
44
};
45
46
}
47
48
// Allocates a decoder context with new, puts it into the start state via
// ImplResetUtf8ToUnicodeContext and returns it as an opaque pointer.
// The matching release function is ImplDestroyUtf8ToUnicodeContext.
// Only nBytes, nShift and bCheckBom are assigned by the reset; nUtf32 is not
// assigned here.
// NOTE(review): ImplConvertUtf8ToUnicode copies nUtf32 out of the context
// before it has ever been written - harmless for the decoding logic (the value
// is overwritten by the next lead byte), but worth confirming with sanitizers.
void * ImplCreateUtf8ToUnicodeContext()
49
115k
{
50
115k
    ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
51
115k
    ImplResetUtf8ToUnicodeContext(p);
52
115k
    return p;
53
115k
}
54
55
// Puts a decoder context back into the start state: no sequence in progress
// (nShift = -1, nBytes = 1) and BOM detection re-armed (bCheckBom = true).
// A null pContext is accepted and ignored. nUtf32 is left untouched.
void ImplResetUtf8ToUnicodeContext(void * pContext)
56
191k
{
57
191k
    if (pContext != nullptr)
58
191k
    {
59
191k
        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = 1;
60
191k
        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
61
191k
        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
62
191k
    }
63
191k
}
64
65
// Releases a decoder context. pContext is cast back to
// ImplUtf8ToUnicodeContext * and deleted; deleting a null pointer is a no-op.
void ImplDestroyUtf8ToUnicodeContext(void * pContext)
66
115k
{
67
115k
    delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
68
115k
}
69
70
/*
 * Decodes UTF-8 bytes from pSrcBuf into UTF-16 code units in pDestBuf.
 *
 * pData         Only compared against nullptr. Non-null selects the
 *               "Java UTF-8" rules (bJavaUtf8): a single 0x00 byte is bad
 *               input, U+0000 is accepted as a two-byte sequence, three-byte
 *               surrogates are accepted, four-byte sequences are rejected, and
 *               a leading U+FEFF is never dropped.
 * pContext      Optional ImplUtf8ToUnicodeContext. When non-null the decoder
 *               state is loaded from it on entry and written back on exit, so
 *               a sequence split across calls can be resumed.
 * pSrcBuf       Input bytes; nSrcBytes is their count.
 * pDestBuf      Output buffer; nDestChars is its capacity in sal_Unicode
 *               units.
 * nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits. This function itself tests
 *               GLOBAL_SIGNATURE and FLUSH and forwards the whole value to
 *               handleBadInputTextToUnicodeConversion.
 * pInfo         Optional out parameter receiving the accumulated
 *               RTL_TEXTTOUNICODE_INFO_* bits.
 * pSrcCvtBytes  Optional out parameter receiving the distance, in bytes,
 *               between pSrcBuf and the final read position.
 *
 * Returns the number of sal_Unicode units between pDestBuf and the final
 * write position.
 */
sal_Size ImplConvertUtf8ToUnicode(
71
    void const * pData, void * pContext, char const * pSrcBuf,
72
    sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
73
    sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
74
95.2M
{
    // Defaults below describe the start state; they are used as-is when no
    // context is supplied.
75
95.2M
    bool bJavaUtf8 = pData != nullptr;
76
95.2M
    sal_uInt32 nUtf32 = 0;
77
95.2M
    int nBytes = 1;
78
95.2M
    int nShift = -1;
79
95.2M
    bool bCheckBom = true;
80
95.2M
    sal_uInt32 nInfo = 0;
81
95.2M
    unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
82
95.2M
    unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
83
95.2M
    sal_Unicode * pDestBufPtr = pDestBuf;
84
95.2M
    sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
    // Position just after the last completed character (or handled error)
    // within this call's buffer; used to rewind when a FLUSH conversion stops
    // on bad input.
85
95.2M
    unsigned char const * startOfCurrentChar = pSrcBufPtr;
86
    // Resume from the state a previous call left behind, if any.
87
95.2M
    if (pContext != nullptr)
88
92.5M
    {
89
92.5M
        nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
90
92.5M
        nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
91
92.5M
        nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
92
92.5M
        bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
93
92.5M
    }
94
    // One input byte per iteration. Each iteration ends in exactly one of:
    // "continue" (need more bytes), the transform label (sequence complete),
    // or the bad_input label.
95
1.42G
    while (pSrcBufPtr < pSrcBufEnd)
96
1.32G
    {
        // bConsume: whether the byte just read belongs to the character or
        // error being reported; cleared only in the broken-sequence case.
97
1.32G
        bool bConsume = true;
98
1.32G
        sal_uInt32 nChar = *pSrcBufPtr++;
        // nShift < 0: no sequence in progress, so this byte must be a lead
        // byte (or a plain ASCII byte).
99
1.32G
        if (nShift < 0)
100
            // Allow (illegal) 5 and 6 byte sequences, so they are read as a
101
            // single individual bad character:
102
1.10G
            if (nChar <= 0x7F)
103
843M
            {
104
843M
                nUtf32 = nChar;
105
843M
                nBytes = 1;
106
843M
                goto transform;
107
843M
            }
            // 0x80..0xBF: continuation byte without a preceding lead byte.
108
260M
            else if (nChar <= 0xBF)
109
80.4M
                goto bad_input;
            // 0xC0..0xDF: lead byte of a 2-byte sequence (5 payload bits).
110
179M
            else if (nChar <= 0xDF)
111
55.2M
            {
112
55.2M
                nUtf32 = (nChar & 0x1F) << 6;
113
55.2M
                nBytes = 2;
114
55.2M
                nShift = 0;
115
55.2M
            }
            // 0xE0..0xEF: lead byte of a 3-byte sequence (4 payload bits).
116
124M
            else if (nChar <= 0xEF)
117
76.8M
            {
118
76.8M
                nUtf32 = (nChar & 0x0F) << 12;
119
76.8M
                nBytes = 3;
120
76.8M
                nShift = 6;
121
76.8M
            }
            // 0xF0..0xF7: lead byte of a 4-byte sequence (3 payload bits).
122
47.5M
            else if (nChar <= 0xF7)
123
14.5M
            {
124
14.5M
                nUtf32 = (nChar & 0x07) << 18;
125
14.5M
                nBytes = 4;
126
14.5M
                nShift = 12;
127
14.5M
            }
            // 0xF8..0xFB: 5-byte form; always rejected later by the
            // "default" case at the transform label.
128
32.9M
            else if (nChar <= 0xFB)
129
5.56M
            {
130
5.56M
                nUtf32 = (nChar & 0x03) << 24;
131
5.56M
                nBytes = 5;
132
5.56M
                nShift = 18;
133
5.56M
            }
            // 0xFC..0xFD: 6-byte form; likewise always rejected later.
134
27.4M
            else if (nChar <= 0xFD)
135
2.99M
            {
136
2.99M
                nUtf32 = (nChar & 0x01) << 30;
137
2.99M
                nBytes = 6;
138
2.99M
                nShift = 24;
139
2.99M
            }
            // 0xFE and 0xFF are rejected immediately.
140
24.4M
            else
141
24.4M
                goto bad_input;
        // A sequence is in progress and this byte has the 10xxxxxx form:
        // merge its six payload bits; the byte merged with shift 0 is the
        // last one of the sequence.
142
225M
        else if ((nChar & 0xC0) == 0x80)
143
144M
        {
144
144M
            nUtf32 |= (nChar & 0x3F) << nShift;
145
144M
            if (nShift == 0)
146
74.3M
                goto transform;
147
70.5M
            else
148
70.5M
                nShift -= 6;
149
144M
        }
150
80.8M
        else
151
80.8M
        {
152
            /*
153
             This byte is preceded by a broken UTF-8 sequence; if this byte
154
             is neither in the range [0x80..0xBF] nor in the range
155
             [0xFE..0xFF], assume that this byte does not belong to that
156
             broken sequence, but instead starts a new, legal UTF-8 sequence:
157
             */
158
80.8M
            bConsume = nChar >= 0xFE;
159
80.8M
            goto bad_input;
160
80.8M
        }
        // Reached only when a lead byte or a non-final continuation byte was
        // accepted: the sequence is still incomplete.
161
225M
        continue;
162
    // A complete sequence has been assembled in nUtf32. The block below is
    // skipped - i.e. the character is silently dropped - only for a 3-byte
    // U+FEFF seen while bCheckBom is still set, with GLOBAL_SIGNATURE
    // requested and Java UTF-8 not selected.
163
918M
    transform:
164
918M
        if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
165
918M
            || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
166
918M
            || bJavaUtf8)
167
918M
        {
            // Per-length validity checks (overlong forms, surrogates, range).
168
918M
            switch (nBytes) {
            // One byte: always valid, except that Java UTF-8 rejects a bare
            // 0x00 byte.
169
843M
            case 1:
170
843M
                if (bJavaUtf8 && nUtf32 == 0) {
171
14.5k
                    goto bad_input;
172
14.5k
                }
173
843M
                break;
            // Two bytes: values below 0x80 are overlong, except the two-byte
            // encoding of U+0000 under Java UTF-8.
174
843M
            case 2:
175
17.4M
                if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
176
710k
                    goto bad_input;
177
710k
                }
178
16.7M
                break;
            // Three bytes: values below 0x800 are overlong; surrogates are
            // rejected unless Java UTF-8 is selected.
179
53.3M
            case 3:
180
53.3M
                if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
181
175k
                {
182
175k
                    goto bad_input;
183
175k
                }
184
53.1M
                break;
            // Four bytes: values below 0x10000 are overlong; values refused by
            // rtl::isUnicodeCodePoint are rejected; Java UTF-8 rejects every
            // four-byte sequence.
185
53.1M
            case 4:
186
3.47M
                if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
187
3.47M
                    || bJavaUtf8)
188
219k
                {
189
219k
                    goto bad_input;
190
219k
                }
191
3.25M
                break;
            // Five- and six-byte sequences are never valid.
192
3.25M
            default:
193
37.3k
                goto bad_input;
194
918M
            }
            // Emit: one unit for values up to 0xFFFF, otherwise whatever
            // rtl::splitSurrogates writes (the write pointer advances by its
            // return value; room for two units is required first).
195
916M
            if (nUtf32 <= 0xFFFF)
196
913M
                if (pDestBufPtr != pDestBufEnd)
197
913M
                    *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
198
181k
                else
199
181k
                    goto no_output;
200
3.25M
            else if (pDestBufEnd - pDestBufPtr >= 2)
201
3.21M
                pDestBufPtr += rtl::splitSurrogates(nUtf32, pDestBufPtr);
202
36.5k
            else
203
36.5k
                goto no_output;
204
916M
        }
        // Character done (or BOM dropped): back to "no sequence in progress",
        // disarm BOM detection, and remember where the next character starts.
205
916M
        nShift = -1;
206
916M
        bCheckBom = false;
207
916M
        startOfCurrentChar = pSrcBufPtr;
208
916M
        continue;
209
    // Invalid input. The decision is delegated to the shared handler; its
    // second argument is true exactly when the problem occurred in a
    // multi-byte sequence (nBytes != 1).
    // NOTE(review): the meaning of the handler's individual arguments is
    // taken from how they are used here - confirm against converter.hxx.
210
186M
    bad_input:
211
186M
        switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
212
186M
                    false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
213
186M
                    &nInfo))
214
186M
        {
        // Stop converting. Without FLUSH: un-read the current byte if it was
        // not part of the broken sequence. With FLUSH: rewind to
        // startOfCurrentChar. The "break" leaves only the switch; the loop is
        // left by the "break" that follows the switch.
215
1.19M
        case sal::detail::textenc::BAD_INPUT_STOP:
216
1.19M
            nShift = -1;
217
1.19M
            bCheckBom = false;
218
1.19M
            if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
219
1.19M
                if (!bConsume)
220
315k
                    --pSrcBufPtr;
221
1.19M
            } else {
222
0
                pSrcBufPtr = startOfCurrentChar;
223
0
            }
224
1.19M
            break;
225
        // Keep converting: reset the state, un-read the current byte if it
        // starts a new sequence, and go on with the next byte.
226
185M
        case sal::detail::textenc::BAD_INPUT_CONTINUE:
227
185M
            nShift = -1;
228
185M
            bCheckBom = false;
229
185M
            if (!bConsume)
230
79.6M
                --pSrcBufPtr;
231
185M
            startOfCurrentChar = pSrcBufPtr;
232
185M
            continue;
233
        // The handler found no room in the destination buffer.
234
28.6k
        case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
235
28.6k
            goto no_output;
236
186M
        }
        // Only BAD_INPUT_STOP falls out of the switch: leave the loop.
237
1.19M
        break;
238
    // Destination buffer too small: step back one byte so the byte just read
    // is not reported as converted, flag the condition and leave the loop.
    // nShift and nUtf32 are deliberately left as they are here.
239
1.19M
    no_output:
240
246k
        --pSrcBufPtr;
241
246k
        nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
242
246k
        break;
243
186M
    }
244
    // The input ended in the middle of a sequence (nShift >= 0) and neither an
    // error nor a too-small destination has been reported so far.
245
95.2M
    if (nShift >= 0
246
95.2M
        && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
247
949k
                         | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
248
949k
               == 0)
249
897k
    {
        // Without FLUSH more input may follow: report "source too small" and
        // keep the partial state. With FLUSH the truncated sequence is treated
        // as bad multi-byte input.
250
897k
        if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
251
896k
            nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
252
1.55k
        else
253
1.55k
            switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
254
1.55k
                        false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
255
1.55k
                        &nInfo))
256
1.55k
            {
            // FLUSH is known to be set in this branch, so the test below is
            // always true here: the read position is rewound to
            // startOfCurrentChar before the shared state reset.
257
0
            case sal::detail::textenc::BAD_INPUT_STOP:
258
0
                if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
259
0
                    pSrcBufPtr = startOfCurrentChar;
260
0
                }
261
0
                [[fallthrough]];
262
1.39k
            case sal::detail::textenc::BAD_INPUT_CONTINUE:
263
1.39k
                nShift = -1;
264
1.39k
                bCheckBom = false;
265
1.39k
                break;
266
            // No room for the handler's output: flag it; the partial state is
            // kept unchanged.
267
160
            case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
268
160
                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
269
160
                break;
270
1.55k
            }
271
897k
    }
272
    // Save the decoder state for the next call and fill the optional outputs.
273
95.2M
    if (pContext != nullptr)
274
92.5M
    {
275
92.5M
        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
276
92.5M
        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
277
92.5M
        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
278
92.5M
        static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
279
92.5M
    }
280
95.2M
    if (pInfo != nullptr)
281
95.2M
        *pInfo = nInfo;
282
95.2M
    if (pSrcCvtBytes != nullptr)
283
95.2M
        *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
284
95.2M
    return pDestBufPtr - pDestBuf;
285
95.2M
}
286
287
// Allocates an encoder context with new, puts it into the start state via
// ImplResetUnicodeToUtf8Context (nHighSurrogate = 0xFFFF) and returns it as
// an opaque pointer. The matching release function is
// ImplDestroyUnicodeToUtf8Context.
void * ImplCreateUnicodeToUtf8Context()
288
32.9k
{
289
32.9k
    ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
290
32.9k
    ImplResetUnicodeToUtf8Context(p);
291
32.9k
    return p;
292
32.9k
}
293
294
// Puts an encoder context back into the start state by storing 0xFFFF in
// nHighSurrogate; ImplConvertUnicodeToUtf8 treats that value as "beginning of
// output, BOM not yet handled". A null pContext is accepted and ignored.
void ImplResetUnicodeToUtf8Context(void * pContext)
295
32.9k
{
296
32.9k
    if (pContext != nullptr)
297
32.9k
        static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
298
32.9k
}
299
300
// Releases an encoder context. pContext is cast back to
// ImplUnicodeToUtf8Context * and deleted; deleting a null pointer is a no-op.
void ImplDestroyUnicodeToUtf8Context(void * pContext)
301
32.9k
{
302
32.9k
    delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
303
32.9k
}
304
305
/*
 * Encodes UTF-16 code units from pSrcBuf as UTF-8 bytes in pDestBuf.
 *
 * pData         Only compared against nullptr. Non-null selects the
 *               "Java UTF-8" rules (bJavaUtf8): no BOM is written, surrogates
 *               are not paired but encoded one by one as three-byte
 *               sequences, and U+0000 is written as the two bytes 0xC0 0x80.
 * pContext      Optional ImplUnicodeToUtf8Context. When non-null the pending
 *               high surrogate / start-state marker is loaded from it on entry
 *               and written back on exit.
 * pSrcBuf       Input code units; nSrcChars is their count.
 * pDestBuf      Output buffer; nDestBytes is its capacity in bytes.
 * nFlags        RTL_UNICODETOTEXT_FLAGS_* bits. This function itself tests
 *               GLOBAL_SIGNATURE and FLUSH and forwards the whole value to
 *               handleBadInputUnicodeToTextConversion.
 * pInfo         Optional out parameter receiving the accumulated
 *               RTL_UNICODETOTEXT_INFO_* bits.
 * pSrcCvtChars  Optional out parameter receiving the distance, in code units,
 *               between pSrcBuf and the final read position.
 *
 * Returns the number of bytes between pDestBuf and the final write position.
 */
sal_Size ImplConvertUnicodeToUtf8(
306
    void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
307
    sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
308
    sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
309
255k
{
    // Without a context every call begins in the start state (0xFFFF).
310
255k
    bool bJavaUtf8 = pData != nullptr;
311
255k
    sal_Unicode nHighSurrogate = 0xFFFF;
312
255k
    sal_uInt32 nInfo = 0;
313
255k
    sal_Unicode const * pSrcBufPtr = pSrcBuf;
314
255k
    sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
315
255k
    char * pDestBufPtr = pDestBuf;
316
255k
    char * pDestBufEnd = pDestBufPtr + nDestBytes;
317
318
255k
    if (pContext != nullptr)
319
43.3k
        nHighSurrogate
320
43.3k
            = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
321
    // Start state: optionally emit the BOM, then switch to "nothing pending".
    // If the BOM does not fit, nothing is converted and nHighSurrogate stays
    // 0xFFFF, so a later call with the same context tries the BOM again.
322
255k
    if (nHighSurrogate == 0xFFFF)
323
242k
    {
324
242k
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
325
242k
            && !bJavaUtf8)
326
0
        {
327
0
            if (pDestBufEnd - pDestBufPtr >= 3)
328
0
            {
329
                /* Write BOM (U+FEFF) as UTF-8: */
330
0
                *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
331
0
                *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
332
0
                *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
333
0
            }
334
0
            else
335
0
            {
336
0
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
337
0
                goto done;
338
0
            }
339
0
        }
340
242k
        nHighSurrogate = 0;
341
242k
    }
342
    // One input code unit per iteration.
343
385M
    while (pSrcBufPtr < pSrcBufEnd)
344
385M
    {
345
385M
        sal_uInt32 nChar = *pSrcBufPtr++;
        // Nothing pending: a high surrogate is remembered and the loop moves
        // on to its partner; a lone low surrogate is bad input. Both tests are
        // disabled under Java UTF-8, where surrogates fall through to the
        // ordinary encoding below.
346
385M
        if (nHighSurrogate == 0)
347
385M
        {
348
385M
            if (rtl::isHighSurrogate(nChar) && !bJavaUtf8)
349
30.8k
            {
350
30.8k
                nHighSurrogate = static_cast<sal_Unicode>(nChar);
351
30.8k
                continue;
352
30.8k
            }
353
385M
            else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
354
28.8k
            {
355
28.8k
                goto bad_input;
356
28.8k
            }
357
385M
        }
        // A high surrogate is pending: it must be completed by a low
        // surrogate, otherwise the pair is bad input. In the bad-input case
        // the unit just read has already been consumed and is not re-examined.
358
30.8k
        else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
359
7.48k
            nChar = rtl::combineSurrogates(nHighSurrogate, nChar);
360
23.3k
        else
361
23.3k
            goto bad_input;
362
363
385M
        assert(bJavaUtf8 ? nChar <= 0xFFFF : rtl::isUnicodeScalarValue(nChar));
364
        // Encode nChar. Each branch first checks that the whole sequence fits
        // into the remaining destination space.
        // One byte for values up to 0x7F - except U+0000 under Java UTF-8,
        // which falls through to the two-byte branch.
365
385M
        if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
366
342M
            if (pDestBufPtr != pDestBufEnd)
367
342M
                *pDestBufPtr++ = static_cast< char >(nChar);
368
0
            else
369
0
                goto no_output;
        // Two bytes: 110xxxxx 10xxxxxx.
370
42.3M
        else if (nChar <= 0x7FF)
371
3.51M
            if (pDestBufEnd - pDestBufPtr >= 2)
372
3.51M
            {
373
3.51M
                *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
374
3.51M
                *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
375
3.51M
            }
376
0
            else
377
0
                goto no_output;
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
378
38.8M
        else if (nChar <= 0xFFFF)
379
38.8M
            if (pDestBufEnd - pDestBufPtr >= 3)
380
38.8M
            {
381
38.8M
                *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
382
38.8M
                *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
383
38.8M
                *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
384
38.8M
            }
385
0
            else
386
0
                goto no_output;
        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
387
7.48k
        else if (pDestBufEnd - pDestBufPtr >= 4)
388
7.48k
        {
389
7.48k
            *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
390
7.48k
            *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
391
7.48k
            *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
392
7.48k
            *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
393
7.48k
        }
394
0
        else
395
0
            goto no_output;
        // Character written: nothing is pending any more.
396
385M
        nHighSurrogate = 0;
397
385M
        continue;
398
    // Unpaired surrogate. The decision is delegated to the shared handler.
    // NOTE(review): the meaning of the handler's individual arguments is not
    // visible in this file - confirm against converter.hxx.
399
52.2k
    bad_input:
400
52.2k
        switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
401
52.2k
                    false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
402
52.2k
                    0, nullptr))
403
52.2k
        {
        // Stop converting: clear the pending surrogate; the "break" leaves
        // only the switch, the loop is left by the "break" after the switch.
404
0
        case sal::detail::textenc::BAD_INPUT_STOP:
405
0
            nHighSurrogate = 0;
406
0
            break;
407
        // Keep converting with the next code unit.
408
52.2k
        case sal::detail::textenc::BAD_INPUT_CONTINUE:
409
52.2k
            nHighSurrogate = 0;
410
52.2k
            continue;
411
        // The handler found no room in the destination buffer.
412
0
        case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
413
0
            goto no_output;
414
52.2k
        }
        // Only BAD_INPUT_STOP falls out of the switch: leave the loop.
415
0
        break;
416
    // Destination buffer too small: step back one code unit so the unit just
    // read is not reported as converted, flag the condition and leave the
    // loop. nHighSurrogate is left unchanged on this path.
417
0
    no_output:
418
0
        --pSrcBufPtr;
419
0
        nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
420
0
        break;
421
52.2k
    }
422
    // The input ended with a high surrogate still pending and neither an
    // error nor a too-small destination has been reported so far.
    // NOTE(review): here SRCBUFFERTOSMALL is reported when FLUSH IS set and
    // the bad-input handler runs when it is not, whereas
    // ImplConvertUtf8ToUnicode reports its "source too small" condition when
    // FLUSH is NOT set. Confirm the polarity is intended before changing it;
    // callers may rely on the current behaviour.
423
255k
    if (nHighSurrogate != 0
424
255k
        && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
425
35
                         | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
426
35
               == 0)
427
35
    {
428
35
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
429
22
            nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
430
13
        else
431
13
            switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
432
13
                        false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
433
13
                        nullptr, 0, nullptr))
434
13
            {
            // Stop and continue are handled alike at end of input: the
            // pending surrogate is discarded.
435
0
            case sal::detail::textenc::BAD_INPUT_STOP:
436
13
            case sal::detail::textenc::BAD_INPUT_CONTINUE:
437
13
                nHighSurrogate = 0;
438
13
                break;
439
            // No room for the handler's output: flag it and keep the pending
            // surrogate.
440
0
            case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
441
0
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
442
0
                break;
443
13
            }
444
35
    }
445
    // Common exit (also the target when the BOM did not fit): save the state
    // for the next call and fill the optional outputs.
446
255k
 done:
447
255k
    if (pContext != nullptr)
448
43.3k
        static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
449
43.3k
            = nHighSurrogate;
450
255k
    if (pInfo != nullptr)
451
255k
        *pInfo = nInfo;
452
255k
    if (pSrcCvtChars != nullptr)
453
255k
        *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
454
255k
    return pDestBufPtr - pDestBuf;
455
255k
}
456
457
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */