Coverage Report

Created: 2025-11-07 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/unistr_cnv.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 1999-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  unistr_cnv.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:2
14
*
15
*   created on: 2004aug19
16
*   created by: Markus W. Scherer
17
*
18
*   Character conversion functions moved here from unistr.cpp
19
*/
20
21
#include "unicode/utypes.h"
22
23
#if !UCONFIG_NO_CONVERSION
24
25
#include "unicode/putil.h"
26
#include "cstring.h"
27
#include "cmemory.h"
28
#include "unicode/ustring.h"
29
#include "unicode/unistr.h"
30
#include "unicode/ucnv.h"
31
#include "ucnv_imp.h"
32
#include "putilimp.h"
33
#include "ustr_cnv.h"
34
#include "ustr_imp.h"
35
36
U_NAMESPACE_BEGIN
37
38
//========================================
39
// Constructors
40
//========================================
41
42
#if !U_CHARSET_IS_UTF8
43
44
UnicodeString::UnicodeString(const char *codepageData) {
45
    fUnion.fFields.fLengthAndFlags = kShortString;
46
    if(codepageData != 0) {
47
        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
48
    }
49
}
50
51
UnicodeString::UnicodeString(const char *codepageData,
52
                             int32_t dataLength) {
53
    fUnion.fFields.fLengthAndFlags = kShortString;
54
    if(codepageData != 0) {
55
        doCodepageCreate(codepageData, dataLength, 0);
56
    }
57
}
58
59
// else see unistr.cpp
60
#endif
61
62
UnicodeString::UnicodeString(const char *codepageData,
63
0
                             const char *codepage) {
64
0
    fUnion.fFields.fLengthAndFlags = kShortString;
65
0
    if (codepageData != nullptr) {
66
0
        doCodepageCreate(codepageData, static_cast<int32_t>(uprv_strlen(codepageData)), codepage);
67
0
    }
68
0
}
69
70
UnicodeString::UnicodeString(const char *codepageData,
71
                             int32_t dataLength,
72
2.49k
                             const char *codepage) {
73
2.49k
    fUnion.fFields.fLengthAndFlags = kShortString;
74
2.49k
    if (codepageData != nullptr) {
75
2.49k
        doCodepageCreate(codepageData, dataLength, codepage);
76
2.49k
    }
77
2.49k
}
78
79
UnicodeString::UnicodeString(const char *src, int32_t srcLength,
80
                             UConverter *cnv,
81
0
                             UErrorCode &errorCode) {
82
0
    fUnion.fFields.fLengthAndFlags = kShortString;
83
0
    if(U_SUCCESS(errorCode)) {
84
        // check arguments
85
0
        if(src==nullptr) {
86
            // treat as an empty string, do nothing more
87
0
        } else if(srcLength<-1) {
88
0
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
89
0
        } else {
90
            // get input length
91
0
            if(srcLength==-1) {
92
0
                srcLength = static_cast<int32_t>(uprv_strlen(src));
93
0
            }
94
0
            if(srcLength>0) {
95
0
                if (cnv != nullptr) {
96
                    // use the provided converter
97
0
                    ucnv_resetToUnicode(cnv);
98
0
                    doCodepageCreate(src, srcLength, cnv, errorCode);
99
0
                } else {
100
                    // use the default converter
101
0
                    cnv=u_getDefaultConverter(&errorCode);
102
0
                    doCodepageCreate(src, srcLength, cnv, errorCode);
103
0
                    u_releaseDefaultConverter(cnv);
104
0
                }
105
0
            }
106
0
        }
107
108
0
        if(U_FAILURE(errorCode)) {
109
0
            setToBogus();
110
0
        }
111
0
    }
112
0
}
113
114
//========================================
115
// Codeset conversion
116
//========================================
117
118
#if !U_CHARSET_IS_UTF8
119
120
int32_t
121
UnicodeString::extract(int32_t start,
122
                       int32_t length,
123
                       char *target,
124
                       uint32_t dstSize) const {
125
    return extract(start, length, target, dstSize, 0);
126
}
127
128
// else see unistr.cpp
129
#endif
130
131
int32_t
132
UnicodeString::extract(int32_t start,
133
                       int32_t length,
134
                       char *target,
135
                       uint32_t dstSize,
136
                       const char *codepage) const
137
0
{
138
    // if the arguments are illegal, then do nothing
139
0
    if (/*dstSize < 0 || */(dstSize > 0 && target == nullptr)) {
140
0
        return 0;
141
0
    }
142
143
    // pin the indices to legal values
144
0
    pinIndices(start, length);
145
146
    // We need to cast dstSize to int32_t for all subsequent code.
147
    // I don't know why the API was defined with uint32_t but we are stuck with it.
148
    // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
149
    // as a limit in some functions, it may wrap around and yield a pointer
150
    // that compares less-than target.
151
0
    int32_t capacity;
152
0
    if(dstSize < 0x7fffffff) {
153
        // Assume that the capacity is real and a limit pointer won't wrap around.
154
0
        capacity = static_cast<int32_t>(dstSize);
155
0
    } else {
156
        // Pin the capacity so that a limit pointer does not wrap around.
157
0
        char* targetLimit = static_cast<char*>(U_MAX_PTR(target));
158
        // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
159
        // greater than target and does not wrap around the top of the address space.
160
0
        capacity = static_cast<int32_t>(targetLimit - target);
161
0
    }
162
163
    // create the converter
164
0
    UConverter *converter;
165
0
    UErrorCode status = U_ZERO_ERROR;
166
167
    // just write the NUL if the string length is 0
168
0
    if(length == 0) {
169
0
        return u_terminateChars(target, capacity, 0, &status);
170
0
    }
171
172
    // if the codepage is the default, use our cache
173
    // if it is an empty string, then use the "invariant character" conversion
174
0
    if (codepage == nullptr) {
175
0
        const char *defaultName = ucnv_getDefaultName();
176
0
        if(UCNV_FAST_IS_UTF8(defaultName)) {
177
0
            return toUTF8(start, length, target, capacity);
178
0
        }
179
0
        converter = u_getDefaultConverter(&status);
180
0
    } else if (*codepage == 0) {
181
        // use the "invariant characters" conversion
182
0
        int32_t destLength;
183
0
        if(length <= capacity) {
184
0
            destLength = length;
185
0
        } else {
186
0
            destLength = capacity;
187
0
        }
188
0
        u_UCharsToChars(getArrayStart() + start, target, destLength);
189
0
        return u_terminateChars(target, capacity, length, &status);
190
0
    } else {
191
0
        converter = ucnv_open(codepage, &status);
192
0
    }
193
194
0
    length = doExtract(start, length, target, capacity, converter, status);
195
196
    // close the converter
197
0
    if (codepage == nullptr) {
198
0
        u_releaseDefaultConverter(converter);
199
0
    } else {
200
0
        ucnv_close(converter);
201
0
    }
202
203
0
    return length;
204
0
}
205
206
int32_t
207
UnicodeString::extract(char *dest, int32_t destCapacity,
208
                       UConverter *cnv,
209
                       UErrorCode &errorCode) const
210
6.09k
{
211
6.09k
    if(U_FAILURE(errorCode)) {
212
0
        return 0;
213
0
    }
214
215
6.09k
    if (isBogus() || destCapacity < 0 || (destCapacity > 0 && dest == nullptr)) {
216
0
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
217
0
        return 0;
218
0
    }
219
220
    // nothing to do?
221
6.09k
    if(isEmpty()) {
222
44
        return u_terminateChars(dest, destCapacity, 0, &errorCode);
223
44
    }
224
225
    // get the converter
226
6.05k
    UBool isDefaultConverter;
227
6.05k
    if (cnv == nullptr) {
228
0
        isDefaultConverter=true;
229
0
        cnv=u_getDefaultConverter(&errorCode);
230
0
        if(U_FAILURE(errorCode)) {
231
0
            return 0;
232
0
        }
233
6.05k
    } else {
234
6.05k
        isDefaultConverter=false;
235
6.05k
        ucnv_resetFromUnicode(cnv);
236
6.05k
    }
237
238
    // convert
239
6.05k
    int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
240
241
    // release the converter
242
6.05k
    if(isDefaultConverter) {
243
0
        u_releaseDefaultConverter(cnv);
244
0
    }
245
246
6.05k
    return len;
247
6.05k
}
248
249
int32_t
250
UnicodeString::doExtract(int32_t start, int32_t length,
251
                         char *dest, int32_t destCapacity,
252
                         UConverter *cnv,
253
                         UErrorCode &errorCode) const
254
6.05k
{
255
6.05k
    if(U_FAILURE(errorCode)) {
256
0
        if(destCapacity!=0) {
257
0
            *dest=0;
258
0
        }
259
0
        return 0;
260
0
    }
261
262
6.05k
    const char16_t *src=getArrayStart()+start, *srcLimit=src+length;
263
6.05k
    char *originalDest=dest;
264
6.05k
    const char *destLimit;
265
266
6.05k
    if(destCapacity==0) {
267
0
        destLimit=dest=nullptr;
268
6.05k
    } else if(destCapacity==-1) {
269
        // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
270
0
        destLimit = static_cast<char*>(U_MAX_PTR(dest));
271
        // for NUL-termination, translate into highest int32_t
272
0
        destCapacity=0x7fffffff;
273
6.05k
    } else {
274
6.05k
        destLimit=dest+destCapacity;
275
6.05k
    }
276
277
    // perform the conversion
278
6.05k
    UErrorCode bufferStatus = U_ZERO_ERROR;
279
6.05k
    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, nullptr, true, &bufferStatus);
280
6.05k
    length = static_cast<int32_t>(dest - originalDest);
281
282
    // if an overflow occurs, then get the preflighting length
283
6.05k
    if(bufferStatus==U_BUFFER_OVERFLOW_ERROR) {
284
635
        char buffer[1024];
285
286
635
        destLimit=buffer+sizeof(buffer);
287
27.2k
        do {
288
27.2k
            dest=buffer;
289
27.2k
            bufferStatus=U_ZERO_ERROR;
290
27.2k
            ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, nullptr, true, &bufferStatus);
291
27.2k
            length += static_cast<int32_t>(dest - buffer);
292
27.2k
        } while(bufferStatus==U_BUFFER_OVERFLOW_ERROR);
293
635
    }
294
6.05k
    if (U_FAILURE(bufferStatus)) {
295
0
        errorCode = bufferStatus;
296
0
    }
297
298
6.05k
    return u_terminateChars(originalDest, destCapacity, length, &errorCode);
299
6.05k
}
300
301
void
302
UnicodeString::doCodepageCreate(const char *codepageData,
303
                                int32_t dataLength,
304
                                const char *codepage)
305
2.49k
{
306
    // if there's nothing to convert, do nothing
307
2.49k
    if (codepageData == nullptr || dataLength == 0 || dataLength < -1) {
308
1
        return;
309
1
    }
310
2.49k
    if(dataLength == -1) {
311
0
        dataLength = static_cast<int32_t>(uprv_strlen(codepageData));
312
0
    }
313
314
2.49k
    UErrorCode status = U_ZERO_ERROR;
315
316
    // create the converter
317
    // if the codepage is the default, use our cache
318
    // if it is an empty string, then use the "invariant character" conversion
319
2.49k
    UConverter *converter;
320
2.49k
    if (codepage == nullptr) {
321
0
        const char *defaultName = ucnv_getDefaultName();
322
0
        if(UCNV_FAST_IS_UTF8(defaultName)) {
323
0
            setToUTF8(StringPiece(codepageData, dataLength));
324
0
            return;
325
0
        }
326
0
        converter = u_getDefaultConverter(&status);
327
2.49k
    } else if (*codepage == 0) {
328
        // use the "invariant characters" conversion
329
0
        if(cloneArrayIfNeeded(dataLength, dataLength, false)) {
330
0
            u_charsToUChars(codepageData, getArrayStart(), dataLength);
331
0
            setLength(dataLength);
332
0
        } else {
333
0
            setToBogus();
334
0
        }
335
0
        return;
336
2.49k
    } else {
337
2.49k
        converter = ucnv_open(codepage, &status);
338
2.49k
    }
339
340
    // if we failed, set the appropriate flags and return
341
2.49k
    if(U_FAILURE(status)) {
342
25
        setToBogus();
343
25
        return;
344
25
    }
345
346
    // perform the conversion
347
2.47k
    doCodepageCreate(codepageData, dataLength, converter, status);
348
2.47k
    if(U_FAILURE(status)) {
349
0
        setToBogus();
350
0
    }
351
352
    // close the converter
353
2.47k
    if (codepage == nullptr) {
354
0
        u_releaseDefaultConverter(converter);
355
2.47k
    } else {
356
2.47k
        ucnv_close(converter);
357
2.47k
    }
358
2.47k
}
359
360
void
361
UnicodeString::doCodepageCreate(const char *codepageData,
362
                                int32_t dataLength,
363
                                UConverter *converter,
364
                                UErrorCode &status)
365
2.47k
{
366
2.47k
    if(U_FAILURE(status)) {
367
0
        return;
368
0
    }
369
370
    // set up the conversion parameters
371
2.47k
    const char *mySource     = codepageData;
372
2.47k
    const char *mySourceEnd  = mySource + dataLength;
373
2.47k
    char16_t *array, *myTarget;
374
375
    // estimate the size needed:
376
2.47k
    int32_t arraySize;
377
2.47k
    if(dataLength <= US_STACKBUF_SIZE) {
378
        // try to use the stack buffer
379
1.55k
        arraySize = US_STACKBUF_SIZE;
380
1.55k
    } else {
381
        // 1.25 char16_t's per source byte should cover most cases
382
916
        arraySize = dataLength + (dataLength >> 2);
383
916
    }
384
385
    // we do not care about the current contents
386
2.47k
    UBool doCopyArray = false;
387
2.47k
    for(;;) {
388
2.47k
        if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
389
0
            setToBogus();
390
0
            break;
391
0
        }
392
393
        // perform the conversion
394
2.47k
        array = getArrayStart();
395
2.47k
        myTarget = array + length();
396
2.47k
        UErrorCode bufferStatus = U_ZERO_ERROR;
397
2.47k
        ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
398
2.47k
            &mySource, mySourceEnd, nullptr, true, &bufferStatus);
399
400
        // update the conversion parameters
401
2.47k
        setLength(static_cast<int32_t>(myTarget - array));
402
403
        // allocate more space and copy data, if needed
404
2.47k
        if(bufferStatus == U_BUFFER_OVERFLOW_ERROR) {
405
            // keep the previous conversion results
406
0
            doCopyArray = true;
407
408
            // estimate the new size needed, larger than before
409
            // try 2 char16_t's per remaining source byte
410
0
            arraySize = static_cast<int32_t>(length() + 2 * (mySourceEnd - mySource));
411
2.47k
        } else {
412
2.47k
            if (U_FAILURE(bufferStatus)) {
413
0
                status = bufferStatus;
414
0
            }
415
2.47k
            break;
416
2.47k
        }
417
2.47k
    }
418
2.47k
}
419
420
U_NAMESPACE_END
421
422
#endif