Coverage Report

Created: 2026-06-13 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/i18n/csdetect.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 **********************************************************************
5
 *   Copyright (C) 2005-2016, International Business Machines
6
 *   Corporation and others.  All Rights Reserved.
7
 **********************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_CONVERSION
13
14
#include "unicode/ucsdet.h"
15
16
#include "csdetect.h"
17
#include "csmatch.h"
18
#include "uenumimp.h"
19
20
#include "cmemory.h"
21
#include "cstring.h"
22
#include "umutex.h"
23
#include "ucln_in.h"
24
#include "uarrsort.h"
25
#include "inputext.h"
26
#include "csrsbcs.h"
27
#include "csrmbcs.h"
28
#include "csrutf8.h"
29
#include "csrucode.h"
30
#include "csr2022.h"
31
32
3
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
33
0
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35
U_NAMESPACE_BEGIN
36
37
struct CSRecognizerInfo : public UMemory {
38
    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39
84
        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
40
41
0
    ~CSRecognizerInfo() {delete recognizer;}
42
43
    CharsetRecognizer *recognizer;
44
    UBool isDefaultEnabled;
45
};
46
47
U_NAMESPACE_END
48
49
static icu::CSRecognizerInfo **fCSRecognizers = nullptr;
50
static icu::UInitOnce gCSRecognizersInitOnce {};
51
static int32_t fCSRecognizers_size = 0;
52
53
U_CDECL_BEGIN
54
static UBool U_CALLCONV csdet_cleanup()
55
0
{
56
0
    U_NAMESPACE_USE
57
0
    if (fCSRecognizers != nullptr) {
58
0
        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
59
0
            delete fCSRecognizers[r];
60
0
            fCSRecognizers[r] = nullptr;
61
0
        }
62
63
0
        DELETE_ARRAY(fCSRecognizers);
64
0
        fCSRecognizers = nullptr;
65
0
        fCSRecognizers_size = 0;
66
0
    }
67
0
    gCSRecognizersInitOnce.reset();
68
69
0
    return true;
70
0
}
71
72
static int32_t U_CALLCONV
73
charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
74
1.85M
{
75
1.85M
    U_NAMESPACE_USE
76
77
1.85M
    const CharsetMatch **csm_l = (const CharsetMatch **) left;
78
1.85M
    const CharsetMatch **csm_r = (const CharsetMatch **) right;
79
80
    // NOTE: compare is backwards to sort from highest to lowest.
81
1.85M
    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
82
1.85M
}
83
84
3
static void U_CALLCONV initRecognizers(UErrorCode &status) {
85
3
    U_NAMESPACE_USE
86
3
    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87
3
    CSRecognizerInfo *tempArray[] = {
88
3
        new CSRecognizerInfo(new CharsetRecog_UTF8(), true),
89
90
3
        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), true),
91
3
        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), true),
92
3
        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), true),
93
3
        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), true),
94
95
3
        new CSRecognizerInfo(new CharsetRecog_8859_1(), true),
96
3
        new CSRecognizerInfo(new CharsetRecog_8859_2(), true),
97
3
        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), true),
98
3
        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), true),
99
3
        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), true),
100
3
        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), true),
101
3
        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), true),
102
3
        new CSRecognizerInfo(new CharsetRecog_windows_1251(), true),
103
3
        new CSRecognizerInfo(new CharsetRecog_windows_1256(), true),
104
3
        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), true),
105
3
        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), true),
106
3
        new CSRecognizerInfo(new CharsetRecog_sjis(), true),
107
3
        new CSRecognizerInfo(new CharsetRecog_gb_18030(), true),
108
3
        new CSRecognizerInfo(new CharsetRecog_euc_jp(), true),
109
3
        new CSRecognizerInfo(new CharsetRecog_euc_kr(), true),
110
3
        new CSRecognizerInfo(new CharsetRecog_big5(), true),
111
112
3
        new CSRecognizerInfo(new CharsetRecog_2022JP(), true),
113
3
#if !UCONFIG_ONLY_HTML_CONVERSION
114
3
        new CSRecognizerInfo(new CharsetRecog_2022KR(), true),
115
3
        new CSRecognizerInfo(new CharsetRecog_2022CN(), true),
116
117
3
        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), false),
118
3
        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), false),
119
3
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), false),
120
3
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), false)
121
3
#endif
122
3
    };
123
3
    int32_t rCount = UPRV_LENGTHOF(tempArray);
124
125
3
    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
126
127
3
    if (fCSRecognizers == nullptr) {
128
0
        status = U_MEMORY_ALLOCATION_ERROR;
129
0
    } 
130
3
    else {
131
3
        fCSRecognizers_size = rCount;
132
87
        for (int32_t r = 0; r < rCount; r += 1) {
133
84
            fCSRecognizers[r] = tempArray[r];
134
84
            if (fCSRecognizers[r] == nullptr) {
135
0
                status = U_MEMORY_ALLOCATION_ERROR;
136
0
            }
137
84
        }
138
3
    }
139
3
}
140
141
U_CDECL_END
142
143
U_NAMESPACE_BEGIN
144
145
void CharsetDetector::setRecognizers(UErrorCode &status)
146
659k
{
147
659k
    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
148
659k
}
149
150
CharsetDetector::CharsetDetector(UErrorCode &status)
151
659k
  : textIn(new InputText(status)), resultArray(nullptr),
152
659k
    resultCount(0), fStripTags(false), fFreshTextSet(false),
153
659k
    fEnabledRecognizers(nullptr)
154
659k
{
155
659k
    if (U_FAILURE(status)) {
156
0
        return;
157
0
    }
158
159
659k
    setRecognizers(status);
160
161
659k
    if (U_FAILURE(status)) {
162
0
        return;
163
0
    }
164
165
659k
    resultArray = static_cast<CharsetMatch**>(uprv_malloc(sizeof(CharsetMatch*) * fCSRecognizers_size));
166
167
659k
    if (resultArray == nullptr) {
168
0
        status = U_MEMORY_ALLOCATION_ERROR;
169
0
        return;
170
0
    }
171
172
19.1M
    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
173
18.4M
        resultArray[i] = new CharsetMatch();
174
175
18.4M
        if (resultArray[i] == nullptr) {
176
0
            status = U_MEMORY_ALLOCATION_ERROR;
177
0
            break;
178
0
        }
179
18.4M
    }
180
659k
}
181
182
CharsetDetector::~CharsetDetector()
183
659k
{
184
659k
    delete textIn;
185
186
19.1M
    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
187
18.4M
        delete resultArray[i];
188
18.4M
    }
189
190
659k
    uprv_free(resultArray);
191
192
659k
    if (fEnabledRecognizers) {
193
0
        uprv_free(fEnabledRecognizers);
194
0
    }
195
659k
}
196
197
void CharsetDetector::setText(const char *in, int32_t len)
198
659k
{
199
659k
    textIn->setText(in, len);
200
659k
    fFreshTextSet = true;
201
659k
}
202
203
UBool CharsetDetector::setStripTagsFlag(UBool flag)
204
629k
{
205
629k
    UBool temp = fStripTags;
206
629k
    fStripTags = flag;
207
629k
    fFreshTextSet = true;
208
629k
    return temp;
209
629k
}
210
211
UBool CharsetDetector::getStripTagsFlag() const
212
629k
{
213
629k
    return fStripTags;
214
629k
}
215
216
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
217
0
{
218
0
    textIn->setDeclaredEncoding(encoding,len);
219
0
}
220
221
int32_t CharsetDetector::getDetectableCount()
222
0
{
223
0
    UErrorCode status = U_ZERO_ERROR;
224
225
0
    setRecognizers(status);
226
227
0
    return fCSRecognizers_size; 
228
0
}
229
230
const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
231
635k
{
232
635k
    int32_t maxMatchesFound = 0;
233
234
635k
    detectAll(maxMatchesFound, status);
235
236
635k
    if(maxMatchesFound > 0) {
237
594k
        return resultArray[0];
238
594k
    } else {
239
40.4k
        return nullptr;
240
40.4k
    }
241
635k
}
242
243
const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
244
659k
{
245
659k
    if(!textIn->isSet()) {
246
0
        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
247
248
0
        return nullptr;
249
659k
    } else if (fFreshTextSet) {
250
659k
        CharsetRecognizer *csr;
251
659k
        int32_t            i;
252
253
659k
        textIn->MungeInput(fStripTags);
254
255
        // Iterate over all possible charsets, remember all that
256
        // give a match quality > 0.
257
659k
        resultCount = 0;
258
19.1M
        for (i = 0; i < fCSRecognizers_size; i += 1) {
259
18.4M
            csr = fCSRecognizers[i]->recognizer;
260
18.4M
            if (csr->match(textIn, resultArray[resultCount])) {
261
1.39M
                resultCount++;
262
1.39M
            }
263
18.4M
        }
264
265
659k
        if (resultCount > 1) {
266
261k
            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, nullptr, true, &status);
267
261k
        }
268
659k
        fFreshTextSet = false;
269
659k
    }
270
271
659k
    maxMatchesFound = resultCount;
272
273
659k
    if (maxMatchesFound == 0) {
274
41.3k
        status = U_INVALID_CHAR_FOUND;
275
41.3k
        return nullptr;
276
41.3k
    }
277
278
618k
    return resultArray;
279
659k
}
280
281
void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
282
0
{
283
0
    if (U_FAILURE(status)) {
284
0
        return;
285
0
    }
286
287
0
    int32_t modIdx = -1;
288
0
    UBool isDefaultVal = false;
289
0
    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
290
0
        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
291
0
        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
292
0
            modIdx = i;
293
0
            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
294
0
            break;
295
0
        }
296
0
    }
297
0
    if (modIdx < 0) {
298
        // No matching encoding found
299
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
300
0
        return;
301
0
    }
302
303
0
    if (fEnabledRecognizers == nullptr && !isDefaultVal) {
304
        // Create an array storing the non default setting
305
0
        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
306
0
        if (fEnabledRecognizers == nullptr) {
307
0
            status = U_MEMORY_ALLOCATION_ERROR;
308
0
            return;
309
0
        }
310
        // Initialize the array with default info
311
0
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
312
0
            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
313
0
        }
314
0
    }
315
316
0
    if (fEnabledRecognizers != nullptr) {
317
0
        fEnabledRecognizers[modIdx] = enabled;
318
0
    }
319
0
}
320
321
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
322
{
323
    if( index > fCSRecognizers_size-1 || index < 0) {
324
        status = U_INDEX_OUTOFBOUNDS_ERROR;
325
326
        return 0;
327
    } else {
328
        return fCSRecognizers[index]->getName();
329
    }
330
}*/
331
332
U_NAMESPACE_END
333
334
U_CDECL_BEGIN
335
typedef struct {
336
    int32_t currIndex;
337
    UBool all;
338
    UBool *enabledRecognizers;
339
} Context;
340
341
342
343
static void U_CALLCONV
344
0
enumClose(UEnumeration *en) {
345
0
    if(en->context != nullptr) {
346
0
        DELETE_ARRAY(en->context);
347
0
    }
348
349
0
    DELETE_ARRAY(en);
350
0
}
351
352
static int32_t U_CALLCONV
353
0
enumCount(UEnumeration *en, UErrorCode *) {
354
0
    if (((Context *)en->context)->all) {
355
        // ucsdet_getAllDetectableCharsets, all charset detector names
356
0
        return fCSRecognizers_size;
357
0
    }
358
359
    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
360
0
    int32_t count = 0;
361
0
    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
362
0
    if (enabledArray != nullptr) {
363
        // custom set
364
0
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
365
0
            if (enabledArray[i]) {
366
0
                count++;
367
0
            }
368
0
        }
369
0
    } else {
370
        // default set
371
0
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
372
0
            if (fCSRecognizers[i]->isDefaultEnabled) {
373
0
                count++;
374
0
            }
375
0
        }
376
0
    }
377
0
    return count;
378
0
}
379
380
static const char* U_CALLCONV
381
0
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
382
0
    const char *currName = nullptr;
383
384
0
    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
385
0
        if (((Context *)en->context)->all) {
386
            // ucsdet_getAllDetectableCharsets, all charset detector names
387
0
            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
388
0
            ((Context *)en->context)->currIndex++;
389
0
        } else {
390
            // ucsdet_getDetectableCharsets
391
0
            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
392
0
            if (enabledArray != nullptr) {
393
                // custom set
394
0
                while (currName == nullptr && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
395
0
                    if (enabledArray[((Context *)en->context)->currIndex]) {
396
0
                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
397
0
                    }
398
0
                    ((Context *)en->context)->currIndex++;
399
0
                }
400
0
            } else {
401
                // default set
402
0
                while (currName == nullptr && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
403
0
                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
404
0
                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
405
0
                    }
406
0
                    ((Context *)en->context)->currIndex++;
407
0
                }
408
0
            }
409
0
        }
410
0
    }
411
412
0
    if(resultLength != nullptr) {
413
0
        *resultLength = currName == nullptr ? 0 : (int32_t)uprv_strlen(currName);
414
0
    }
415
416
0
    return currName;
417
0
}
418
419
420
static void U_CALLCONV
421
0
enumReset(UEnumeration *en, UErrorCode *) {
422
0
    ((Context *)en->context)->currIndex = 0;
423
0
}
424
425
static const UEnumeration gCSDetEnumeration = {
426
    nullptr,
427
    nullptr,
428
    enumClose,
429
    enumCount,
430
    uenum_unextDefault,
431
    enumNext,
432
    enumReset
433
};
434
435
U_CDECL_END
436
437
U_NAMESPACE_BEGIN
438
439
UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
440
0
{
441
442
    /* Initialize recognized charsets. */
443
0
    setRecognizers(status);
444
445
0
    if(U_FAILURE(status)) {
446
0
        return nullptr;
447
0
    }
448
449
0
    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
450
0
    if (en == nullptr) {
451
0
        status = U_MEMORY_ALLOCATION_ERROR;
452
0
        return nullptr;
453
0
    }
454
0
    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
455
0
    en->context = (void*)NEW_ARRAY(Context, 1);
456
0
    if (en->context == nullptr) {
457
0
        status = U_MEMORY_ALLOCATION_ERROR;
458
0
        DELETE_ARRAY(en);
459
0
        return nullptr;
460
0
    }
461
0
    uprv_memset(en->context, 0, sizeof(Context));
462
0
    static_cast<Context*>(en->context)->all = true;
463
0
    return en;
464
0
}
465
466
UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
467
0
{
468
0
    if(U_FAILURE(status)) {
469
0
        return nullptr;
470
0
    }
471
472
0
    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
473
0
    if (en == nullptr) {
474
0
        status = U_MEMORY_ALLOCATION_ERROR;
475
0
        return nullptr;
476
0
    }
477
0
    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
478
0
    en->context = (void*)NEW_ARRAY(Context, 1);
479
0
    if (en->context == nullptr) {
480
0
        status = U_MEMORY_ALLOCATION_ERROR;
481
0
        DELETE_ARRAY(en);
482
0
        return nullptr;
483
0
    }
484
0
    uprv_memset(en->context, 0, sizeof(Context));
485
0
    static_cast<Context*>(en->context)->all = false;
486
0
    static_cast<Context*>(en->context)->enabledRecognizers = fEnabledRecognizers;
487
0
    return en;
488
0
}
489
490
U_NAMESPACE_END
491
492
#endif