Coverage Report

Created: 2026-03-12 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/i18n/csdetect.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 **********************************************************************
5
 *   Copyright (C) 2005-2016, International Business Machines
6
 *   Corporation and others.  All Rights Reserved.
7
 **********************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_CONVERSION
13
14
#include "unicode/ucsdet.h"
15
16
#include "csdetect.h"
17
#include "csmatch.h"
18
#include "uenumimp.h"
19
20
#include "cmemory.h"
21
#include "cstring.h"
22
#include "umutex.h"
23
#include "ucln_in.h"
24
#include "uarrsort.h"
25
#include "inputext.h"
26
#include "csrsbcs.h"
27
#include "csrmbcs.h"
28
#include "csrutf8.h"
29
#include "csrucode.h"
30
#include "csr2022.h"
31
32
3
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
33
0
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35
U_NAMESPACE_BEGIN
36
37
struct CSRecognizerInfo : public UMemory {
38
    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39
84
        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
40
41
0
    ~CSRecognizerInfo() {delete recognizer;};
42
43
    CharsetRecognizer *recognizer;
44
    UBool isDefaultEnabled;
45
};
46
47
U_NAMESPACE_END
48
49
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
50
static icu::UInitOnce gCSRecognizersInitOnce;
51
static int32_t fCSRecognizers_size = 0;
52
53
U_CDECL_BEGIN
54
static UBool U_CALLCONV csdet_cleanup(void)
55
0
{
56
0
    U_NAMESPACE_USE
57
0
    if (fCSRecognizers != NULL) {
58
0
        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
59
0
            delete fCSRecognizers[r];
60
0
            fCSRecognizers[r] = NULL;
61
0
        }
62
63
0
        DELETE_ARRAY(fCSRecognizers);
64
0
        fCSRecognizers = NULL;
65
0
        fCSRecognizers_size = 0;
66
0
    }
67
0
    gCSRecognizersInitOnce.reset();
68
69
0
    return TRUE;
70
0
}
71
72
static int32_t U_CALLCONV
73
charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
74
219k
{
75
219k
    U_NAMESPACE_USE
76
77
219k
    const CharsetMatch **csm_l = (const CharsetMatch **) left;
78
219k
    const CharsetMatch **csm_r = (const CharsetMatch **) right;
79
80
    // NOTE: compare is backwards to sort from highest to lowest.
81
219k
    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
82
219k
}
83
84
3
static void U_CALLCONV initRecognizers(UErrorCode &status) {
85
3
    U_NAMESPACE_USE
86
3
    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87
3
    CSRecognizerInfo *tempArray[] = {
88
3
        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
89
90
3
        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
91
3
        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
92
3
        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
93
3
        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
94
95
3
        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
96
3
        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
97
3
        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
98
3
        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
99
3
        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
100
3
        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
101
3
        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
102
3
        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
103
3
        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
104
3
        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
105
3
        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
106
3
        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
107
3
        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
108
3
        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
109
3
        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
110
3
        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
111
112
3
        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
113
3
#if !UCONFIG_ONLY_HTML_CONVERSION
114
3
        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
115
3
        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
116
117
3
        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
118
3
        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
119
3
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
120
3
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
121
3
#endif
122
3
    };
123
3
    int32_t rCount = UPRV_LENGTHOF(tempArray);
124
125
3
    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
126
127
3
    if (fCSRecognizers == NULL) {
128
0
        status = U_MEMORY_ALLOCATION_ERROR;
129
0
    } 
130
3
    else {
131
3
        fCSRecognizers_size = rCount;
132
87
        for (int32_t r = 0; r < rCount; r += 1) {
133
84
            fCSRecognizers[r] = tempArray[r];
134
84
            if (fCSRecognizers[r] == NULL) {
135
0
                status = U_MEMORY_ALLOCATION_ERROR;
136
0
            }
137
84
        }
138
3
    }
139
3
}
140
141
U_CDECL_END
142
143
U_NAMESPACE_BEGIN
144
145
void CharsetDetector::setRecognizers(UErrorCode &status)
146
31.3k
{
147
31.3k
    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
148
31.3k
}
149
150
CharsetDetector::CharsetDetector(UErrorCode &status)
151
31.3k
  : textIn(new InputText(status)), resultArray(NULL),
152
31.3k
    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
153
31.3k
    fEnabledRecognizers(NULL)
154
31.3k
{
155
31.3k
    if (U_FAILURE(status)) {
156
0
        return;
157
0
    }
158
159
31.3k
    setRecognizers(status);
160
161
31.3k
    if (U_FAILURE(status)) {
162
0
        return;
163
0
    }
164
165
31.3k
    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
166
167
31.3k
    if (resultArray == NULL) {
168
0
        status = U_MEMORY_ALLOCATION_ERROR;
169
0
        return;
170
0
    }
171
172
908k
    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
173
877k
        resultArray[i] = new CharsetMatch();
174
175
877k
        if (resultArray[i] == NULL) {
176
0
            status = U_MEMORY_ALLOCATION_ERROR;
177
0
            break;
178
0
        }
179
877k
    }
180
31.3k
}
181
182
CharsetDetector::~CharsetDetector()
183
31.3k
{
184
31.3k
    delete textIn;
185
186
908k
    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
187
877k
        delete resultArray[i];
188
877k
    }
189
190
31.3k
    uprv_free(resultArray);
191
192
31.3k
    if (fEnabledRecognizers) {
193
0
        uprv_free(fEnabledRecognizers);
194
0
    }
195
31.3k
}
196
197
void CharsetDetector::setText(const char *in, int32_t len)
198
31.2k
{
199
31.2k
    textIn->setText(in, len);
200
31.2k
    fFreshTextSet = TRUE;
201
31.2k
}
202
203
UBool CharsetDetector::setStripTagsFlag(UBool flag)
204
2.35k
{
205
2.35k
    UBool temp = fStripTags;
206
2.35k
    fStripTags = flag;
207
2.35k
    fFreshTextSet = TRUE;
208
2.35k
    return temp;
209
2.35k
}
210
211
UBool CharsetDetector::getStripTagsFlag() const
212
2.35k
{
213
2.35k
    return fStripTags;
214
2.35k
}
215
216
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
217
0
{
218
0
    textIn->setDeclaredEncoding(encoding,len);
219
0
}
220
221
int32_t CharsetDetector::getDetectableCount()
222
0
{
223
0
    UErrorCode status = U_ZERO_ERROR;
224
225
0
    setRecognizers(status);
226
227
0
    return fCSRecognizers_size; 
228
0
}
229
230
const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
231
8.07k
{
232
8.07k
    int32_t maxMatchesFound = 0;
233
234
8.07k
    detectAll(maxMatchesFound, status);
235
236
8.07k
    if(maxMatchesFound > 0) {
237
7.99k
        return resultArray[0];
238
7.99k
    } else {
239
72
        return NULL;
240
72
    }
241
8.07k
}
242
243
const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
244
31.2k
{
245
31.2k
    if(!textIn->isSet()) {
246
0
        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
247
248
0
        return NULL;
249
31.2k
    } else if (fFreshTextSet) {
250
31.2k
        CharsetRecognizer *csr;
251
31.2k
        int32_t            i;
252
253
31.2k
        textIn->MungeInput(fStripTags);
254
255
        // Iterate over all possible charsets, remember all that
256
        // give a match quality > 0.
257
31.2k
        resultCount = 0;
258
906k
        for (i = 0; i < fCSRecognizers_size; i += 1) {
259
875k
            csr = fCSRecognizers[i]->recognizer;
260
875k
            if (csr->match(textIn, resultArray[resultCount])) {
261
116k
                resultCount++;
262
116k
            }
263
875k
        }
264
265
31.2k
        if (resultCount > 1) {
266
26.5k
            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
267
26.5k
        }
268
31.2k
        fFreshTextSet = FALSE;
269
31.2k
    }
270
271
31.2k
    maxMatchesFound = resultCount;
272
273
31.2k
    return resultArray;
274
31.2k
}
275
276
void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
277
0
{
278
0
    if (U_FAILURE(status)) {
279
0
        return;
280
0
    }
281
282
0
    int32_t modIdx = -1;
283
0
    UBool isDefaultVal = FALSE;
284
0
    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
285
0
        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
286
0
        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
287
0
            modIdx = i;
288
0
            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
289
0
            break;
290
0
        }
291
0
    }
292
0
    if (modIdx < 0) {
293
        // No matching encoding found
294
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
295
0
        return;
296
0
    }
297
298
0
    if (fEnabledRecognizers == NULL && !isDefaultVal) {
299
        // Create an array storing the non default setting
300
0
        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
301
0
        if (fEnabledRecognizers == NULL) {
302
0
            status = U_MEMORY_ALLOCATION_ERROR;
303
0
            return;
304
0
        }
305
        // Initialize the array with default info
306
0
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
307
0
            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
308
0
        }
309
0
    }
310
311
0
    if (fEnabledRecognizers != NULL) {
312
0
        fEnabledRecognizers[modIdx] = enabled;
313
0
    }
314
0
}
315
316
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
317
{
318
    if( index > fCSRecognizers_size-1 || index < 0) {
319
        status = U_INDEX_OUTOFBOUNDS_ERROR;
320
321
        return 0;
322
    } else {
323
        return fCSRecognizers[index]->getName();
324
    }
325
}*/
326
327
U_NAMESPACE_END
328
329
U_CDECL_BEGIN
330
typedef struct {
331
    int32_t currIndex;
332
    UBool all;
333
    UBool *enabledRecognizers;
334
} Context;
335
336
337
338
static void U_CALLCONV
339
0
enumClose(UEnumeration *en) {
340
0
    if(en->context != NULL) {
341
0
        DELETE_ARRAY(en->context);
342
0
    }
343
344
0
    DELETE_ARRAY(en);
345
0
}
346
347
static int32_t U_CALLCONV
348
0
enumCount(UEnumeration *en, UErrorCode *) {
349
0
    if (((Context *)en->context)->all) {
350
        // ucsdet_getAllDetectableCharsets, all charset detector names
351
0
        return fCSRecognizers_size;
352
0
    }
353
354
    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
355
0
    int32_t count = 0;
356
0
    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
357
0
    if (enabledArray != NULL) {
358
        // custom set
359
0
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
360
0
            if (enabledArray[i]) {
361
0
                count++;
362
0
            }
363
0
        }
364
0
    } else {
365
        // default set
366
0
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
367
0
            if (fCSRecognizers[i]->isDefaultEnabled) {
368
0
                count++;
369
0
            }
370
0
        }
371
0
    }
372
0
    return count;
373
0
}
374
375
static const char* U_CALLCONV
376
0
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
377
0
    const char *currName = NULL;
378
379
0
    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
380
0
        if (((Context *)en->context)->all) {
381
            // ucsdet_getAllDetectableCharsets, all charset detector names
382
0
            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
383
0
            ((Context *)en->context)->currIndex++;
384
0
        } else {
385
            // ucsdet_getDetectableCharsets
386
0
            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
387
0
            if (enabledArray != NULL) {
388
                // custome set
389
0
                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
390
0
                    if (enabledArray[((Context *)en->context)->currIndex]) {
391
0
                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
392
0
                    }
393
0
                    ((Context *)en->context)->currIndex++;
394
0
                }
395
0
            } else {
396
                // default set
397
0
                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
398
0
                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
399
0
                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
400
0
                    }
401
0
                    ((Context *)en->context)->currIndex++;
402
0
                }
403
0
            }
404
0
        }
405
0
    }
406
407
0
    if(resultLength != NULL) {
408
0
        *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
409
0
    }
410
411
0
    return currName;
412
0
}
413
414
415
static void U_CALLCONV
416
0
enumReset(UEnumeration *en, UErrorCode *) {
417
0
    ((Context *)en->context)->currIndex = 0;
418
0
}
419
420
static const UEnumeration gCSDetEnumeration = {
421
    NULL,
422
    NULL,
423
    enumClose,
424
    enumCount,
425
    uenum_unextDefault,
426
    enumNext,
427
    enumReset
428
};
429
430
U_CDECL_END
431
432
U_NAMESPACE_BEGIN
433
434
UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
435
0
{
436
437
    /* Initialize recognized charsets. */
438
0
    setRecognizers(status);
439
440
0
    if(U_FAILURE(status)) {
441
0
        return 0;
442
0
    }
443
444
0
    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
445
0
    if (en == NULL) {
446
0
        status = U_MEMORY_ALLOCATION_ERROR;
447
0
        return 0;
448
0
    }
449
0
    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
450
0
    en->context = (void*)NEW_ARRAY(Context, 1);
451
0
    if (en->context == NULL) {
452
0
        status = U_MEMORY_ALLOCATION_ERROR;
453
0
        DELETE_ARRAY(en);
454
0
        return 0;
455
0
    }
456
0
    uprv_memset(en->context, 0, sizeof(Context));
457
0
    ((Context*)en->context)->all = TRUE;
458
0
    return en;
459
0
}
460
461
UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
462
0
{
463
0
    if(U_FAILURE(status)) {
464
0
        return 0;
465
0
    }
466
467
0
    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
468
0
    if (en == NULL) {
469
0
        status = U_MEMORY_ALLOCATION_ERROR;
470
0
        return 0;
471
0
    }
472
0
    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
473
0
    en->context = (void*)NEW_ARRAY(Context, 1);
474
0
    if (en->context == NULL) {
475
0
        status = U_MEMORY_ALLOCATION_ERROR;
476
0
        DELETE_ARRAY(en);
477
0
        return 0;
478
0
    }
479
0
    uprv_memset(en->context, 0, sizeof(Context));
480
0
    ((Context*)en->context)->all = FALSE;
481
0
    ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
482
0
    return en;
483
0
}
484
485
U_NAMESPACE_END
486
487
#endif