Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/i18n/csdetect.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 **********************************************************************
5
 *   Copyright (C) 2005-2016, International Business Machines
6
 *   Corporation and others.  All Rights Reserved.
7
 **********************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_CONVERSION
13
14
#include "unicode/ucsdet.h"
15
16
#include "csdetect.h"
17
#include "csmatch.h"
18
#include "uenumimp.h"
19
20
#include "cmemory.h"
21
#include "cstring.h"
22
#include "umutex.h"
23
#include "ucln_in.h"
24
#include "uarrsort.h"
25
#include "inputext.h"
26
#include "csrsbcs.h"
27
#include "csrmbcs.h"
28
#include "csrutf8.h"
29
#include "csrucode.h"
30
#include "csr2022.h"
31
32
0
/* Obtains raw storage for `count` objects of `type` from uprv_malloc. No constructors are run. */
#define NEW_ARRAY(type,count) static_cast<type *>(uprv_malloc(sizeof(type) * (count)))
/* Hands storage back to uprv_free. No destructors are run. */
#define DELETE_ARRAY(array) uprv_free(static_cast<void *>(array))
34
35
U_NAMESPACE_BEGIN
36
37
struct CSRecognizerInfo : public UMemory {
38
    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39
0
        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
40
41
0
    ~CSRecognizerInfo() {delete recognizer;};
42
43
    CharsetRecognizer *recognizer;
44
    UBool isDefaultEnabled;
45
};
46
47
U_NAMESPACE_END
48
49
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
50
static icu::UInitOnce gCSRecognizersInitOnce;
51
static int32_t fCSRecognizers_size = 0;
52
53
U_CDECL_BEGIN
54
/**
 * Cleanup callback registered by initRecognizers(): deletes every entry of
 * the recognizer table, frees the table itself, and resets the init-once
 * guard.
 *
 * @return always TRUE
 */
static UBool U_CALLCONV csdet_cleanup(void)
{
    U_NAMESPACE_USE

    if (fCSRecognizers != NULL) {
        for (int32_t idx = 0; idx < fCSRecognizers_size; ++idx) {
            delete fCSRecognizers[idx];
            fCSRecognizers[idx] = NULL;
        }

        DELETE_ARRAY(fCSRecognizers);
        fCSRecognizers = NULL;
        fCSRecognizers_size = 0;
    }

    gCSRecognizersInitOnce.reset();
    return TRUE;
}
71
72
/**
 * Comparison callback handed to uprv_sortArray() by detectAll().
 *
 * `left` and `right` each point at an array slot holding a CharsetMatch
 * pointer. The operands are deliberately swapped in the subtraction so that
 * higher confidence values sort first.
 */
static int32_t U_CALLCONV
charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
{
    U_NAMESPACE_USE

    const CharsetMatch *lhs = *static_cast<const CharsetMatch * const *>(left);
    const CharsetMatch *rhs = *static_cast<const CharsetMatch * const *>(right);

    return rhs->getConfidence() - lhs->getConfidence();
}
83
84
0
/**
 * One-time initializer for the shared recognizer table (invoked through
 * umtx_initOnce by CharsetDetector::setRecognizers).
 *
 * Registers csdet_cleanup(), builds one CSRecognizerInfo per supported
 * charset and publishes them in fCSRecognizers / fCSRecognizers_size.
 *
 * The table is published only if every allocation succeeded. On any failure
 * all objects created so far are deleted, the globals are left untouched
 * (NULL / 0) and `status` is set to U_MEMORY_ALLOCATION_ERROR, so no caller
 * can ever observe a table containing NULL entries or NULL recognizers.
 *
 * @param status receives U_MEMORY_ALLOCATION_ERROR on allocation failure
 */
static void U_CALLCONV initRecognizers(UErrorCode &status) {
    U_NAMESPACE_USE
    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
    CSRecognizerInfo *tempArray[] = {
        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),

        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),

        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),

        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
#if !UCONFIG_ONLY_HTML_CONVERSION
        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),

        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
#endif
    };
    int32_t rCount = UPRV_LENGTHOF(tempArray);

    // Both the wrapper and the recognizer it owns must exist for every slot;
    // later code dereferences them without further checks.
    UBool allAllocated = TRUE;
    for (int32_t r = 0; r < rCount; r += 1) {
        if (tempArray[r] == NULL || tempArray[r]->recognizer == NULL) {
            allAllocated = FALSE;
        }
    }

    CSRecognizerInfo **table = NULL;
    if (allAllocated) {
        table = NEW_ARRAY(CSRecognizerInfo *, rCount);
    }

    if (table == NULL) {
        // Nothing gets published, so release everything built above instead
        // of leaking it. Deleting a NULL entry is a no-op.
        for (int32_t r = 0; r < rCount; r += 1) {
            delete tempArray[r];
        }
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (int32_t r = 0; r < rCount; r += 1) {
        table[r] = tempArray[r];
    }

    fCSRecognizers = table;
    fCSRecognizers_size = rCount;
}
140
141
U_CDECL_END
142
143
U_NAMESPACE_BEGIN
144
145
/**
 * Makes sure the shared recognizer table has been built, running
 * initRecognizers() through the init-once guard.
 *
 * @param status passed through to umtx_initOnce
 */
void CharsetDetector::setRecognizers(UErrorCode &status)
{
    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
}
149
150
/**
 * Builds a detector: allocates the input-text holder, makes sure the shared
 * recognizer table exists, and pre-allocates one CharsetMatch per recognizer
 * in `resultArray`.
 *
 * On failure `status` is set and the object is left in a state that is safe
 * to destroy: `resultArray` is either NULL or fully initialised, with NULL in
 * every slot that could not be allocated.
 *
 * @param status in/out error code; U_MEMORY_ALLOCATION_ERROR is reported
 *               when any allocation fails
 */
CharsetDetector::CharsetDetector(UErrorCode &status)
  : textIn(new InputText(status)), resultArray(NULL),
    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
    fEnabledRecognizers(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    // NOTE(review): presumably `new` yields NULL on failure here rather than
    // throwing (the rest of this file tests `new` results against NULL).
    // Without this check setText()/detectAll() would dereference NULL.
    if (textIn == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    setRecognizers(status);

    if (U_FAILURE(status)) {
        return;
    }

    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);

    if (resultArray == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Clear every slot first: if an allocation below fails part-way, the
    // remaining slots must hold NULL so that deleting them is harmless.
    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
        resultArray[i] = NULL;
    }

    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
        resultArray[i] = new CharsetMatch();

        if (resultArray[i] == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            break;
        }
    }
}
181
182
/**
 * Releases the input-text holder, every pre-allocated CharsetMatch, the
 * result array itself and the optional per-detector enabled-flags array.
 *
 * `resultArray` stays NULL when the constructor returned early, so it is
 * checked before its slots are touched.
 */
CharsetDetector::~CharsetDetector()
{
    delete textIn;

    if (resultArray != NULL) {
        for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
            delete resultArray[i];
        }

        uprv_free(resultArray);
    }

    if (fEnabledRecognizers) {
        uprv_free(fEnabledRecognizers);
    }
}
196
197
void CharsetDetector::setText(const char *in, int32_t len)
198
0
{
199
0
    textIn->setText(in, len);
200
0
    fFreshTextSet = TRUE;
201
0
}
202
203
/**
 * Stores a new strip-tags setting and marks the text as fresh so the next
 * detectAll() re-processes the input with the new setting.
 *
 * @param flag new value of the setting
 * @return the value the setting had before this call
 */
UBool CharsetDetector::setStripTagsFlag(UBool flag)
{
    const UBool previous = fStripTags;

    fStripTags = flag;
    fFreshTextSet = TRUE;

    return previous;
}
210
211
/** @return the current strip-tags setting. */
UBool CharsetDetector::getStripTagsFlag() const
{
    return fStripTags;
}
215
216
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
217
0
{
218
0
    textIn->setDeclaredEncoding(encoding,len);
219
0
}
220
221
/**
 * Ensures the shared recognizer table has been initialised and reports its
 * size. The initialisation status is not surfaced to the caller.
 *
 * @return the current value of fCSRecognizers_size
 */
int32_t CharsetDetector::getDetectableCount()
{
    UErrorCode initStatus = U_ZERO_ERROR;
    setRecognizers(initStatus);

    return fCSRecognizers_size;
}
229
230
/**
 * Runs detectAll() and returns the first entry of the result array.
 *
 * @param status in/out error code, forwarded to detectAll()
 * @return the first match, or NULL when detectAll() reported no matches
 */
const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
{
    int32_t matchCount = 0;
    detectAll(matchCount, status);

    if (matchCount <= 0) {
        return NULL;
    }

    return resultArray[0];
}
242
243
/**
 * Runs every recognizer in the shared table against the current input and
 * returns the matches ordered by charsetMatchComparator (highest confidence
 * first).
 *
 * The recognizers are only re-run when the text or the strip-tags flag has
 * changed since the previous call; otherwise the cached results are returned.
 *
 * @param maxMatchesFound receives the number of valid entries in the result;
 *                        left untouched when no input text has been set
 * @param status          set to U_MISSING_RESOURCE_ERROR when no input text
 *                        has been set; also receives any sort error
 * @return the detector-owned result array, or NULL when no text is set
 */
const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
{
    if (!textIn->isSet()) {
        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set

        return NULL;
    }

    if (fFreshTextSet) {
        textIn->MungeInput(fStripTags);

        // Try each charset in turn; a recognizer that reports a match has
        // filled the next free result slot, so keep that slot.
        resultCount = 0;
        for (int32_t idx = 0; idx < fCSRecognizers_size; ++idx) {
            CharsetRecognizer *candidate = fCSRecognizers[idx]->recognizer;

            if (candidate->match(textIn, resultArray[resultCount])) {
                resultCount += 1;
            }
        }

        if (resultCount > 1) {
            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
        }

        fFreshTextSet = FALSE;
    }

    maxMatchesFound = resultCount;

    return resultArray;
}
275
276
void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
277
0
{
278
0
    if (U_FAILURE(status)) {
279
0
        return;
280
0
    }
281
0
282
0
    int32_t modIdx = -1;
283
0
    UBool isDefaultVal = FALSE;
284
0
    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
285
0
        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
286
0
        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
287
0
            modIdx = i;
288
0
            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
289
0
            break;
290
0
        }
291
0
    }
292
0
    if (modIdx < 0) {
293
0
        // No matching encoding found
294
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
295
0
        return;
296
0
    }
297
0
298
0
    if (fEnabledRecognizers == NULL && !isDefaultVal) {
299
0
        // Create an array storing the non default setting
300
0
        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
301
0
        if (fEnabledRecognizers == NULL) {
302
0
            status = U_MEMORY_ALLOCATION_ERROR;
303
0
            return;
304
0
        }
305
0
        // Initialize the array with default info
306
0
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
307
0
            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
308
0
        }
309
0
    }
310
0
311
0
    if (fEnabledRecognizers != NULL) {
312
0
        fEnabledRecognizers[modIdx] = enabled;
313
0
    }
314
0
}
315
316
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
317
{
318
    if( index > fCSRecognizers_size-1 || index < 0) {
319
        status = U_INDEX_OUTOFBOUNDS_ERROR;
320
321
        return 0;
322
    } else {
323
        return fCSRecognizers[index]->getName();
324
    }
325
}*/
326
327
U_NAMESPACE_END
328
329
U_CDECL_BEGIN
330
typedef struct {
331
    int32_t currIndex;
332
    UBool all;
333
    UBool *enabledRecognizers;
334
} Context;
335
336
337
338
static void U_CALLCONV
339
0
enumClose(UEnumeration *en) {
340
0
    if(en->context != NULL) {
341
0
        DELETE_ARRAY(en->context);
342
0
    }
343
0
344
0
    DELETE_ARRAY(en);
345
0
}
346
347
/**
 * Count callback.
 *
 * For an "all" enumeration (ucsdet_getAllDetectableCharsets) this is the
 * size of the recognizer table. Otherwise (ucsdet_getDetectableCharsets) it
 * is the number of enabled recognizers, read from the context's custom flag
 * array when one is present and from the recognizers' defaults when not.
 */
static int32_t U_CALLCONV
enumCount(UEnumeration *en, UErrorCode *) {
    const Context *state = (const Context *)en->context;

    if (state->all) {
        return fCSRecognizers_size;
    }

    const UBool *customFlags = state->enabledRecognizers;
    int32_t enabledCount = 0;

    for (int32_t idx = 0; idx < fCSRecognizers_size; ++idx) {
        // custom set when available, default set otherwise
        const UBool isOn = (customFlags != NULL)
                               ? customFlags[idx]
                               : fCSRecognizers[idx]->isDefaultEnabled;
        if (isOn) {
            ++enabledCount;
        }
    }

    return enabledCount;
}
374
375
static const char* U_CALLCONV
376
0
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
377
0
    const char *currName = NULL;
378
0
379
0
    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
380
0
        if (((Context *)en->context)->all) {
381
0
            // ucsdet_getAllDetectableCharsets, all charset detector names
382
0
            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
383
0
            ((Context *)en->context)->currIndex++;
384
0
        } else {
385
0
            // ucsdet_getDetectableCharsets
386
0
            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
387
0
            if (enabledArray != NULL) {
388
0
                // custome set
389
0
                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
390
0
                    if (enabledArray[((Context *)en->context)->currIndex]) {
391
0
                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
392
0
                    }
393
0
                    ((Context *)en->context)->currIndex++;
394
0
                }
395
0
            } else {
396
0
                // default set
397
0
                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
398
0
                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
399
0
                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
400
0
                    }
401
0
                    ((Context *)en->context)->currIndex++;
402
0
                }
403
0
            }
404
0
        }
405
0
    }
406
0
407
0
    if(resultLength != NULL) {
408
0
        *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
409
0
    }
410
0
411
0
    return currName;
412
0
}
413
414
415
static void U_CALLCONV
416
0
enumReset(UEnumeration *en, UErrorCode *) {
417
0
    ((Context *)en->context)->currIndex = 0;
418
0
}
419
420
/*
 * Template copied into every enumeration created by
 * getAllDetectableCharsets() / getDetectableCharsets(). The per-instance
 * context pointer is filled in by those functions after the copy.
 */
static const UEnumeration gCSDetEnumeration = {
    NULL,               /* NOTE(review): presumably the base-context slot -- confirm against uenumimp.h */
    NULL,               /* NOTE(review): presumably the context slot, set per instance -- confirm */
    enumClose,          /* close */
    enumCount,          /* count */
    uenum_unextDefault, /* UChar-based next, default implementation */
    enumNext,           /* char-based next */
    enumReset           /* reset */
};
429
430
U_CDECL_END
431
432
U_NAMESPACE_BEGIN
433
434
/**
 * Creates an enumeration over the names of every recognizer in the shared
 * table, regardless of enabled state.
 *
 * @param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR when the
 *               enumeration or its context cannot be allocated
 * @return a new enumeration (released through its close callback), or NULL
 *         on failure
 */
UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
{
    /* Initialize recognized charsets. */
    setRecognizers(status);

    if (U_FAILURE(status)) {
        return NULL;
    }

    UEnumeration *result = NEW_ARRAY(UEnumeration, 1);
    if (result == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    memcpy(result, &gCSDetEnumeration, sizeof(UEnumeration));

    Context *state = NEW_ARRAY(Context, 1);
    result->context = (void *)state;
    if (state == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        DELETE_ARRAY(result);
        return NULL;
    }

    // Zero the context, then flag it as an "all charsets" enumeration.
    uprv_memset(state, 0, sizeof(Context));
    state->all = TRUE;

    return result;
}
460
461
/**
 * Creates an enumeration over the names of the recognizers that are enabled
 * for this detector.
 *
 * The enumeration's context stores this detector's fEnabledRecognizers
 * pointer itself, not a copy of the flags.
 *
 * @param status in/out error code; nothing is done if it already holds a
 *               failure; set to U_MEMORY_ALLOCATION_ERROR when the
 *               enumeration or its context cannot be allocated
 * @return a new enumeration (released through its close callback), or NULL
 *         on failure
 */
UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    UEnumeration *result = NEW_ARRAY(UEnumeration, 1);
    if (result == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    memcpy(result, &gCSDetEnumeration, sizeof(UEnumeration));

    Context *state = NEW_ARRAY(Context, 1);
    result->context = (void *)state;
    if (state == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        DELETE_ARRAY(result);
        return NULL;
    }

    // Zero the context, then mark it "enabled only" and attach the flags.
    uprv_memset(state, 0, sizeof(Context));
    state->all = FALSE;
    state->enabledRecognizers = fEnabledRecognizers;

    return result;
}
484
485
U_NAMESPACE_END
486
487
#endif