Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/i18n/csrmbcs.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 **********************************************************************
5
 *   Copyright (C) 2005-2016, International Business Machines
6
 *   Corporation and others.  All Rights Reserved.
7
 **********************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_CONVERSION
13
14
#include "cmemory.h"
15
#include "csmatch.h"
16
#include "csrmbcs.h"
17
18
#include <math.h>
19
20
U_NAMESPACE_BEGIN
21
22
0
// Smaller of x and y. This is a function-like macro: the argument that is
// selected appears twice in the expansion and is therefore evaluated twice,
// so do not pass expressions with side effects.
#define min(x,y) (((x)<(y))?(x):(y))
23
24
// Table of 16-bit code values that CharsetRecog_sjis::match() passes to
// match_mbcs() as its "common characters" list. match_mbcs() looks values up
// with binarySearch(), so the entries must remain sorted in ascending order.
static const uint16_t commonChars_sjis [] = {
25
// TODO:  This set of data comes from the character frequency-
26
//        of-occurrence analysis tool.  The data needs to be moved
27
//        into a resource and loaded from there.
28
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
35
// Table of 16-bit code values that CharsetRecog_euc_jp::match() passes to
// match_mbcs() as its "common characters" list. match_mbcs() looks values up
// with binarySearch(), so the entries must remain sorted in ascending order.
static const uint16_t commonChars_euc_jp[] = {
36
// TODO:  This set of data comes from the character frequency-
37
//        of-occurrence analysis tool.  The data needs to be moved
38
//        into a resource and loaded from there.
39
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
50
// Table of 16-bit code values that CharsetRecog_euc_kr::match() passes to
// match_mbcs() as its "common characters" list. match_mbcs() looks values up
// with binarySearch(), so the entries must remain sorted in ascending order.
static const uint16_t commonChars_euc_kr[] = {
51
// TODO:  This set of data comes from the character frequency-
52
//        of-occurrence analysis tool.  The data needs to be moved
53
//        into a resource and loaded from there.
54
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
65
// Table of 16-bit code values that CharsetRecog_big5::match() passes to
// match_mbcs() as its "common characters" list. match_mbcs() looks values up
// with binarySearch(), so the entries must remain sorted in ascending order.
static const uint16_t commonChars_big5[] = {
66
// TODO:  This set of data comes from the character frequency-
67
//        of-occurrence analysis tool.  The data needs to be moved
68
//        into a resource and loaded from there.
69
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
80
// Table of 16-bit code values that CharsetRecog_gb_18030::match() passes to
// match_mbcs() as its "common characters" list. match_mbcs() looks values up
// with binarySearch(), so the entries must remain sorted in ascending order.
static const uint16_t commonChars_gb_18030[] = {
81
// TODO:  This set of data comes from the character frequency-
82
//        of-occurrence analysis tool.  The data needs to be moved
83
//        into a resource and loaded from there.
84
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
95
// Binary search over a table of 16-bit values sorted in ascending order.
//
// array  table to search; only read when len > 0.
// len    number of entries in array; a value <= 0 yields "not found".
// value  the value to look for.
//
// Returns the index of an entry equal to value, or -1 if there is none.
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0, end = len-1;

    while(start <= end) {
        // start + (end - start)/2 yields the same midpoint as (start + end)/2
        // but cannot overflow int32_t when the table is very large.
        int32_t mid = start + (end - start)/2;

        if(array[mid] == value) {
            return mid;
        }

        if(array[mid] < value){
            start = mid+1;
        } else {
            end = mid-1;
        }
    }

    return -1;
}
116
117
// Start with an empty character: value 0, no current position (index -1),
// reading to begin at offset 0, and neither the error nor the done flag set.
IteratedChar::IteratedChar()
    : charValue(0),
      index(-1),
      nextIndex(0),
      error(FALSE),
      done(FALSE)
{
}
122
123
/*void IteratedChar::reset()
124
{
125
    charValue = 0;
126
    index     = -1;
127
    nextIndex = 0;
128
    error     = FALSE;
129
    done      = FALSE;
130
}*/
131
132
// Fetch the byte at nextIndex from det->fRawInput and advance nextIndex.
// When nextIndex has reached det->fRawLength, set the done flag and return -1
// instead.
int32_t IteratedChar::nextByte(InputText *det)
{
    if (nextIndex < det->fRawLength) {
        return det->fRawInput[nextIndex++];
    }

    // No bytes left.
    done = TRUE;
    return -1;
}
142
143
// Destructor; there is nothing to release.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
}
147
148
0
// Score how well the input fits this recognizer's multi-byte encoding.
//
// Iterates over the input with nextChar(), counting all characters, the
// malformed ones, the multi-byte ones (charValue above 0xFF) and those
// multi-byte ones found in commonChars, then converts the counts into a
// confidence value.
//
// det             input being examined; passed through to nextChar().
// commonChars     table of frequently occurring code values searched with
//                 binarySearch(), or 0 if no frequency data exists.
// commonCharsLen  number of entries in commonChars.
//
// Returns a confidence between 0 (no match) and 100.
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    int32_t multiByteCount = 0;
    int32_t commonCount    = 0;
    int32_t badCount       = 0;
    int32_t totalCount     = 0;
    IteratedChar iter;

    while (nextChar(&iter, det)) {
        ++totalCount;

        if (iter.error) {
            ++badCount;
        } else if (iter.charValue > 0xFF) {
            // Single-byte characters only contribute to totalCount.
            ++multiByteCount;

            // NOTE(review): binarySearch() takes a uint16_t, so a wider
            // charValue is narrowed to 16 bits on this call.
            if (commonChars != 0 &&
                binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0) {
                ++commonCount;
            }
        }

        // Give up early once the data clearly does not follow the encoding scheme.
        if (badCount >= 2 && badCount*5 >= multiByteCount) {
            return 0;
        }
    }

    if (multiByteCount <= 10 && badCount == 0) {
        // Few multi-byte characters and nothing malformed.
        if (multiByteCount == 0 && totalCount < 10) {
            // No multi-byte sequences and very little input: not enough data
            // to have any confidence.
            return 0;
        }

        // Looks like ASCII or ISO data. Probably not our encoding, but it is
        // not incompatible with it either, so do not report zero.
        return 10;
    }

    // Too many characters that do not fit the encoding scheme: no match.
    if (multiByteCount < 20*badCount) {
        return 0;
    }

    int32_t confidence;

    if (commonChars == 0) {
        // No frequency statistics: judge only by how many well-formed
        // multi-byte characters were seen (the more the better).
        confidence = 30 + multiByteCount - 20*badCount;
    } else {
        // Frequency statistics exist: scale by the share of common characters.
        double maxVal = log((double)multiByteCount / 4);
        double scaleFactor = 90.0 / maxVal;
        confidence = (int32_t)(log((double)commonCount+1) * scaleFactor + 10.0);
    }

    // Clamp to 0..100.
    if (confidence > 100) {
        confidence = 100;
    }
    if (confidence < 0) {
        confidence = 0;
    }

    return confidence;
}
238
239
// Destructor; there is nothing to release.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
}
243
244
0
// Read the next character from det into *it.
//
// Lead bytes up to 0x7F and lead bytes 0xA1..0xDF are complete one-byte
// characters. Any other lead byte consumes one more byte; the pair is stored
// in it->charValue as (lead << 8) | trail, and it->error is set when the
// trail byte is outside 0x40..0xFE (which includes running out of input).
//
// Returns FALSE when there is no input left, TRUE otherwise.
UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    it->index = it->nextIndex;
    it->error = FALSE;

    const int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        return FALSE;
    }

    if (lead <= 0x7F || (lead > 0xA0 && lead <= 0xDF)) {
        return TRUE;
    }

    const int32_t trail = it->nextByte(det);

    // On end of input (trail < 0) charValue keeps the lead byte only; the
    // range check below then flags the error.
    if (trail >= 0) {
        it->charValue = (lead << 8) | trail;
    }

    if (trail < 0x40 || trail > 0xFE) {
        // Illegal second byte value.
        it->error = TRUE;
    }

    return TRUE;
}
271
272
0
// Run match_mbcs() with the Shift_JIS common-character table, store the
// resulting confidence in *results, and report whether it is above zero.
UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));

    results->set(det, this, score);

    return score > 0;
}
277
278
const char *CharsetRecog_sjis::getName() const
279
0
{
280
0
    return "Shift_JIS";
281
0
}
282
283
const char *CharsetRecog_sjis::getLanguage() const
284
0
{
285
0
    return "ja";
286
0
}
287
288
// Destructor; there is nothing to release.
CharsetRecog_euc::~CharsetRecog_euc()
{
}
292
293
0
// Read the next character from det into *it.
//
// Lead bytes up to 0x8D are complete one-byte characters. Every other lead
// byte consumes a second byte, which is appended to it->charValue when it
// exists. Lead bytes 0xA1..0xFE and 0x8E form two-byte characters whose
// second byte must be at least 0xA1. Lead byte 0x8F consumes a third byte,
// which must also be at least 0xA1. it->error is set when a check fails.
//
// Returns FALSE when there is no input left, TRUE otherwise.
UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    it->index = it->nextIndex;
    it->error = FALSE;

    const int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        // Ran off the end of the input data.
        return FALSE;
    }

    if (lead <= 0x8D) {
        // Single byte char.
        return TRUE;
    }

    const int32_t second = it->nextByte(det);

    // A missing second byte (-1) is caught by the range checks below.
    if (second >= 0) {
        it->charValue = (it->charValue << 8) | second;
    }

    // Two-byte forms: the ordinary 0xA1..0xFE lead bytes, and Code Set 2
    // (0x8E). For 0x8E, EUC-JP uses 2 bytes in total while EUC-TW uses 4; we
    // cannot tell which we have, so treat it like EUC-JP. If the data really
    // was EUC-TW, the following two bytes will look like a well formed
    // 2 byte char.
    if (lead == 0x8E || (lead >= 0xA1 && lead <= 0xFE)) {
        if (second < 0xA1) {
            it->error = TRUE;
        }

        return TRUE;
    }

    if (lead == 0x8F) {
        // Code set 3: three bytes in total.
        const int32_t third = it->nextByte(det);

        it->charValue = (it->charValue << 8) | third;

        if (third < 0xa1) {
            // Bad byte, or ran off the end of the input data.
            it->error = TRUE;
        }
    }

    return TRUE;
}
356
357
// Destructor; there is nothing to release.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
}
361
362
const char *CharsetRecog_euc_jp::getName() const
363
0
{
364
0
    return "EUC-JP";
365
0
}
366
367
const char *CharsetRecog_euc_jp::getLanguage() const
368
0
{
369
0
    return "ja";
370
0
}
371
372
// Run match_mbcs() with the EUC-JP common-character table, store the
// resulting confidence in *results, and report whether it is above zero.
UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));

    results->set(det, this, score);

    return score > 0;
}
378
379
// Destructor; there is nothing to release.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
}
383
384
const char *CharsetRecog_euc_kr::getName() const
385
0
{
386
0
    return "EUC-KR";
387
0
}
388
389
const char *CharsetRecog_euc_kr::getLanguage() const
390
0
{
391
0
    return "ko";
392
0
}
393
394
// Run match_mbcs() with the EUC-KR common-character table, store the
// resulting confidence in *results, and report whether it is above zero.
UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));

    results->set(det, this, score);

    return score > 0;
}
400
401
// Destructor; there is nothing to release.
CharsetRecog_big5::~CharsetRecog_big5()
{
}
405
406
// Read the next character from det into *it.
//
// Lead bytes up to 0x7F, and the byte 0xFF, are complete one-byte characters.
// Any other lead byte consumes a second byte, which is appended to
// it->charValue when it exists. it->error is set when the second byte is
// below 0x40 (which includes running out of input), or equals 0x7F or 0xFF.
//
// Returns FALSE when there is no input left, TRUE otherwise.
UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
{
    it->index = it->nextIndex;
    it->error = FALSE;

    const int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        return FALSE;
    }

    if (lead <= 0x7F || lead == 0xFF) {
        // Single byte character.
        return TRUE;
    }

    const int32_t trail = it->nextByte(det);

    // A missing trail byte (-1) is caught by the "< 0x40" check below.
    if (trail >= 0) {
        it->charValue = (it->charValue << 8) | trail;
    }

    if (trail < 0x40 || trail == 0x7F || trail == 0xFF) {
        it->error = TRUE;
    }

    return TRUE;
}
435
436
const char *CharsetRecog_big5::getName() const
437
0
{
438
0
    return "Big5";
439
0
}
440
441
const char *CharsetRecog_big5::getLanguage() const
442
0
{
443
0
    return "zh";
444
0
}
445
446
// Run match_mbcs() with the Big5 common-character table, store the
// resulting confidence in *results, and report whether it is above zero.
UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));

    results->set(det, this, score);

    return score > 0;
}
452
453
// Destructor; there is nothing to release.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
}
457
458
0
// Read the next character from det into *it.
//
// Lead bytes up to 0x80 are complete one-byte characters. Every other lead
// byte consumes a second byte, which is appended to it->charValue when it
// exists. For lead bytes 0x81..0xFE:
//   - a second byte in 0x40..0x7E or 0x80..0xFE completes a two-byte char;
//   - a second byte in 0x30..0x39 starts a four-byte char, which also needs
//     a third byte in 0x81..0xFE and a fourth byte in 0x30..0x39;
//   - anything else, including running out of input, sets it->error.
//
// Returns FALSE when there is no input left, TRUE otherwise.
UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;
    int32_t fourthByte = 0;

    it->index = it->nextIndex;
    it->error = FALSE;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (firstByte <= 0x80) {
        // single byte char
        return TRUE;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char
        // The lower bound of the second range is hexadecimal 0x80. It used to
        // be written as decimal 80 (0x50), which made 0x7F pass as a valid
        // trail byte.
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
            return TRUE;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
            thirdByte = it->nextByte(det);

            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                fourthByte = it->nextByte(det);

                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                    // charValue already holds the first two bytes.
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;

                    return TRUE;
                }
            }
        }

        // Something wasn't valid, or we ran out of data (-1).
        it->error = TRUE;
    }

    return TRUE;
}
511
512
const char *CharsetRecog_gb_18030::getName() const
513
0
{
514
0
    return "GB18030";
515
0
}
516
517
const char *CharsetRecog_gb_18030::getLanguage() const
518
0
{
519
0
    return "zh";
520
0
}
521
522
// Run match_mbcs() with the GB18030 common-character table, store the
// resulting confidence in *results, and report whether it is above zero.
UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));

    results->set(det, this, score);

    return score > 0;
}
528
529
U_NAMESPACE_END
530
#endif