Coverage Report

Created: 2026-03-12 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/i18n/csrmbcs.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 **********************************************************************
5
 *   Copyright (C) 2005-2016, International Business Machines
6
 *   Corporation and others.  All Rights Reserved.
7
 **********************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_CONVERSION
13
14
#include "cmemory.h"
15
#include "csmatch.h"
16
#include "csrmbcs.h"
17
18
#include <math.h>
19
20
U_NAMESPACE_BEGIN
21
22
4.08k
#define min(x,y) (((x)<(y))?(x):(y))
23
24
static const uint16_t commonChars_sjis [] = {
25
// TODO:  This set of data comes from the character frequency-
26
//        of-occurence analysis tool.  The data needs to be moved
27
//        into a resource and loaded from there.
28
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
35
static const uint16_t commonChars_euc_jp[] = {
36
// TODO:  This set of data comes from the character frequency-
37
//        of-occurence analysis tool.  The data needs to be moved
38
//        into a resource and loaded from there.
39
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
50
static const uint16_t commonChars_euc_kr[] = {
51
// TODO:  This set of data comes from the character frequency-
52
//        of-occurence analysis tool.  The data needs to be moved
53
//        into a resource and loaded from there.
54
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
65
static const uint16_t commonChars_big5[] = {
66
// TODO:  This set of data comes from the character frequency-
67
//        of-occurence analysis tool.  The data needs to be moved
68
//        into a resource and loaded from there.
69
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
80
static const uint16_t commonChars_gb_18030[] = {
81
// TODO:  This set of data comes from the character frequency-
82
//        of-occurence analysis tool.  The data needs to be moved
83
//        into a resource and loaded from there.
84
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
95
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
96
155M
{
97
155M
    int32_t start = 0, end = len-1;
98
155M
    int32_t mid = (start+end)/2;
99
100
1.15G
    while(start <= end) {
101
1.00G
        if(array[mid] == value) {
102
289k
            return mid;
103
289k
        }
104
105
1.00G
        if(array[mid] < value){
106
590M
            start = mid+1;
107
590M
        } else {
108
410M
            end = mid-1;
109
410M
        }
110
111
1.00G
        mid = (start+end)/2;
112
1.00G
    }
113
114
154M
    return -1;
115
155M
}
116
117
IteratedChar::IteratedChar() : 
118
156k
charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
119
156k
{
120
    // nothing else to do.
121
156k
}
122
123
/*void IteratedChar::reset()
124
{
125
    charValue = 0;
126
    index     = -1;
127
    nextIndex = 0;
128
    error     = FALSE;
129
    done      = FALSE;
130
}*/
131
132
int32_t IteratedChar::nextByte(InputText *det)
133
811M
{
134
811M
    if (nextIndex >= det->fRawLength) {
135
63.6k
        done = TRUE;
136
137
63.6k
        return -1;
138
63.6k
    }
139
140
811M
    return det->fRawInput[nextIndex++];
141
811M
}
142
143
CharsetRecog_mbcs::~CharsetRecog_mbcs()
144
0
{
145
    // nothing to do.
146
0
}
147
148
156k
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149
156k
    int32_t singleByteCharCount = 0;
150
156k
    int32_t doubleByteCharCount = 0;
151
156k
    int32_t commonCharCount     = 0;
152
156k
    int32_t badCharCount        = 0;
153
156k
    int32_t totalCharCount      = 0;
154
156k
    int32_t confidence          = 0;
155
156k
    IteratedChar iter;
156
157
644M
    while (nextChar(&iter, det)) {
158
644M
        totalCharCount++;
159
160
644M
        if (iter.error) {
161
9.47M
            badCharCount++;
162
635M
        } else {
163
635M
            if (iter.charValue <= 0xFF) {
164
480M
                singleByteCharCount++;
165
480M
            } else {
166
155M
                doubleByteCharCount++;
167
168
155M
                if (commonChars != 0) {
169
155M
                    if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
170
289k
                        commonCharCount += 1;
171
289k
                    }
172
155M
                }
173
155M
            }
174
635M
        }
175
176
177
644M
        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
178
            // Bail out early if the byte data is not matching the encoding scheme.
179
            // break detectBlock;
180
101k
            return confidence;
181
101k
        }
182
644M
    }
183
184
54.8k
    if (doubleByteCharCount <= 10 && badCharCount == 0) {
185
        // Not many multi-byte chars.
186
29.1k
        if (doubleByteCharCount == 0 && totalCharCount < 10) {
187
            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
188
            // We don't have enough data to have any confidence.
189
            // Statistical analysis of single byte non-ASCII charcters would probably help here.
190
15.4k
            confidence = 0;
191
15.4k
        }
192
13.6k
        else {
193
            //   ASCII or ISO file?  It's probably not our encoding,
194
            //   but is not incompatible with our encoding, so don't give it a zero.
195
13.6k
            confidence = 10;
196
13.6k
        }
197
198
29.1k
        return confidence;
199
29.1k
    }
200
201
    //
202
    //  No match if there are too many characters that don't fit the encoding scheme.
203
    //    (should we have zero tolerance for these?)
204
    //
205
25.6k
    if (doubleByteCharCount < 20*badCharCount) {
206
21.5k
        confidence = 0;
207
208
21.5k
        return confidence;
209
21.5k
    }
210
211
4.08k
    if (commonChars == 0) {
212
        // We have no statistics on frequently occuring characters.
213
        //  Assess confidence purely on having a reasonable number of
214
        //  multi-byte characters (the more the better)
215
0
        confidence = 30 + doubleByteCharCount - 20*badCharCount;
216
217
0
        if (confidence > 100) {
218
0
            confidence = 100;
219
0
        }
220
4.08k
    } else {
221
        //
222
        // Frequency of occurence statistics exist.
223
        //
224
225
4.08k
        double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
226
4.08k
        double scaleFactor = 90.0 / maxVal;
227
4.08k
        confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
228
229
4.08k
        confidence = min(confidence, 100);
230
4.08k
    }
231
232
4.08k
    if (confidence < 0) {
233
0
        confidence = 0;
234
0
    }
235
236
4.08k
    return confidence;
237
25.6k
}
238
239
CharsetRecog_sjis::~CharsetRecog_sjis()
240
{
241
    // nothing to do
242
}
243
244
124M
UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
245
124M
    it->index = it->nextIndex;
246
124M
    it->error = FALSE;
247
248
124M
    int32_t firstByte = it->charValue = it->nextByte(det);
249
250
124M
    if (firstByte < 0) {
251
11.5k
        return FALSE;
252
11.5k
    }
253
254
124M
    if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
255
94.4M
        return TRUE;
256
94.4M
    }
257
258
29.8M
    int32_t secondByte = it->nextByte(det);
259
29.8M
    if (secondByte >= 0) {
260
29.8M
        it->charValue = (firstByte << 8) | secondByte;
261
29.8M
    }
262
    // else we'll handle the error later.
263
264
29.8M
    if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
265
        // Illegal second byte value.
266
2.25M
        it->error = TRUE;
267
2.25M
    }
268
269
29.8M
    return TRUE;
270
124M
}
271
272
31.2k
UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
273
31.2k
    int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
274
31.2k
    results->set(det, this, confidence);
275
31.2k
    return (confidence > 0);
276
31.2k
}
277
278
const char *CharsetRecog_sjis::getName() const
279
31.2k
{
280
31.2k
    return "Shift_JIS";
281
31.2k
}
282
283
const char *CharsetRecog_sjis::getLanguage() const
284
31.2k
{
285
31.2k
    return "ja";
286
31.2k
}
287
288
CharsetRecog_euc::~CharsetRecog_euc()
289
{
290
    // nothing to do
291
}
292
293
216M
UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
294
216M
    int32_t firstByte  = 0;
295
216M
    int32_t secondByte = 0;
296
216M
    int32_t thirdByte  = 0;
297
298
216M
    it->index = it->nextIndex;
299
216M
    it->error = FALSE;
300
216M
    firstByte = it->charValue = it->nextByte(det);
301
302
216M
    if (firstByte < 0) {
303
        // Ran off the end of the input data
304
20.2k
        return FALSE;
305
20.2k
    }
306
307
216M
    if (firstByte <= 0x8D) {
308
        // single byte char
309
171M
        return TRUE;
310
171M
    }
311
312
44.6M
    secondByte = it->nextByte(det);
313
44.6M
    if (secondByte >= 0) {
314
44.6M
        it->charValue = (it->charValue << 8) | secondByte;
315
44.6M
    }
316
    // else we'll handle the error later.
317
318
44.6M
    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
319
        // Two byte Char
320
25.0M
        if (secondByte < 0xA1) {
321
2.42M
            it->error = TRUE;
322
2.42M
        }
323
324
25.0M
        return TRUE;
325
25.0M
    }
326
327
19.5M
    if (firstByte == 0x8E) {
328
        // Code Set 2.
329
        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
330
        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
331
        // We don't know which we've got.
332
        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
333
        //   bytes will look like a well formed 2 byte char.
334
21.5k
        if (secondByte < 0xA1) {
335
17.0k
            it->error = TRUE;
336
17.0k
        }
337
338
21.5k
        return TRUE;
339
21.5k
    }
340
341
19.5M
    if (firstByte == 0x8F) {
342
        // Code set 3.
343
        // Three byte total char size, two bytes of actual char value.
344
236k
        thirdByte    = it->nextByte(det);
345
236k
        it->charValue = (it->charValue << 8) | thirdByte;
346
347
236k
        if (thirdByte < 0xa1) {
348
            // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
349
8.65k
            it->error = TRUE;
350
8.65k
        }
351
236k
    }
352
353
19.5M
    return TRUE;
354
355
19.5M
}
356
357
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
358
{
359
    // nothing to do
360
}
361
362
const char *CharsetRecog_euc_jp::getName() const
363
31.2k
{
364
31.2k
    return "EUC-JP";
365
31.2k
}
366
367
const char *CharsetRecog_euc_jp::getLanguage() const
368
31.2k
{
369
31.2k
    return "ja";
370
31.2k
}
371
372
UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
373
31.2k
{
374
31.2k
    int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
375
31.2k
    results->set(det, this, confidence);
376
31.2k
    return (confidence > 0);
377
31.2k
}
378
379
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
380
{
381
    // nothing to do
382
}
383
384
const char *CharsetRecog_euc_kr::getName() const
385
31.2k
{
386
31.2k
    return "EUC-KR";
387
31.2k
}
388
389
const char *CharsetRecog_euc_kr::getLanguage() const
390
31.2k
{
391
31.2k
    return "ko";
392
31.2k
}
393
394
UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
395
31.2k
{
396
31.2k
    int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
397
31.2k
    results->set(det, this, confidence);
398
31.2k
    return (confidence > 0);
399
31.2k
}
400
401
CharsetRecog_big5::~CharsetRecog_big5()
402
{
403
    // nothing to do
404
}
405
406
UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
407
125M
{
408
125M
    int32_t firstByte;
409
410
125M
    it->index = it->nextIndex;
411
125M
    it->error = FALSE;
412
125M
    firstByte = it->charValue = it->nextByte(det);
413
414
125M
    if (firstByte < 0) {
415
10.1k
        return FALSE;
416
10.1k
    }
417
418
125M
    if (firstByte <= 0x7F || firstByte == 0xFF) {
419
        // single byte character.
420
93.3M
        return TRUE;
421
93.3M
    }
422
423
32.2M
    int32_t secondByte = it->nextByte(det);
424
32.2M
    if (secondByte >= 0)  {
425
32.2M
        it->charValue = (it->charValue << 8) | secondByte;
426
32.2M
    }
427
    // else we'll handle the error later.
428
429
32.2M
    if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
430
1.37M
        it->error = TRUE;
431
1.37M
    }
432
433
32.2M
    return TRUE;
434
125M
}
435
436
const char *CharsetRecog_big5::getName() const
437
31.2k
{
438
31.2k
    return "Big5";
439
31.2k
}
440
441
const char *CharsetRecog_big5::getLanguage() const
442
31.2k
{
443
31.2k
    return "zh";
444
31.2k
}
445
446
UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
447
31.2k
{
448
31.2k
    int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
449
31.2k
    results->set(det, this, confidence);
450
31.2k
    return (confidence > 0);
451
31.2k
}
452
453
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
454
{
455
    // nothing to do
456
}
457
458
178M
UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
459
178M
    int32_t firstByte  = 0;
460
178M
    int32_t secondByte = 0;
461
178M
    int32_t thirdByte  = 0;
462
178M
    int32_t fourthByte = 0;
463
464
178M
    it->index = it->nextIndex;
465
178M
    it->error = FALSE;
466
178M
    firstByte = it->charValue = it->nextByte(det);
467
468
178M
    if (firstByte < 0) {
469
        // Ran off the end of the input data
470
12.8k
        return FALSE;
471
12.8k
    }
472
473
178M
    if (firstByte <= 0x80) {
474
        // single byte char
475
120M
        return TRUE;
476
120M
    }
477
478
57.8M
    secondByte = it->nextByte(det);
479
57.8M
    if (secondByte >= 0) {
480
57.8M
        it->charValue = (it->charValue << 8) | secondByte;
481
57.8M
    }
482
    // else we'll handle the error later.
483
484
57.8M
    if (firstByte >= 0x81 && firstByte <= 0xFE) {
485
        // Two byte Char
486
50.3M
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
487
46.8M
            return TRUE;
488
46.8M
        }
489
490
        // Four byte char
491
3.48M
        if (secondByte >= 0x30 && secondByte <= 0x39) {
492
1.05M
            thirdByte = it->nextByte(det);
493
494
1.05M
            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
495
343k
                fourthByte = it->nextByte(det);
496
497
343k
                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
498
91.7k
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
499
500
91.7k
                    return TRUE;
501
91.7k
                }
502
343k
            }
503
1.05M
        }
504
505
        // Something wasn't valid, or we ran out of data (-1).
506
3.38M
        it->error = TRUE;
507
3.38M
    }
508
509
10.9M
    return TRUE;
510
57.8M
}
511
512
const char *CharsetRecog_gb_18030::getName() const
513
31.2k
{
514
31.2k
    return "GB18030";
515
31.2k
}
516
517
const char *CharsetRecog_gb_18030::getLanguage() const
518
31.2k
{
519
31.2k
    return "zh";
520
31.2k
}
521
522
UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
523
31.2k
{
524
31.2k
    int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
525
31.2k
    results->set(det, this, confidence);
526
31.2k
    return (confidence > 0);
527
31.2k
}
528
529
U_NAMESPACE_END
530
#endif