Coverage Report

Created: 2026-04-29 07:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/i18n/csrmbcs.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 **********************************************************************
5
 *   Copyright (C) 2005-2016, International Business Machines
6
 *   Corporation and others.  All Rights Reserved.
7
 **********************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_CONVERSION
13
14
#include "cmemory.h"
15
#include "csmatch.h"
16
#include "csrmbcs.h"
17
18
#include <math.h>
19
20
U_NAMESPACE_BEGIN
21
22
4.02k
#define min(x,y) (((x)<(y))?(x):(y))
23
24
static const uint16_t commonChars_sjis [] = {
25
// TODO:  This set of data comes from the character frequency-
26
//        of-occurrence analysis tool.  The data needs to be moved
27
//        into a resource and loaded from there.
// NOTE:  Double-byte code values passed to match_mbcs() by CharsetRecog_sjis::match().
//        Entries are in ascending order; binarySearch() depends on that ordering.
28
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
35
static const uint16_t commonChars_euc_jp[] = {
36
// TODO:  This set of data comes from the character frequency-
37
//        of-occurrence analysis tool.  The data needs to be moved
38
//        into a resource and loaded from there.
// NOTE:  Double-byte code values passed to match_mbcs() by CharsetRecog_euc_jp::match().
//        Entries are in ascending order; binarySearch() depends on that ordering.
39
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
50
static const uint16_t commonChars_euc_kr[] = {
51
// TODO:  This set of data comes from the character frequency-
52
//        of-occurrence analysis tool.  The data needs to be moved
53
//        into a resource and loaded from there.
// NOTE:  Double-byte code values passed to match_mbcs() by CharsetRecog_euc_kr::match().
//        Entries are in ascending order; binarySearch() depends on that ordering.
54
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
65
static const uint16_t commonChars_big5[] = {
66
// TODO:  This set of data comes from the character frequency-
67
//        of-occurrence analysis tool.  The data needs to be moved
68
//        into a resource and loaded from there.
// NOTE:  Double-byte code values passed to match_mbcs() by CharsetRecog_big5::match().
//        Entries are in ascending order; binarySearch() depends on that ordering.
69
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
80
static const uint16_t commonChars_gb_18030[] = {
81
// TODO:  This set of data comes from the character frequency-
82
//        of-occurrence analysis tool.  The data needs to be moved
83
//        into a resource and loaded from there.
// NOTE:  Double-byte code values passed to match_mbcs() by CharsetRecog_gb_18030::match().
//        Entries are in ascending order; binarySearch() depends on that ordering.
84
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
95
/*
 * Binary search over a table of 16-bit code values.
 *
 * array  table to search; must be sorted in ascending order.
 * len    number of entries in array.
 * value  code value to look for.
 *
 * Returns the index of an entry equal to value, or -1 when there is none
 * (which includes len <= 0, where the loop body never runs).
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low  = 0;
    int32_t high = len - 1;

    while (low <= high) {
        // low + (high - low)/2 picks the same midpoint as (low + high)/2
        // but cannot overflow int32_t when len is very large.
        const int32_t  mid   = low + (high - low) / 2;
        const uint16_t probe = array[mid];

        if (probe == value) {
            return mid;
        }

        if (probe < value) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    return -1;
}
116
117
// Starts with no character read yet: charValue 0, index -1, nextIndex 0,
// and both the error and done flags cleared.
IteratedChar::IteratedChar()
    : charValue(0),
      index(-1),
      nextIndex(0),
      error(FALSE),
      done(FALSE)
{
}
122
123
/*void IteratedChar::reset()
124
{
125
    charValue = 0;
126
    index     = -1;
127
    nextIndex = 0;
128
    error     = FALSE;
129
    done      = FALSE;
130
}*/
131
132
// Returns the input byte at nextIndex and advances nextIndex by one.
// Once nextIndex has reached det->fRawLength, sets done and returns -1 instead.
int32_t IteratedChar::nextByte(InputText *det)
{
    if (nextIndex < det->fRawLength) {
        return det->fRawInput[nextIndex++];
    }

    done = TRUE;
    return -1;
}
142
143
// Intentionally empty: there is nothing to clean up here.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
}
147
148
194k
/*
 * Scores how well the input matches this recognizer's multi-byte encoding.
 *
 * Walks the input with nextChar(), counting malformed characters, characters
 * whose value is above 0xFF ("double byte" below) and, when a table is given,
 * how many of those appear in commonChars.
 *
 * det             input to examine.
 * commonChars     ascending table of frequently used 16-bit code values,
 *                 or 0 when no frequency data is available.
 * commonCharsLen  number of entries in commonChars.
 *
 * Returns a confidence in the range 0..100.
 */
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    int32_t doubleByteCharCount = 0;
    int32_t commonCharCount     = 0;
    int32_t badCharCount        = 0;
    int32_t totalCharCount      = 0;
    IteratedChar iter;

    while (nextChar(&iter, det)) {
        totalCharCount++;

        if (iter.error) {
            badCharCount++;
        } else if (iter.charValue > 0xFF) {
            doubleByteCharCount++;

            // The table only holds 16-bit values. Three- and four-byte characters
            // produce wider values; look those up only if they fit, so that a
            // truncated value is never mistaken for a common two-byte character.
            if (commonChars != 0 && iter.charValue <= 0xFFFF &&
                    binarySearch(commonChars, commonCharsLen, (uint16_t)iter.charValue) >= 0) {
                commonCharCount++;
            }
        }

        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
            // Bail out early if the byte data is not matching the encoding scheme.
            return 0;
        }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount == 0 && totalCharCount < 10) {
            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
            // We don't have enough data to have any confidence.
            // Statistical analysis of single byte non-ASCII characters would probably help here.
            return 0;
        }

        //   ASCII or ISO file?  It's probably not our encoding,
        //   but is not incompatible with our encoding, so don't give it a zero.
        return 10;
    }

    //
    //  No match if there are too many characters that don't fit the encoding scheme.
    //    (should we have zero tolerance for these?)
    //
    if (doubleByteCharCount < 20*badCharCount) {
        return 0;
    }

    int32_t confidence = 0;

    if (commonChars == 0) {
        // We have no statistics on frequently occurring characters.
        //  Assess confidence purely on having a reasonable number of
        //  multi-byte characters (the more the better)
        confidence = 30 + doubleByteCharCount - 20*badCharCount;
    } else {
        //
        // Frequency of occurrence statistics exist.
        //
        // doubleByteCharCount is at least 11 on this path (see the two early
        // returns above), so maxVal is strictly positive.
        double maxVal = log((double)doubleByteCharCount / 4);
        double scaleFactor = 90.0 / maxVal;
        confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
    }

    // Clamp to the 0..100 confidence range.
    if (confidence > 100) {
        confidence = 100;
    }
    if (confidence < 0) {
        confidence = 0;
    }

    return confidence;
}
238
239
// Intentionally empty: there is nothing to clean up here.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
}
243
244
191M
// Reads the next character from det into *it.
// Returns FALSE once the input is exhausted, TRUE otherwise; a malformed
// two-byte sequence is reported through it->error, not the return value.
UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    it->error = FALSE;
    it->index = it->nextIndex;

    int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        return FALSE;
    }

    // Lead bytes up to 0x7F and in 0xA1..0xDF stand alone as single-byte characters.
    if (lead <= 0x7F) {
        return TRUE;
    }
    if (lead > 0xA0 && lead <= 0xDF) {
        return TRUE;
    }

    int32_t trail = it->nextByte(det);

    if (trail >= 0) {
        it->charValue = (lead << 8) | trail;
    }

    // Only 0x40..0xFE is accepted as a second byte; anything else,
    // including -1 for end of input, marks the character as bad.
    if (trail < 0x40 || trail > 0xFE) {
        it->error = TRUE;
    }

    return TRUE;
}
271
272
38.8k
// Scores det against the Shift_JIS common-character table, stores the score
// in results, and reports whether the score is above zero.
UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));

    results->set(det, this, score);

    return (score > 0);
}
277
278
const char *CharsetRecog_sjis::getName() const
279
38.8k
{
280
38.8k
    return "Shift_JIS";
281
38.8k
}
282
283
const char *CharsetRecog_sjis::getLanguage() const
284
38.8k
{
285
38.8k
    return "ja";
286
38.8k
}
287
288
// Intentionally empty: there is nothing to clean up here.
CharsetRecog_euc::~CharsetRecog_euc()
{
}
292
293
258M
// Reads the next character from det into *it.
// Returns FALSE once the input is exhausted, TRUE otherwise; malformed
// sequences are reported through it->error, not the return value.
UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    it->index = it->nextIndex;
    it->error = FALSE;

    int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (lead <= 0x8D) {
        // single byte char
        return TRUE;
    }

    // Every remaining lead byte consumes a second byte.
    int32_t second = it->nextByte(det);

    if (second >= 0) {
        it->charValue = (it->charValue << 8) | second;
    }
    // else the missing byte is flagged by the range checks below.

    // Two cases share the same validation:
    //  - lead 0xA1..0xFE: an ordinary two byte char.
    //  - lead 0x8E (Code Set 2):
    //      In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
    //      In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
    //    We don't know which we've got, so treat it like EUC-JP.  If the data really
    //    was EUC-TW, the following two bytes will look like a well formed 2 byte char.
    if ((lead >= 0xA1 && lead <= 0xFE) || lead == 0x8E) {
        if (second < 0xA1) {
            it->error = TRUE;
        }

        return TRUE;
    }

    if (lead == 0x8F) {
        // Code set 3.
        // Three byte total char size, two bytes of actual char value.
        int32_t third = it->nextByte(det);

        it->charValue = (it->charValue << 8) | third;

        if (third < 0xa1) {
            // Bad third byte, or ran off the end of the input data.
            it->error = TRUE;
        }
    }

    return TRUE;
}
356
357
// Intentionally empty: there is nothing to clean up here.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
}
361
362
const char *CharsetRecog_euc_jp::getName() const
363
38.8k
{
364
38.8k
    return "EUC-JP";
365
38.8k
}
366
367
const char *CharsetRecog_euc_jp::getLanguage() const
368
38.8k
{
369
38.8k
    return "ja";
370
38.8k
}
371
372
// Scores det against the EUC-JP common-character table, stores the score
// in results, and reports whether the score is above zero.
UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));

    results->set(det, this, score);

    return (score > 0);
}
378
379
// Intentionally empty: there is nothing to clean up here.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
}
383
384
const char *CharsetRecog_euc_kr::getName() const
385
38.8k
{
386
38.8k
    return "EUC-KR";
387
38.8k
}
388
389
const char *CharsetRecog_euc_kr::getLanguage() const
390
38.8k
{
391
38.8k
    return "ko";
392
38.8k
}
393
394
// Scores det against the EUC-KR common-character table, stores the score
// in results, and reports whether the score is above zero.
UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));

    results->set(det, this, score);

    return (score > 0);
}
400
401
// Intentionally empty: there is nothing to clean up here.
CharsetRecog_big5::~CharsetRecog_big5()
{
}
405
406
// Reads the next character from det into *it.
// Returns FALSE once the input is exhausted, TRUE otherwise; a malformed
// two-byte sequence is reported through it->error, not the return value.
UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
{
    it->error = FALSE;
    it->index = it->nextIndex;

    int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        return FALSE;
    }

    if (lead == 0xFF || lead <= 0x7F) {
        // single byte character.
        return TRUE;
    }

    int32_t trail = it->nextByte(det);

    if (trail >= 0)  {
        it->charValue = (it->charValue << 8) | trail;
    }

    // A usable second byte is at least 0x40 and is neither 0x7F nor 0xFF;
    // -1 (end of input) fails the first test and is flagged as well.
    if (!(trail >= 0x40 && trail != 0x7F && trail != 0xFF)) {
        it->error = TRUE;
    }

    return TRUE;
}
435
436
const char *CharsetRecog_big5::getName() const
437
38.8k
{
438
38.8k
    return "Big5";
439
38.8k
}
440
441
const char *CharsetRecog_big5::getLanguage() const
442
38.8k
{
443
38.8k
    return "zh";
444
38.8k
}
445
446
// Scores det against the Big5 common-character table, stores the score
// in results, and reports whether the score is above zero.
UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));

    results->set(det, this, score);

    return (score > 0);
}
452
453
// Intentionally empty: there is nothing to clean up here.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
}
457
458
318M
// Reads the next character from det into *it.
// Returns FALSE once the input is exhausted, TRUE otherwise; malformed
// sequences are reported through it->error, not the return value.
//
// Byte patterns recognized:
//   single byte : first byte <= 0x80
//   two bytes   : 0x81..0xFE followed by 0x40..0x7E or 0x80..0xFE
//   four bytes  : 0x81..0xFE, 0x30..0x39, 0x81..0xFE, 0x30..0x39
UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    it->index = it->nextIndex;
    it->error = FALSE;

    int32_t firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (firstByte <= 0x80) {
        // single byte char
        return TRUE;
    }

    int32_t secondByte = it->nextByte(det);

    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char
        // The upper range starts at 0x80 (hex). It was previously written as
        // decimal 80, which let the invalid trail byte 0x7F through.
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
            return TRUE;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
            int32_t thirdByte = it->nextByte(det);

            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                int32_t fourthByte = it->nextByte(det);

                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;

                    return TRUE;
                }
            }
        }

        // Something wasn't valid, or we ran out of data (-1).
        it->error = TRUE;
    }

    return TRUE;
}
511
512
const char *CharsetRecog_gb_18030::getName() const
513
38.8k
{
514
38.8k
    return "GB18030";
515
38.8k
}
516
517
const char *CharsetRecog_gb_18030::getLanguage() const
518
38.8k
{
519
38.8k
    return "zh";
520
38.8k
}
521
522
// Scores det against the GB18030 common-character table, stores the score
// in results, and reports whether the score is above zero.
UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));

    results->set(det, this, score);

    return (score > 0);
}
528
529
U_NAMESPACE_END
530
#endif