Coverage Report

Created: 2026-06-13 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/i18n/csrmbcs.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 **********************************************************************
5
 *   Copyright (C) 2005-2016, International Business Machines
6
 *   Corporation and others.  All Rights Reserved.
7
 **********************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_CONVERSION
13
14
#include "cmemory.h"
15
#include "csmatch.h"
16
#include "csrmbcs.h"
17
18
#include <math.h>
19
20
U_NAMESPACE_BEGIN
21
22
5.85k
// Smaller of two values. This is a function-like macro: the selected argument
// is evaluated twice, so only pass expressions without side effects.
#define min(x,y) (((x)<(y))?(x):(y))
23
24
// Table of common double-byte code values passed to match_mbcs() by
// CharsetRecog_sjis::match(). match_mbcs() looks entries up with binarySearch(),
// so the values must be kept in ascending order.
static const uint16_t commonChars_sjis [] = {
25
// TODO:  This set of data comes from the character frequency-
26
//        of-occurrence analysis tool.  The data needs to be moved
27
//        into a resource and loaded from there.
28
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
35
// Table of common double-byte code values passed to match_mbcs() by
// CharsetRecog_euc_jp::match(). match_mbcs() looks entries up with binarySearch(),
// so the values must be kept in ascending order.
static const uint16_t commonChars_euc_jp[] = {
36
// TODO:  This set of data comes from the character frequency-
37
//        of-occurrence analysis tool.  The data needs to be moved
38
//        into a resource and loaded from there.
39
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
50
// Table of common double-byte code values passed to match_mbcs() by
// CharsetRecog_euc_kr::match(). match_mbcs() looks entries up with binarySearch(),
// so the values must be kept in ascending order.
static const uint16_t commonChars_euc_kr[] = {
51
// TODO:  This set of data comes from the character frequency-
52
//        of-occurrence analysis tool.  The data needs to be moved
53
//        into a resource and loaded from there.
54
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
65
// Table of common double-byte code values passed to match_mbcs() by
// CharsetRecog_big5::match(). match_mbcs() looks entries up with binarySearch(),
// so the values must be kept in ascending order.
static const uint16_t commonChars_big5[] = {
66
// TODO:  This set of data comes from the character frequency-
67
//        of-occurrence analysis tool.  The data needs to be moved
68
//        into a resource and loaded from there.
69
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
80
// Table of common double-byte code values passed to match_mbcs() by
// CharsetRecog_gb_18030::match(). match_mbcs() looks entries up with binarySearch(),
// so the values must be kept in ascending order.
static const uint16_t commonChars_gb_18030[] = {
81
// TODO:  This set of data comes from the character frequency-
82
//        of-occurrence analysis tool.  The data needs to be moved
83
//        into a resource and loaded from there.
84
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
95
/**
 * Binary search over a table of 16-bit code values.
 *
 * @param array  table to search; must be sorted in ascending order
 * @param len    number of entries in array (a value <= 0 yields -1)
 * @param value  code value to look for
 * @return the index of an entry equal to value, or -1 if there is none
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0, end = len-1;

    while(start <= end) {
        // start + (end-start)/2 gives the same midpoint as (start+end)/2 but
        // cannot overflow int32_t when len is very large.
        int32_t mid = start + (end - start)/2;

        if(array[mid] == value) {
            return mid;
        }

        if(array[mid] < value){
            start = mid+1;
        } else {
            end = mid-1;
        }
    }

    return -1;
}
116
117
// Initial state: character value 0, index -1, reading position 0,
// and both the error and done flags cleared.
IteratedChar::IteratedChar()
    : charValue(0),
      index(-1),
      nextIndex(0),
      error(false),
      done(false)
{
}
122
123
/*void IteratedChar::reset()
124
{
125
    charValue = 0;
126
    index     = -1;
127
    nextIndex = 0;
128
    error     = false;
129
    done      = false;
130
}*/
131
132
/**
 * Fetch the next raw input byte and advance nextIndex past it.
 *
 * @param det  input whose fRawInput / fRawLength are read
 * @return the element of det->fRawInput at the old nextIndex, or -1 (with
 *         done set to true and nextIndex left unchanged) once nextIndex has
 *         reached det->fRawLength
 */
int32_t IteratedChar::nextByte(InputText *det)
{
    if (nextIndex < det->fRawLength) {
        return det->fRawInput[nextIndex++];
    }

    done = true;
    return -1;
}
142
143
// Destructor; the body is intentionally empty.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
144
0
{
145
    // nothing to do.
146
0
}
147
148
3.29M
/**
 * Score how well the input fits this recognizer's multi-byte encoding.
 *
 * Characters are pulled with nextChar(). The function counts all characters,
 * characters flagged as errors, characters whose value exceeds 0xFF
 * ("double-byte"), and double-byte characters found in commonChars.
 *
 * @param det             input text to scan
 * @param commonChars     ascending table of common code values, or nullptr
 *                        when no frequency data is available
 * @param commonCharsLen  number of entries in commonChars
 * @return a confidence value in the range 0..100
 */
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    int32_t total     = 0;
    int32_t multiByte = 0;
    int32_t common    = 0;
    int32_t bad       = 0;
    IteratedChar iter;

    while (nextChar(&iter, det)) {
        ++total;

        if (iter.error) {
            ++bad;
        } else if (iter.charValue > 0xFF) {
            ++multiByte;

            if (commonChars != nullptr &&
                binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0) {
                ++common;
            }
        }

        // Give up early once the data clearly does not follow the encoding scheme.
        if (bad >= 2 && bad*5 >= multiByte) {
            return 0;
        }
    }

    if (multiByte <= 10 && bad == 0) {
        // Few multi-byte characters. With none at all and fewer than 10
        // characters in total there is not enough data for any confidence.
        // Otherwise the text is probably not ours but is not incompatible
        // with our encoding either, so it gets a small non-zero score.
        return (multiByte == 0 && total < 10) ? 0 : 10;
    }

    // Too many characters that do not fit the encoding scheme: no match.
    if (multiByte < 20*bad) {
        return 0;
    }

    int32_t confidence;

    if (commonChars == nullptr) {
        // No frequency statistics: judge purely on the number of
        // multi-byte characters (the more the better).
        confidence = 30 + multiByte - 20*bad;
    } else {
        // Frequency statistics exist: scale by the share of common characters.
        double maxVal = log(static_cast<double>(multiByte) / 4);
        double scaleFactor = 90.0 / maxVal;
        confidence = static_cast<int32_t>(log(static_cast<double>(common) + 1) * scaleFactor + 10.0);
    }

    // Clamp to 0..100.
    if (confidence > 100) {
        confidence = 100;
    }
    if (confidence < 0) {
        confidence = 0;
    }

    return confidence;
}
235
236
// Destructor; the body is intentionally empty.
CharsetRecog_sjis::~CharsetRecog_sjis()
237
{
238
    // nothing to do
239
}
240
241
202M
/**
 * Read the next character from det into it.
 *
 * A first byte <= 0x7F or in 0xA1..0xDF is returned as a single-byte
 * character. Any other first byte consumes a second byte; the pair is
 * flagged as an error unless the second byte lies in 0x40..0xFE.
 *
 * @return false when the input is exhausted, true otherwise
 *         (check it->error for malformed sequences)
 */
UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    it->error = false;
    it->index = it->nextIndex;

    const int32_t lead = it->charValue = it->nextByte(det);
    if (lead < 0) {
        return false;
    }

    const bool singleByte = lead <= 0x7F || (lead > 0xA0 && lead <= 0xDF);
    if (singleByte) {
        return true;
    }

    const int32_t trail = it->nextByte(det);
    if (trail >= 0) {
        it->charValue = (lead << 8) | trail;
    }

    // Legal second bytes are 0x40..0xFE; this also catches -1 (end of input).
    if (trail < 0x40 || trail > 0xFE) {
        it->error = true;
    }

    return true;
}
268
269
659k
/**
 * Score det with match_mbcs() using the commonChars_sjis table and record
 * the outcome in results.
 *
 * @return true when the computed confidence is greater than zero
 */
UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));

    results->set(det, this, score);

    return score > 0;
}
274
275
// Returns the fixed name string for this recognizer: "Shift_JIS".
const char *CharsetRecog_sjis::getName() const
276
659k
{
277
659k
    return "Shift_JIS";
278
659k
}
279
280
// Returns the fixed language string for this recognizer: "ja".
const char *CharsetRecog_sjis::getLanguage() const
281
659k
{
282
659k
    return "ja";
283
659k
}
284
285
// Destructor; the body is intentionally empty.
CharsetRecog_euc::~CharsetRecog_euc()
286
{
287
    // nothing to do
288
}
289
290
412M
/**
 * Read the next EUC character from det into it.
 *
 * A first byte <= 0x8D is a single-byte character. Every other first byte
 * consumes a second byte:
 *   - 0xA1..0xFE: two-byte character; error if the second byte is < 0xA1.
 *   - 0x8E (code set 2): treated as a two-byte character; error if the
 *     second byte is < 0xA1.
 *   - 0x8F (code set 3): consumes a third byte; error if it is < 0xA1.
 *   - anything else: returned as-is with no error flag.
 *
 * Bytes that could not be read (-1 at end of input) are never merged into
 * it->charValue.
 *
 * @return false when the input is exhausted, true otherwise
 *         (check it->error for malformed sequences)
 */
UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;

    it->index = it->nextIndex;
    it->error = false;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return false;
    }

    if (firstByte <= 0x8D) {
        // single byte char
        return true;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
        // Two byte Char
        if (secondByte < 0xA1) {
            it->error = true;
        }

        return true;
    }

    if (firstByte == 0x8E) {
        // Code Set 2.
        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
        // We don't know which we've got.
        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
        //   bytes will look like a well formed 2 byte char.
        if (secondByte < 0xA1) {
            it->error = true;
        }

        return true;
    }

    if (firstByte == 0x8F) {
        // Code set 3.
        // Three byte total char size, two bytes of actual char value.
        thirdByte = it->nextByte(det);

        // Only merge a byte that was actually read; -1 (end of input) would
        // otherwise set every bit of charValue.
        if (thirdByte >= 0) {
            it->charValue = (it->charValue << 8) | thirdByte;
        }

        if (thirdByte < 0xA1) {
            // Bad third byte or ran off the end of the input data.
            it->error = true;
        }
    }

    return true;
}
353
354
// Destructor; the body is intentionally empty.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
355
{
356
    // nothing to do
357
}
358
359
// Returns the fixed name string for this recognizer: "EUC-JP".
const char *CharsetRecog_euc_jp::getName() const
360
659k
{
361
659k
    return "EUC-JP";
362
659k
}
363
364
// Returns the fixed language string for this recognizer: "ja".
const char *CharsetRecog_euc_jp::getLanguage() const
365
659k
{
366
659k
    return "ja";
367
659k
}
368
369
/**
 * Score det with match_mbcs() using the commonChars_euc_jp table and record
 * the outcome in results.
 *
 * @return true when the computed confidence is greater than zero
 */
UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));

    results->set(det, this, score);

    return score > 0;
}
375
376
// Destructor; the body is intentionally empty.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
377
{
378
    // nothing to do
379
}
380
381
// Returns the fixed name string for this recognizer: "EUC-KR".
const char *CharsetRecog_euc_kr::getName() const
381
659k
{
382
659k
    return "EUC-KR";
383
659k
}
385
386
// Returns the fixed language string for this recognizer: "ko".
const char *CharsetRecog_euc_kr::getLanguage() const
386
659k
{
387
659k
    return "ko";
388
659k
}
390
391
/**
 * Score det with match_mbcs() using the commonChars_euc_kr table and record
 * the outcome in results.
 *
 * @return true when the computed confidence is greater than zero
 */
UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));

    results->set(det, this, score);

    return score > 0;
}
397
398
// Destructor; the body is intentionally empty.
CharsetRecog_big5::~CharsetRecog_big5()
399
{
400
    // nothing to do
401
}
402
403
/**
 * Read the next character from det into it.
 *
 * A first byte <= 0x7F or equal to 0xFF is a single-byte character. Any
 * other first byte consumes a second byte; the pair is flagged as an error
 * when the second byte is < 0x40 (including -1 at end of input), 0x7F or 0xFF.
 *
 * @return false when the input is exhausted, true otherwise
 *         (check it->error for malformed sequences)
 */
UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
{
    it->index = it->nextIndex;
    it->error = false;

    const int32_t lead = it->charValue = it->nextByte(det);
    if (lead < 0) {
        return false;
    }

    const bool singleByte = (lead <= 0x7F) || (lead == 0xFF);
    if (singleByte) {
        return true;
    }

    const int32_t trail = it->nextByte(det);
    if (trail >= 0) {
        it->charValue = (it->charValue << 8) | trail;
    }

    const bool trailOk = trail >= 0x40 && trail != 0x7F && trail != 0xFF;
    if (!trailOk) {
        it->error = true;
    }

    return true;
}
432
433
// Returns the fixed name string for this recognizer: "Big5".
const char *CharsetRecog_big5::getName() const
434
659k
{
435
659k
    return "Big5";
436
659k
}
437
438
// Returns the fixed language string for this recognizer: "zh".
const char *CharsetRecog_big5::getLanguage() const
439
659k
{
440
659k
    return "zh";
441
659k
}
442
443
/**
 * Score det with match_mbcs() using the commonChars_big5 table and record
 * the outcome in results.
 *
 * @return true when the computed confidence is greater than zero
 */
UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));

    results->set(det, this, score);

    return score > 0;
}
449
450
// Destructor; the body is intentionally empty.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
451
{
452
    // nothing to do
453
}
454
455
347M
/**
 * Read the next GB18030 character from det into it.
 *
 * A first byte <= 0x80 is a single-byte character. Every other first byte
 * consumes a second byte. For a first byte in 0x81..0xFE:
 *   - second byte in 0x40..0x7E or 0x80..0xFE: valid two-byte character;
 *   - second byte in 0x30..0x39, third in 0x81..0xFE and fourth in
 *     0x30..0x39: valid four-byte character;
 *   - anything else (including running out of input): it->error is set.
 * A first byte of 0xFF is returned with its second byte and no error flag.
 *
 * @return false when the input is exhausted, true otherwise
 *         (check it->error for malformed sequences)
 */
UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;
    int32_t fourthByte = 0;

    it->index = it->nextIndex;
    it->error = false;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return false;
    }

    if (firstByte <= 0x80) {
        // single byte char
        return true;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char.
        // The upper range starts at 0x80 (hex). The previous decimal 80 (0x50)
        // made the two ranges overlap and let the invalid trail byte 0x7F through.
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
            return true;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
            thirdByte = it->nextByte(det);

            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                fourthByte = it->nextByte(det);

                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;

                    return true;
                }
            }
        }

        // Something wasn't valid, or we ran out of data (-1).
        it->error = true;
    }

    return true;
}
508
509
// Returns the fixed name string for this recognizer: "GB18030".
const char *CharsetRecog_gb_18030::getName() const
510
659k
{
511
659k
    return "GB18030";
512
659k
}
513
514
// Returns the fixed language string for this recognizer: "zh".
const char *CharsetRecog_gb_18030::getLanguage() const
515
659k
{
516
659k
    return "zh";
517
659k
}
518
519
/**
 * Score det with match_mbcs() using the commonChars_gb_18030 table and
 * record the outcome in results.
 *
 * @return true when the computed confidence is greater than zero
 */
UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));

    results->set(det, this, score);

    return score > 0;
}
525
526
U_NAMESPACE_END
527
#endif