/src/icu/source/i18n/csrmbcs.cpp

Source
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "cmemory.h"
#include "csmatch.h"
#include "csrmbcs.h"

#include <math.h>

U_NAMESPACE_BEGIN

// Function-like macro yielding the smaller of x and y.
// Because it is a macro, an argument may be evaluated more than once,
// so callers should only pass expressions without side effects.
#define min(x,y) (((x)<(y))?(x):(y))

// Two-byte code values passed to match_mbcs() by CharsetRecog_sjis::match()
// as its table of frequently occurring characters.
// The entries are in ascending order; binarySearch() relies on that.
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};

// Two-byte code values passed to match_mbcs() by CharsetRecog_euc_jp::match()
// as its table of frequently occurring characters.
// The entries are in ascending order; binarySearch() relies on that.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};

// Two-byte code values passed to match_mbcs() by CharsetRecog_euc_kr::match()
// as its table of frequently occurring characters.
// The entries are in ascending order; binarySearch() relies on that.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

// Two-byte code values passed to match_mbcs() by CharsetRecog_big5::match()
// as its table of frequently occurring characters.
// The entries are in ascending order; binarySearch() relies on that.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

// Two-byte code values passed to match_mbcs() by CharsetRecog_gb_18030::match()
// as its table of frequently occurring characters.
// The entries are in ascending order; binarySearch() relies on that.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};

// Look up `value` in the first `len` entries of `array`, which must be
// sorted in ascending order.
//
// Returns the index of a matching entry, or -1 when there is none
// (including when len is zero or negative).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    // Invariant: if value is present, it lies within [lo, hi].
    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            lo = probe + 1;     // discard the lower half
        } else {
            hi = probe - 1;     // discard the upper half
        }
    }

    return -1;
}

// Start in the "nothing read yet" state: no character value, no current
// index (-1), reading position at offset 0, and both flags cleared.
IteratedChar::IteratedChar()
    : charValue(0),
      index(-1),
      nextIndex(0),
      error(FALSE),
      done(FALSE)
{
}

/*void IteratedChar::reset()
{
    charValue = 0;
    index     = -1;
    nextIndex = 0;
    error     = FALSE;
    done      = FALSE;
}*/

// Fetch the next raw input byte from det and advance nextIndex past it.
//
// When nextIndex has reached det->fRawLength, nothing is consumed: the
// `done` flag is set and -1 is returned instead.
int32_t IteratedChar::nextByte(InputText *det)
{
    if (nextIndex < det->fRawLength) {
        return det->fRawInput[nextIndex++];
    }

    // Ran off the end of the input.
    done = TRUE;
    return -1;
}

// Destructor. The body is empty: no cleanup is performed here.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
    // nothing to do.
}

// Score how well the input in `det` fits this recognizer's encoding.
//
// The input is walked one character at a time with nextChar().  Each
// character is classified as malformed (iter.error set), single-byte
// (value <= 0xFF) or multi-byte, and multi-byte values are looked up in
// `commonChars` when a table is supplied.
//
//   det            - input to scan.
//   commonChars    - ascending table of frequently used two-byte code
//                    values, or 0 when no statistics are available.
//   commonCharsLen - number of entries in commonChars.
//
// Returns a confidence in the range 0..100.
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    int32_t doubleByteCharCount = 0;
    int32_t commonCharCount     = 0;
    int32_t badCharCount        = 0;
    int32_t totalCharCount      = 0;
    int32_t confidence          = 0;
    IteratedChar iter;

    while (nextChar(&iter, det)) {
        totalCharCount++;

        if (iter.error) {
            badCharCount++;
        } else if (iter.charValue > 0xFF) {
            doubleByteCharCount++;

            // The table only holds 16-bit values.  Recognizers in this file
            // can produce three- and four-byte character values; passing
            // those to binarySearch() would silently truncate them to their
            // low 16 bits and could report a bogus hit, so only values that
            // fit in 16 bits are looked up.
            if (commonChars != 0 && iter.charValue <= 0xFFFF &&
                binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0) {
                commonCharCount++;
            }
        }

        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
            // Bail out early if the byte data is not matching the encoding scheme.
            return 0;
        }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount == 0 && totalCharCount < 10) {
            // There weren't any multi-byte sequences and there were fewer than
            // ten characters in total.  We don't have enough data to have any
            // confidence.  Statistical analysis of single byte non-ASCII
            // characters would probably help here.
            return 0;
        }

        //   ASCII or ISO file?  It's probably not our encoding,
        //   but is not incompatible with our encoding, so don't give it a zero.
        return 10;
    }

    //
    //  No match if there are too many characters that don't fit the encoding scheme.
    //    (should we have zero tolerance for these?)
    //
    if (doubleByteCharCount < 20*badCharCount) {
        return 0;
    }

    if (commonChars == 0) {
        // We have no statistics on frequently occurring characters.
        //  Assess confidence purely on having a reasonable number of
        //  multi-byte characters (the more the better)
        confidence = 30 + doubleByteCharCount - 20*badCharCount;
    } else {
        //
        // Frequency of occurrence statistics exist.
        //
        // doubleByteCharCount is at least 11 here (the two early exits above
        // rule out anything smaller), so maxVal is strictly positive.
        double maxVal = log(static_cast<double>(doubleByteCharCount) / 4);
        double scaleFactor = 90.0 / maxVal;
        confidence = static_cast<int32_t>(log(static_cast<double>(commonCharCount) + 1) * scaleFactor + 10.0);
    }

    // Clamp to the 0..100 range.
    if (confidence > 100) {
        confidence = 100;
    }
    if (confidence < 0) {
        confidence = 0;
    }

    return confidence;
}

// Destructor. The body is empty: no cleanup is performed here.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
    // nothing to do
}

// Read the next character from det into *it.
//
// Returns FALSE once the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the one- or two-byte value that was read and
// it->error is set when the trailing byte is missing or out of range.
UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    it->error = FALSE;
    it->index = it->nextIndex;

    const int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        // No more input.
        return FALSE;
    }

    // Bytes up to 0x7F and bytes in 0xA1..0xDF stand alone.
    if (lead <= 0x7F) {
        return TRUE;
    }
    if (lead > 0xA0 && lead <= 0xDF) {
        return TRUE;
    }

    // Everything else is treated as the lead byte of a two-byte character.
    const int32_t trail = it->nextByte(det);
    if (trail >= 0) {
        it->charValue = (lead << 8) | trail;
    }

    // Accepted trail bytes are 0x40..0xFE; a missing trail byte (-1) is
    // rejected by the same test.
    if (trail < 0x40 || trail > 0xFE) {
        it->error = TRUE;
    }

    return TRUE;
}

// Score det with match_mbcs() against the Shift_JIS common-character table,
// record the confidence in *results, and report whether it is non-zero.
UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    const int32_t tableLength = UPRV_LENGTHOF(commonChars_sjis);
    const int32_t score = match_mbcs(det, commonChars_sjis, tableLength);

    results->set(det, this, score);

    return score > 0;
}

// Returns the fixed charset name string for this recognizer, "Shift_JIS".
const char *CharsetRecog_sjis::getName() const
{
    return "Shift_JIS";
}

// Returns the fixed language string for this recognizer, "ja".
const char *CharsetRecog_sjis::getLanguage() const
{
    return "ja";
}

// Destructor. The body is empty: no cleanup is performed here.
CharsetRecog_euc::~CharsetRecog_euc()
{
    // nothing to do
}

// Read the next character from det into *it, using EUC byte layout rules.
//
// Returns FALSE once the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the bytes that were read (most significant first)
// and it->error is set when a required trailing byte is missing or below
// 0xA1.
//
// First bytes that match none of the cases below (0x90..0xA0 and 0xFF)
// consume one following byte and are returned without an error flag.
UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;

    it->index = it->nextIndex;
    it->error = FALSE;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (firstByte <= 0x8D) {
        // single byte char
        return TRUE;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
        // Two byte Char
        if (secondByte < 0xA1) {
            // Bad or missing (-1) second byte.
            it->error = TRUE;
        }

        return TRUE;
    }

    if (firstByte == 0x8E) {
        // Code Set 2.
        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
        // We don't know which we've got.
        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
        //   bytes will look like a well formed 2 byte char.
        if (secondByte < 0xA1) {
            it->error = TRUE;
        }

        return TRUE;
    }

    if (firstByte == 0x8F) {
        // Code set 3.
        // Three byte total char size, two bytes of actual char value.
        thirdByte = it->nextByte(det);

        // Only fold in a real byte; the end-of-input marker (-1) must not be
        // OR-ed into the character value.  This mirrors the second-byte
        // handling above.
        if (thirdByte >= 0) {
            it->charValue = (it->charValue << 8) | thirdByte;
        }

        if (thirdByte < 0xa1) {
            // Bad third byte or ran off the end of the input data.
            it->error = TRUE;
        }
    }

    return TRUE;

}

// Destructor. The body is empty: no cleanup is performed here.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
    // nothing to do
}

// Returns the fixed charset name string for this recognizer, "EUC-JP".
const char *CharsetRecog_euc_jp::getName() const
{
    return "EUC-JP";
}

// Returns the fixed language string for this recognizer, "ja".
const char *CharsetRecog_euc_jp::getLanguage() const
{
    return "ja";
}

// Score det with match_mbcs() against the EUC-JP common-character table,
// record the confidence in *results, and report whether it is non-zero.
UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
{
    const int32_t tableLength = UPRV_LENGTHOF(commonChars_euc_jp);
    const int32_t score = match_mbcs(det, commonChars_euc_jp, tableLength);

    results->set(det, this, score);

    return score > 0;
}

// Destructor. The body is empty: no cleanup is performed here.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
    // nothing to do
}

// Returns the fixed charset name string for this recognizer, "EUC-KR".
const char *CharsetRecog_euc_kr::getName() const
{
    return "EUC-KR";
}

// Returns the fixed language string for this recognizer, "ko".
const char *CharsetRecog_euc_kr::getLanguage() const
{
    return "ko";
}

// Score det with match_mbcs() against the EUC-KR common-character table,
// record the confidence in *results, and report whether it is non-zero.
UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
{
    const int32_t tableLength = UPRV_LENGTHOF(commonChars_euc_kr);
    const int32_t score = match_mbcs(det, commonChars_euc_kr, tableLength);

    results->set(det, this, score);

    return score > 0;
}

// Destructor. The body is empty: no cleanup is performed here.
CharsetRecog_big5::~CharsetRecog_big5()
{
    // nothing to do
}

// Read the next character from det into *it.
//
// Returns FALSE once the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the one- or two-byte value that was read and
// it->error is set when the trailing byte is missing or not acceptable.
UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
{
    it->error = FALSE;
    it->index = it->nextIndex;

    const int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        // No more input.
        return FALSE;
    }

    // Bytes up to 0x7F, and 0xFF, stand alone.
    const bool standsAlone = (lead <= 0x7F) || (lead == 0xFF);
    if (standsAlone) {
        return TRUE;
    }

    // Everything else is treated as the lead byte of a two-byte character.
    const int32_t trail = it->nextByte(det);
    if (trail >= 0) {
        it->charValue = (it->charValue << 8) | trail;
    }

    // A trail byte must be at least 0x40 and may be neither 0x7F nor 0xFF;
    // a missing trail byte (-1) fails the first condition.
    const bool trailAccepted = (trail >= 0x40) && (trail != 0x7F) && (trail != 0xFF);
    if (!trailAccepted) {
        it->error = TRUE;
    }

    return TRUE;
}

// Returns the fixed charset name string for this recognizer, "Big5".
const char *CharsetRecog_big5::getName() const
{
    return "Big5";
}

// Returns the fixed language string for this recognizer, "zh".
const char *CharsetRecog_big5::getLanguage() const
{
    return "zh";
}

// Score det with match_mbcs() against the Big5 common-character table,
// record the confidence in *results, and report whether it is non-zero.
UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
{
    const int32_t tableLength = UPRV_LENGTHOF(commonChars_big5);
    const int32_t score = match_mbcs(det, commonChars_big5, tableLength);

    results->set(det, this, score);

    return score > 0;
}

// Destructor. The body is empty: no cleanup is performed here.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
    // nothing to do
}

// Read the next character from det into *it, using GB18030 byte layout
// rules: one byte (<= 0x80), two bytes, or four bytes.
//
// Returns FALSE once the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the bytes that were read (most significant first)
// and it->error is set when the bytes following a 0x81..0xFE lead byte do
// not form a valid two- or four-byte sequence.
//
// A first byte of 0xFF matches none of the cases: it consumes one
// following byte and is returned without an error flag.
UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;
    int32_t fourthByte = 0;

    it->index = it->nextIndex;
    it->error = FALSE;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (firstByte <= 0x80) {
        // single byte char
        return TRUE;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char: trail byte in 0x40..0x7E or 0x80..0xFE.
        // (The upper range previously started at decimal 80, which let the
        // invalid trail byte 0x7F through.)
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
            return TRUE;
        }

        // Four byte char: digit-range second byte, 0x81..0xFE third byte,
        // digit-range fourth byte.
        if (secondByte >= 0x30 && secondByte <= 0x39) {
            thirdByte = it->nextByte(det);

            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                fourthByte = it->nextByte(det);

                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                    // charValue already holds the first two bytes; append the last two.
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;

                    return TRUE;
                }
            }
        }

        // Something wasn't valid, or we ran out of data (-1).
        it->error = TRUE;
    }

    return TRUE;
}

// Returns the fixed charset name string for this recognizer, "GB18030".
const char *CharsetRecog_gb_18030::getName() const
{
    return "GB18030";
}

// Returns the fixed language string for this recognizer, "zh".
const char *CharsetRecog_gb_18030::getLanguage() const
{
    return "zh";
}

// Score det with match_mbcs() against the GB18030 common-character table,
// record the confidence in *results, and report whether it is non-zero.
UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
{
    const int32_t tableLength = UPRV_LENGTHOF(commonChars_gb_18030);
    const int32_t score = match_mbcs(det, commonChars_gb_18030, tableLength);

    results->set(det, this, score);

    return score > 0;
}

U_NAMESPACE_END
#endif

Coverage Report

Created: 2026-04-29 07:28

Line	Count	Source
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		**********************************************************************
5		* Copyright (C) 2005-2016, International Business Machines
6		* Corporation and others. All Rights Reserved.
7		**********************************************************************
8		*/
9
10		#include "unicode/utypes.h"
11
12		#if !UCONFIG_NO_CONVERSION
13
14		#include "cmemory.h"
15		#include "csmatch.h"
16		#include "csrmbcs.h"
17
18		#include <math.h>
19
20		U_NAMESPACE_BEGIN
21
22	4.02k	#define min(x,y) (((x)<(y))?(x):(y))
23
24		static const uint16_t commonChars_sjis [] = {
25		// TODO: This set of data comes from the character frequency-
26		// of-occurence analysis tool. The data needs to be moved
27		// into a resource and loaded from there.
28		0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29		0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30		0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31		0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32		0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33		0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
35		static const uint16_t commonChars_euc_jp[] = {
36		// TODO: This set of data comes from the character frequency-
37		// of-occurence analysis tool. The data needs to be moved
38		// into a resource and loaded from there.
39		0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40		0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41		0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42		0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43		0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44		0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45		0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46		0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47		0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48		0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
50		static const uint16_t commonChars_euc_kr[] = {
51		// TODO: This set of data comes from the character frequency-
52		// of-occurence analysis tool. The data needs to be moved
53		// into a resource and loaded from there.
54		0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55		0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56		0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57		0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58		0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59		0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60		0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61		0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62		0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63		0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
65		static const uint16_t commonChars_big5[] = {
66		// TODO: This set of data comes from the character frequency-
67		// of-occurence analysis tool. The data needs to be moved
68		// into a resource and loaded from there.
69		0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70		0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71		0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72		0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73		0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74		0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75		0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76		0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77		0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78		0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
80		static const uint16_t commonChars_gb_18030[] = {
81		// TODO: This set of data comes from the character frequency-
82		// of-occurence analysis tool. The data needs to be moved
83		// into a resource and loaded from there.
84		0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85		0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86		0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87		0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88		0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89		0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90		0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91		0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92		0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93		0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
95		static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
96	221M	{
97	221M	int32_t start = 0, end = len-1;
98	221M	int32_t mid = (start+end)/2;
99
100	1.66G	while(start <= end) {
101	1.44G	if(array[mid] == value) {
102	117k	return mid;
103	117k	}
104
105	1.44G	if(array[mid] < value){
106	928M	start = mid+1;
107	928M	} else {
108	513M	end = mid-1;
109	513M	}
110
111	1.44G	mid = (start+end)/2;
112	1.44G	}
113
114	221M	return -1;
115	221M	}
116
117		IteratedChar::IteratedChar() :
118	194k	charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
119	194k	{
120		// nothing else to do.
121	194k	}
122
123		/*void IteratedChar::reset()
124		{
125		charValue = 0;
126		index = -1;
127		nextIndex = 0;
128		error = FALSE;
129		done = FALSE;
130		}*/
131
132		int32_t IteratedChar::nextByte(InputText *det)
133	1.19G	{
134	1.19G	if (nextIndex >= det->fRawLength) {
135	102k	done = TRUE;
136
137	102k	return -1;
138	102k	}
139
140	1.19G	return det->fRawInput[nextIndex++];
141	1.19G	}
142
143		CharsetRecog_mbcs::~CharsetRecog_mbcs()
144	0	{
145		// nothing to do.
146	0	}
147
148	194k	int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149	194k	int32_t singleByteCharCount = 0;
150	194k	int32_t doubleByteCharCount = 0;
151	194k	int32_t commonCharCount = 0;
152	194k	int32_t badCharCount = 0;
153	194k	int32_t totalCharCount = 0;
154	194k	int32_t confidence = 0;
155	194k	IteratedChar iter;
156
157	949M	while (nextChar(&iter, det)) {
158	949M	totalCharCount++;
159
160	949M	if (iter.error) {
161	17.9M	badCharCount++;
162	931M	} else {
163	931M	if (iter.charValue <= 0xFF) {
164	709M	singleByteCharCount++;
165	709M	} else {
166	221M	doubleByteCharCount++;
167
168	221M	if (commonChars != 0) {
169	221M	if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
170	117k	commonCharCount += 1;
171	117k	}
172	221M	}
173	221M	}
174	931M	}
175
176
177	949M	if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
178		// Bail out early if the byte data is not matching the encoding scheme.
179		// break detectBlock;
180	103k	return confidence;
181	103k	}
182	949M	}
183
184	90.7k	if (doubleByteCharCount <= 10 && badCharCount == 0) {
185		// Not many multi-byte chars.
186	59.7k	if (doubleByteCharCount == 0 && totalCharCount < 10) {
187		// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
188		// We don't have enough data to have any confidence.
189		// Statistical analysis of single byte non-ASCII charcters would probably help here.
190	36.4k	confidence = 0;
191	36.4k	}
192	23.3k	else {
193		// ASCII or ISO file? It's probably not our encoding,
194		// but is not incompatible with our encoding, so don't give it a zero.
195	23.3k	confidence = 10;
196	23.3k	}
197
198	59.7k	return confidence;
199	59.7k	}
200
201		//
202		// No match if there are too many characters that don't fit the encoding scheme.
203		// (should we have zero tolerance for these?)
204		//
205	30.9k	if (doubleByteCharCount < 20*badCharCount) {
206	26.9k	confidence = 0;
207
208	26.9k	return confidence;
209	26.9k	}
210
211	4.02k	if (commonChars == 0) {
212		// We have no statistics on frequently occuring characters.
213		// Assess confidence purely on having a reasonable number of
214		// multi-byte characters (the more the better)
215	0	confidence = 30 + doubleByteCharCount - 20*badCharCount;
216
217	0	if (confidence > 100) {
218	0	confidence = 100;
219	0	}
220	4.02k	} else {
221		//
222		// Frequency of occurence statistics exist.
223		//
224
225	4.02k	double maxVal = log((double)doubleByteCharCount / 4); /(float)?/
226	4.02k	double scaleFactor = 90.0 / maxVal;
227	4.02k	confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
228
229	4.02k	confidence = min(confidence, 100);
230	4.02k	}
231
232	4.02k	if (confidence < 0) {
233	0	confidence = 0;
234	0	}
235
236	4.02k	return confidence;
237	30.9k	}
238
239		CharsetRecog_sjis::~CharsetRecog_sjis()
240		{
241		// nothing to do
242		}
243
244	191M	UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
245	191M	it->index = it->nextIndex;
246	191M	it->error = FALSE;
247
248	191M	int32_t firstByte = it->charValue = it->nextByte(det);
249
250	191M	if (firstByte < 0) {
251	18.2k	return FALSE;
252	18.2k	}
253
254	191M	if (firstByte <= 0x7F \|\| (firstByte > 0xA0 && firstByte <= 0xDF)) {
255	150M	return TRUE;
256	150M	}
257
258	40.3M	int32_t secondByte = it->nextByte(det);
259	40.3M	if (secondByte >= 0) {
260	40.3M	it->charValue = (firstByte << 8) \| secondByte;
261	40.3M	}
262		// else we'll handle the error later.
263
264	40.3M	if (! ((secondByte >= 0x40 && secondByte <= 0x7F) \|\| (secondByte >= 0x80 && secondByte <= 0xFE))) {
265		// Illegal second byte value.
266	4.02M	it->error = TRUE;
267	4.02M	}
268
269	40.3M	return TRUE;
270	191M	}
271
272	38.8k	UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
273	38.8k	int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
274	38.8k	results->set(det, this, confidence);
275	38.8k	return (confidence > 0);
276	38.8k	}
277
278		const char *CharsetRecog_sjis::getName() const
279	38.8k	{
280	38.8k	return "Shift_JIS";
281	38.8k	}
282
283		const char *CharsetRecog_sjis::getLanguage() const
284	38.8k	{
285	38.8k	return "ja";
286	38.8k	}
287
288		CharsetRecog_euc::~CharsetRecog_euc()
289		{
290		// nothing to do
291		}
292
293	258M	UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
294	258M	int32_t firstByte = 0;
295	258M	int32_t secondByte = 0;
296	258M	int32_t thirdByte = 0;
297
298	258M	it->index = it->nextIndex;
299	258M	it->error = FALSE;
300	258M	firstByte = it->charValue = it->nextByte(det);
301
302	258M	if (firstByte < 0) {
303		// Ran off the end of the input data
304	35.8k	return FALSE;
305	35.8k	}
306
307	258M	if (firstByte <= 0x8D) {
308		// single byte char
309	193M	return TRUE;
310	193M	}
311
312	64.4M	secondByte = it->nextByte(det);
313	64.4M	if (secondByte >= 0) {
314	64.4M	it->charValue = (it->charValue << 8) \| secondByte;
315	64.4M	}
316		// else we'll handle the error later.
317
318	64.4M	if (firstByte >= 0xA1 && firstByte <= 0xFE) {
319		// Two byte Char
320	35.3M	if (secondByte < 0xA1) {
321	3.82M	it->error = TRUE;
322	3.82M	}
323
324	35.3M	return TRUE;
325	35.3M	}
326
327	29.1M	if (firstByte == 0x8E) {
328		// Code Set 2.
329		// In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
330		// In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
331		// We don't know which we've got.
332		// Treat it like EUC-JP. If the data really was EUC-TW, the following two
333		// bytes will look like a well formed 2 byte char.
334	20.4k	if (secondByte < 0xA1) {
335	16.6k	it->error = TRUE;
336	16.6k	}
337
338	20.4k	return TRUE;
339	20.4k	}
340
341	29.1M	if (firstByte == 0x8F) {
342		// Code set 3.
343		// Three byte total char size, two bytes of actual char value.
344	468k	thirdByte = it->nextByte(det);
345	468k	it->charValue = (it->charValue << 8) \| thirdByte;
346
347	468k	if (thirdByte < 0xa1) {
348		// Bad second byte or ran off the end of the input data with a non-ASCII first byte.
349	7.42k	it->error = TRUE;
350	7.42k	}
351	468k	}
352
353	29.1M	return TRUE;
354
355	29.1M	}
356
357		CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
358		{
359		// nothing to do
360		}
361
362		const char *CharsetRecog_euc_jp::getName() const
363	38.8k	{
364	38.8k	return "EUC-JP";
365	38.8k	}
366
367		const char *CharsetRecog_euc_jp::getLanguage() const
368	38.8k	{
369	38.8k	return "ja";
370	38.8k	}
371
372		UBool CharsetRecog_euc_jp::match(InputText det, CharsetMatch results) const
373	38.8k	{
374	38.8k	int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
375	38.8k	results->set(det, this, confidence);
376	38.8k	return (confidence > 0);
377	38.8k	}
378
379		CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
380		{
381		// nothing to do
382		}
383
384		const char *CharsetRecog_euc_kr::getName() const
385	38.8k	{
386	38.8k	return "EUC-KR";
387	38.8k	}
388
389		const char *CharsetRecog_euc_kr::getLanguage() const
390	38.8k	{
391	38.8k	return "ko";
392	38.8k	}
393
394		UBool CharsetRecog_euc_kr::match(InputText det, CharsetMatch results) const
395	38.8k	{
396	38.8k	int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
397	38.8k	results->set(det, this, confidence);
398	38.8k	return (confidence > 0);
399	38.8k	}
400
401		CharsetRecog_big5::~CharsetRecog_big5()
402		{
403		// nothing to do
404		}
405
406		UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
407	181M	{
408	181M	int32_t firstByte;
409
410	181M	it->index = it->nextIndex;
411	181M	it->error = FALSE;
412	181M	firstByte = it->charValue = it->nextByte(det);
413
414	181M	if (firstByte < 0) {
415	17.6k	return FALSE;
416	17.6k	}
417
418	181M	if (firstByte <= 0x7F \|\| firstByte == 0xFF) {
419		// single byte character.
420	136M	return TRUE;
421	136M	}
422
423	44.8M	int32_t secondByte = it->nextByte(det);
424	44.8M	if (secondByte >= 0) {
425	44.8M	it->charValue = (it->charValue << 8) \| secondByte;
426	44.8M	}
427		// else we'll handle the error later.
428
429	44.8M	if (secondByte < 0x40 \|\| secondByte == 0x7F \|\| secondByte == 0xFF) {
430	3.03M	it->error = TRUE;
431	3.03M	}
432
433	44.8M	return TRUE;
434	181M	}
435
436		const char *CharsetRecog_big5::getName() const
437	38.8k	{
438	38.8k	return "Big5";
439	38.8k	}
440
441		const char *CharsetRecog_big5::getLanguage() const
442	38.8k	{
443	38.8k	return "zh";
444	38.8k	}
445
446		UBool CharsetRecog_big5::match(InputText det, CharsetMatch results) const
447	38.8k	{
448	38.8k	int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
449	38.8k	results->set(det, this, confidence);
450	38.8k	return (confidence > 0);
451	38.8k	}
452
453		CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
454		{
455		// nothing to do
456		}
457
458	318M	UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
459	318M	int32_t firstByte = 0;
460	318M	int32_t secondByte = 0;
461	318M	int32_t thirdByte = 0;
462	318M	int32_t fourthByte = 0;
463
464	318M	it->index = it->nextIndex;
465	318M	it->error = FALSE;
466	318M	firstByte = it->charValue = it->nextByte(det);
467
468	318M	if (firstByte < 0) {
469		// Ran off the end of the input data
470	18.9k	return FALSE;
471	18.9k	}
472
473	318M	if (firstByte <= 0x80) {
474		// single byte char
475	228M	return TRUE;
476	228M	}
477
478	89.4M	secondByte = it->nextByte(det);
479	89.4M	if (secondByte >= 0) {
480	89.4M	it->charValue = (it->charValue << 8) \| secondByte;
481	89.4M	}
482		// else we'll handle the error later.
483
484	89.4M	if (firstByte >= 0x81 && firstByte <= 0xFE) {
485		// Two byte Char
486	77.1M	if ((secondByte >= 0x40 && secondByte <= 0x7E) \|\| (secondByte >=80 && secondByte <= 0xFE)) {
487	69.9M	return TRUE;
488	69.9M	}
489
490		// Four byte char
491	7.16M	if (secondByte >= 0x30 && secondByte <= 0x39) {
492	2.44M	thirdByte = it->nextByte(det);
493
494	2.44M	if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
495	1.11M	fourthByte = it->nextByte(det);
496
497	1.11M	if (fourthByte >= 0x30 && fourthByte <= 0x39) {
498	109k	it->charValue = (it->charValue << 16) \| (thirdByte << 8) \| fourthByte;
499
500	109k	return TRUE;
501	109k	}
502	1.11M	}
503	2.44M	}
504
505		// Something wasn't valid, or we ran out of data (-1).
506	7.06M	it->error = TRUE;
507	7.06M	}
508
509	19.3M	return TRUE;
510	89.4M	}
511
512		const char *CharsetRecog_gb_18030::getName() const
513	38.8k	{
514	38.8k	return "GB18030";
515	38.8k	}
516
517		const char *CharsetRecog_gb_18030::getLanguage() const
518	38.8k	{
519	38.8k	return "zh";
520	38.8k	}
521
522		UBool CharsetRecog_gb_18030::match(InputText det, CharsetMatch results) const
523	38.8k	{
524	38.8k	int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
525	38.8k	results->set(det, this, confidence);
526	38.8k	return (confidence > 0);
527	38.8k	}
528
529		U_NAMESPACE_END
530		#endif