/src/icu/source/i18n/csrmbcs.cpp

Source
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "cmemory.h"
#include "csmatch.h"
#include "csrmbcs.h"

#include <math.h>

U_NAMESPACE_BEGIN

// Function-like minimum of two values.
// NOTE: this is a macro, so the argument that ends up selected is evaluated
// twice; pass only side-effect-free expressions.
#define min(x,y) (((x)<(y))?(x):(y))

// Table of two-byte character codes handed to match_mbcs() by
// CharsetRecog_sjis::match() as its "common characters" statistics.
// The entries are listed in ascending order and must stay that way:
// match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};

// Table of two-byte character codes handed to match_mbcs() by
// CharsetRecog_euc_jp::match() as its "common characters" statistics.
// The entries are listed in ascending order and must stay that way:
// match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};

// Table of two-byte character codes handed to match_mbcs() by
// CharsetRecog_euc_kr::match() as its "common characters" statistics.
// The entries are listed in ascending order and must stay that way:
// match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

// Table of two-byte character codes handed to match_mbcs() by
// CharsetRecog_big5::match() as its "common characters" statistics.
// The entries are listed in ascending order and must stay that way:
// match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

// Table of two-byte character codes handed to match_mbcs() by
// CharsetRecog_gb_18030::match() as its "common characters" statistics.
// The entries are listed in ascending order and must stay that way:
// match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};

// Searches the ascending-sorted table `array` (holding `len` entries) for
// `value`.  Returns the index of the matching entry, or -1 when the value is
// not present (including when `len` is 0).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    // [lo, hi] is the inclusive window that can still contain `value`.
    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            lo = probe + 1;     // value can only be in the upper half
        } else {
            hi = probe - 1;     // value can only be in the lower half
        }
    }

    return -1;
}

// Creates an iterator state with no character decoded yet: charValue 0,
// index -1, the next read positioned at offset 0, and both the error and
// done flags cleared.
IteratedChar::IteratedChar()
    : charValue(0),
      index(-1),
      nextIndex(0),
      error(false),
      done(false)
{
}

/*void IteratedChar::reset()
{
    charValue = 0;
    index     = -1;
    nextIndex = 0;
    error     = false;
    done      = false;
}*/

// Returns the byte at nextIndex in det->fRawInput and advances nextIndex by
// one.  Once nextIndex has reached det->fRawLength, sets `done` and returns -1
// without advancing.
int32_t IteratedChar::nextByte(InputText *det)
{
    if (nextIndex < det->fRawLength) {
        return det->fRawInput[nextIndex++];
    }

    // Ran off the end of the raw input.
    done = true;
    return -1;
}

// Out-of-line destructor definition; the body is intentionally empty.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
    // nothing to do.
}

/*
 * Shared scoring routine for the multi-byte recognizers in this file.
 *
 * Steps through the input with nextChar() and tallies: every character seen,
 * characters whose value is above 0xFF (multi-byte), characters flagged with
 * an error, and multi-byte characters found in commonChars.
 *
 * @param det             the input being examined
 * @param commonChars     ascending-sorted table searched with binarySearch(),
 *                        or nullptr when no frequency statistics are supplied
 * @param commonCharsLen  number of entries in commonChars
 * @return a confidence value in the range 0..100
 */
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    int32_t multiByteSeen = 0;
    int32_t frequentSeen  = 0;
    int32_t malformedSeen = 0;
    int32_t charsSeen     = 0;
    IteratedChar cursor;

    while (nextChar(&cursor, det)) {
        ++charsSeen;

        if (cursor.error) {
            ++malformedSeen;
        } else if (cursor.charValue > 0xFF) {
            ++multiByteSeen;

            if (commonChars != nullptr
                    && binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(cursor.charValue)) >= 0) {
                ++frequentSeen;
            }
        }

        // Give up early, with zero confidence, once the malformed sequences
        // are clearly too dense for this encoding scheme.
        if (malformedSeen >= 2 && malformedSeen*5 >= multiByteSeen) {
            return 0;
        }
    }

    if (malformedSeen == 0 && multiByteSeen <= 10) {
        // Clean input, but very few multi-byte characters.  With no multi-byte
        // characters and fewer than 10 characters overall there is not enough
        // data for any confidence.  Otherwise the data is compatible with this
        // encoding without being a likely match, so report a small non-zero
        // value.
        const bool tooLittleData = (multiByteSeen == 0 && charsSeen < 10);
        return tooLittleData ? 0 : 10;
    }

    // Reject when malformed sequences are too frequent relative to the
    // well-formed multi-byte characters.
    if (multiByteSeen < 20*malformedSeen) {
        return 0;
    }

    int32_t score = 0;

    if (commonChars != nullptr) {
        // Frequency statistics exist: scale the logarithm of the number of
        // common-character hits against the logarithm of a quarter of the
        // multi-byte character count.
        const double logCeiling = log(static_cast<double>(multiByteSeen) / 4);
        const double perLogUnit = 90.0 / logCeiling;
        score = static_cast<int32_t>(log(static_cast<double>(frequentSeen) + 1) * perLogUnit + 10.0);
    } else {
        // No statistics: base the score purely on how many multi-byte
        // characters were seen, penalising malformed ones.
        score = 30 + multiByteSeen - 20*malformedSeen;
    }

    // Clamp into 0..100.
    if (score > 100) {
        score = 100;
    }
    if (score < 0) {
        score = 0;
    }

    return score;
}

// Out-of-line destructor definition; the body is intentionally empty.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
    // nothing to do
}

/*
 * Reads the next character from det into *it using Shift_JIS byte rules.
 *
 * Lead bytes up to 0x7F and in 0xA1..0xDF are single-byte characters.  Any
 * other lead byte consumes one more byte; the pair is stored in it->charValue
 * and it->error is set when the second byte is missing or outside 0x40..0xFE.
 *
 * Returns false only when the input is exhausted before a lead byte is read.
 */
UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    it->index = it->nextIndex;
    it->error = false;

    it->charValue = it->nextByte(det);
    const int32_t lead = it->charValue;

    if (lead < 0) {
        // No more input.
        return false;
    }

    const bool isSingleByte = (lead <= 0x7F) || (lead > 0xA0 && lead <= 0xDF);
    if (isSingleByte) {
        return true;
    }

    const int32_t trail = it->nextByte(det);
    if (trail >= 0) {
        it->charValue = (lead << 8) | trail;
    }
    // A missing trail byte (-1) is reported through the range check below.

    // NOTE(review): 0x7F is accepted as a trail byte here; confirm against the
    // Shift_JIS definition whether that is intended.
    if (trail < 0x40 || trail > 0xFE) {
        // Illegal second byte value.
        it->error = true;
    }

    return true;
}

// Scores det with match_mbcs() using the Shift_JIS common-character table,
// records the score in *results, and reports whether it is above zero.
UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));

    results->set(det, this, score);

    return score > 0;
}

// Returns the fixed charset name string for this recognizer class.
const char *CharsetRecog_sjis::getName() const
{
    return "Shift_JIS";
}

// Returns the fixed language code string for this recognizer class.
const char *CharsetRecog_sjis::getLanguage() const
{
    return "ja";
}

// Out-of-line destructor definition; the body is intentionally empty.
CharsetRecog_euc::~CharsetRecog_euc()
{
    // nothing to do
}

/*
 * Reads the next character from det into *it using EUC byte rules.
 *
 *   lead <= 0x8D        single-byte character
 *   lead 0xA1..0xFE     two-byte character; second byte must be >= 0xA1
 *   lead == 0x8E        code set 2, handled as a two-byte character
 *   lead == 0x8F        code set 3, three bytes; third byte must be >= 0xA1
 *   any other lead      the second byte is consumed and no error is flagged
 *
 * Returns false only when the input is exhausted before a lead byte is read.
 */
UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    it->index = it->nextIndex;
    it->error = false;

    it->charValue = it->nextByte(det);
    const int32_t lead = it->charValue;

    if (lead < 0) {
        // Ran off the end of the input data.
        return false;
    }

    if (lead <= 0x8D) {
        // Single-byte character.
        return true;
    }

    const int32_t second = it->nextByte(det);
    if (second >= 0) {
        it->charValue = (it->charValue << 8) | second;
    }
    // A missing second byte (-1) fails the range checks below.

    // Ordinary two-byte characters and code set 2 are validated identically.
    // Code set 2 is two bytes in EUC-JP but four in EUC-TW; it is treated as
    // EUC-JP here, on the expectation that for EUC-TW data the remaining two
    // bytes will then look like a well-formed two-byte character.
    if ((lead >= 0xA1 && lead <= 0xFE) || lead == 0x8E) {
        if (second < 0xA1) {
            it->error = true;
        }

        return true;
    }

    if (lead == 0x8F) {
        // Code set 3: three bytes in total.  Only the third byte is range
        // checked here; a missing third byte (-1) also counts as an error.
        const int32_t third = it->nextByte(det);
        it->charValue = (it->charValue << 8) | third;

        if (third < 0xA1) {
            it->error = true;
        }
    }

    return true;
}

// Out-of-line destructor definition; the body is intentionally empty.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
    // nothing to do
}

// Returns the fixed charset name string for this recognizer class.
const char *CharsetRecog_euc_jp::getName() const
{
    return "EUC-JP";
}

// Returns the fixed language code string for this recognizer class.
const char *CharsetRecog_euc_jp::getLanguage() const
{
    return "ja";
}

// Scores det with match_mbcs() using the EUC-JP common-character table,
// records the score in *results, and reports whether it is above zero.
UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));

    results->set(det, this, score);

    return score > 0;
}

// Out-of-line destructor definition; the body is intentionally empty.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
    // nothing to do
}

// Returns the fixed charset name string for this recognizer class.
const char *CharsetRecog_euc_kr::getName() const
{
    return "EUC-KR";
}

// Returns the fixed language code string for this recognizer class.
const char *CharsetRecog_euc_kr::getLanguage() const
{
    return "ko";
}

// Scores det with match_mbcs() using the EUC-KR common-character table,
// records the score in *results, and reports whether it is above zero.
UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));

    results->set(det, this, score);

    return score > 0;
}

// Out-of-line destructor definition; the body is intentionally empty.
CharsetRecog_big5::~CharsetRecog_big5()
{
    // nothing to do
}

/*
 * Reads the next character from det into *it using Big5 byte rules.
 *
 * Lead bytes up to 0x7F, and 0xFF, are single-byte characters.  Any other
 * lead byte consumes one more byte; the pair is stored in it->charValue and
 * it->error is set when the second byte is missing, below 0x40, or equal to
 * 0x7F or 0xFF.
 *
 * Returns false only when the input is exhausted before a lead byte is read.
 */
UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
{
    it->index = it->nextIndex;
    it->error = false;

    it->charValue = it->nextByte(det);
    const int32_t lead = it->charValue;

    if (lead < 0) {
        // No more input.
        return false;
    }

    if (lead <= 0x7F || lead == 0xFF) {
        // Single-byte character.
        return true;
    }

    const int32_t trail = it->nextByte(det);
    if (trail >= 0) {
        it->charValue = (it->charValue << 8) | trail;
    }
    // A missing trail byte (-1) is caught by the "< 0x40" test below.

    const bool badTrail = (trail < 0x40) || (trail == 0x7F) || (trail == 0xFF);
    if (badTrail) {
        it->error = true;
    }

    return true;
}

// Returns the fixed charset name string for this recognizer class.
const char *CharsetRecog_big5::getName() const
{
    return "Big5";
}

// Returns the fixed language code string for this recognizer class.
const char *CharsetRecog_big5::getLanguage() const
{
    return "zh";
}

// Scores det with match_mbcs() using the Big5 common-character table,
// records the score in *results, and reports whether it is above zero.
UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));

    results->set(det, this, score);

    return score > 0;
}

// Out-of-line destructor definition; the body is intentionally empty.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
    // nothing to do
}

/*
 * Reads the next character from det into *it using GB18030 byte rules.
 *
 *   lead <= 0x80      single-byte character
 *   lead 0x81..0xFE   followed by a byte in 0x40..0x7E or 0x80..0xFE:
 *                     two-byte character
 *   lead 0x81..0xFE   followed by [0x30..0x39][0x81..0xFE][0x30..0x39]:
 *                     four-byte character
 *   lead 0x81..0xFE   followed by anything else (or by the end of the input):
 *                     it->error is set
 *   lead 0xFF         the second byte is consumed and no error is flagged
 *
 * Returns false only when the input is exhausted before a lead byte is read.
 */
UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    it->index = it->nextIndex;
    it->error = false;

    const int32_t firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return false;
    }

    if (firstByte <= 0x80) {
        // single byte char
        return true;
    }

    const int32_t secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char.
        // The trail-byte ranges are 0x40..0x7E and 0x80..0xFE.  The second
        // range must start at hexadecimal 0x80 so that 0x7F, which is not a
        // valid trail byte, is rejected.
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
            return true;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
            const int32_t thirdByte = it->nextByte(det);

            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                const int32_t fourthByte = it->nextByte(det);

                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;

                    return true;
                }
            }
        }

        // Something wasn't valid, or we ran out of data (-1).
        it->error = true;
    }

    return true;
}

// Returns the fixed charset name string for this recognizer class.
const char *CharsetRecog_gb_18030::getName() const
{
    return "GB18030";
}

// Returns the fixed language code string for this recognizer class.
const char *CharsetRecog_gb_18030::getLanguage() const
{
    return "zh";
}

// Scores det with match_mbcs() using the GB18030 common-character table,
// records the score in *results, and reports whether it is above zero.
UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));

    results->set(det, this, score);

    return score > 0;
}

U_NAMESPACE_END
#endif

Coverage Report

Created: 2026-06-13 06:44

Line	Count	Source
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		**********************************************************************
5		* Copyright (C) 2005-2016, International Business Machines
6		* Corporation and others. All Rights Reserved.
7		**********************************************************************
8		*/
9
10		#include "unicode/utypes.h"
11
12		#if !UCONFIG_NO_CONVERSION
13
14		#include "cmemory.h"
15		#include "csmatch.h"
16		#include "csrmbcs.h"
17
18		#include <math.h>
19
20		U_NAMESPACE_BEGIN
21
22	5.85k	#define min(x,y) (((x)<(y))?(x):(y))
23
24		static const uint16_t commonChars_sjis [] = {
25		// TODO: This set of data comes from the character frequency-
26		// of-occurrence analysis tool. The data needs to be moved
27		// into a resource and loaded from there.
28		0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29		0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30		0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31		0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32		0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33		0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
35		static const uint16_t commonChars_euc_jp[] = {
36		// TODO: This set of data comes from the character frequency-
37		// of-occurrence analysis tool. The data needs to be moved
38		// into a resource and loaded from there.
39		0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40		0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41		0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42		0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43		0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44		0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45		0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46		0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47		0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48		0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
50		static const uint16_t commonChars_euc_kr[] = {
51		// TODO: This set of data comes from the character frequency-
52		// of-occurrence analysis tool. The data needs to be moved
53		// into a resource and loaded from there.
54		0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55		0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56		0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57		0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58		0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59		0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60		0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61		0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62		0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63		0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
65		static const uint16_t commonChars_big5[] = {
66		// TODO: This set of data comes from the character frequency-
67		// of-occurrence analysis tool. The data needs to be moved
68		// into a resource and loaded from there.
69		0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70		0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71		0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72		0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73		0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74		0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75		0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76		0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77		0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78		0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
80		static const uint16_t commonChars_gb_18030[] = {
81		// TODO: This set of data comes from the character frequency-
82		// of-occurrence analysis tool. The data needs to be moved
83		// into a resource and loaded from there.
84		0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85		0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86		0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87		0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88		0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89		0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90		0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91		0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92		0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93		0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
95		static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
96	263M	{
97	263M	int32_t start = 0, end = len-1;
98	263M	int32_t mid = (start+end)/2;
99
100	1.98G	while(start <= end) {
101	1.72G	if(array[mid] == value) {
102	161k	return mid;
103	161k	}
104
105	1.72G	if(array[mid] < value){
106	1.14G	start = mid+1;
107	1.14G	} else {
108	573M	end = mid-1;
109	573M	}
110
111	1.72G	mid = (start+end)/2;
112	1.72G	}
113
114	262M	return -1;
115	263M	}
116
117		IteratedChar::IteratedChar() :
118	3.29M	charValue(0), index(-1), nextIndex(0), error(false), done(false)
119	3.29M	{
120		// nothing else to do.
121	3.29M	}
122
123		/*void IteratedChar::reset()
124		{
125		charValue = 0;
126		index = -1;
127		nextIndex = 0;
128		error = false;
129		done = false;
130		}*/
131
132		int32_t IteratedChar::nextByte(InputText *det)
133	1.46G	{
134	1.46G	if (nextIndex >= det->fRawLength) {
135	3.43M	done = true;
136
137	3.43M	return -1;
138	3.43M	}
139
140	1.46G	return det->fRawInput[nextIndex++];
141	1.46G	}
142
143		CharsetRecog_mbcs::~CharsetRecog_mbcs()
144	0	{
145		// nothing to do.
146	0	}
147
148	3.29M	int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149	3.29M	int32_t doubleByteCharCount = 0;
150	3.29M	int32_t commonCharCount = 0;
151	3.29M	int32_t badCharCount = 0;
152	3.29M	int32_t totalCharCount = 0;
153	3.29M	int32_t confidence = 0;
154	3.29M	IteratedChar iter;
155
156	1.17G	while (nextChar(&iter, det)) {
157	1.17G	totalCharCount++;
158
159	1.17G	if (iter.error) {
160	19.8M	badCharCount++;
161	1.15G	} else {
162	1.15G	if (iter.charValue > 0xFF) {
163	263M	doubleByteCharCount++;
164
165	263M	if (commonChars != nullptr) {
166	263M	if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){
167	161k	commonCharCount += 1;
168	161k	}
169	263M	}
170	263M	}
171	1.15G	}
172
173
174	1.17G	if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
175		// Bail out early if the byte data is not matching the encoding scheme.
176		// break detectBlock;
177	163k	return confidence;
178	163k	}
179	1.17G	}
180
181	3.13M	if (doubleByteCharCount <= 10 && badCharCount == 0) {
182		// Not many multi-byte chars.
183	2.79M	if (doubleByteCharCount == 0 && totalCharCount < 10) {
184		// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
185		// We don't have enough data to have any confidence.
186		// Statistical analysis of single byte non-ASCII characters would probably help here.
187	2.38M	confidence = 0;
188	2.38M	}
189	406k	else {
190		// ASCII or ISO file? It's probably not our encoding,
191		// but is not incompatible with our encoding, so don't give it a zero.
192	406k	confidence = 10;
193	406k	}
194
195	2.79M	return confidence;
196	2.79M	}
197
198		//
199		// No match if there are too many characters that don't fit the encoding scheme.
200		// (should we have zero tolerance for these?)
201		//
202	341k	if (doubleByteCharCount < 20*badCharCount) {
203	335k	confidence = 0;
204
205	335k	return confidence;
206	335k	}
207
208	5.85k	if (commonChars == nullptr) {
209		// We have no statistics on frequently occurring characters.
210		// Assess confidence purely on having a reasonable number of
211		// multi-byte characters (the more the better)
212	0	confidence = 30 + doubleByteCharCount - 20*badCharCount;
213
214	0	if (confidence > 100) {
215	0	confidence = 100;
216	0	}
217	5.85k	} else {
218		//
219		// Frequency of occurrence statistics exist.
220		//
221
222	5.85k	double maxVal = log(static_cast<double>(doubleByteCharCount) / 4); /*(float)?*/
223	5.85k	double scaleFactor = 90.0 / maxVal;
224	5.85k	confidence = static_cast<int32_t>(log(static_cast<double>(commonCharCount) + 1) * scaleFactor + 10.0);
225
226	5.85k	confidence = min(confidence, 100);
227	5.85k	}
228
229	5.85k	if (confidence < 0) {
230	0	confidence = 0;
231	0	}
232
233	5.85k	return confidence;
234	341k	}
235
236		CharsetRecog_sjis::~CharsetRecog_sjis()
237		{
238		// nothing to do
239		}
240
241	202M	UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
242	202M	it->index = it->nextIndex;
243	202M	it->error = false;
244
245	202M	int32_t firstByte = it->charValue = it->nextByte(det);
246
247	202M	if (firstByte < 0) {
248	615k	return false;
249	615k	}
250
251	201M	if (firstByte <= 0x7F \|\| (firstByte > 0xA0 && firstByte <= 0xDF)) {
252	157M	return true;
253	157M	}
254
255	44.5M	int32_t secondByte = it->nextByte(det);
256	44.5M	if (secondByte >= 0) {
257	44.5M	it->charValue = (firstByte << 8) \| secondByte;
258	44.5M	}
259		// else we'll handle the error later.
260
261	44.5M	if (! ((secondByte >= 0x40 && secondByte <= 0x7F) \|\| (secondByte >= 0x80 && secondByte <= 0xFE))) {
262		// Illegal second byte value.
263	4.02M	it->error = true;
264	4.02M	}
265
266	44.5M	return true;
267	201M	}
268
269	659k	UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
270	659k	int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
271	659k	results->set(det, this, confidence);
272	659k	return (confidence > 0);
273	659k	}
274
275		const char *CharsetRecog_sjis::getName() const
276	659k	{
277	659k	return "Shift_JIS";
278	659k	}
279
280		const char *CharsetRecog_sjis::getLanguage() const
281	659k	{
282	659k	return "ja";
283	659k	}
284
285		CharsetRecog_euc::~CharsetRecog_euc()
286		{
287		// nothing to do
288		}
289
290	412M	UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
291	412M	int32_t firstByte = 0;
292	412M	int32_t secondByte = 0;
293	412M	int32_t thirdByte = 0;
294
295	412M	it->index = it->nextIndex;
296	412M	it->error = false;
297	412M	firstByte = it->charValue = it->nextByte(det);
298
299	412M	if (firstByte < 0) {
300		// Ran off the end of the input data
301	1.26M	return false;
302	1.26M	}
303
304	411M	if (firstByte <= 0x8D) {
305		// single byte char
306	328M	return true;
307	328M	}
308
309	83.3M	secondByte = it->nextByte(det);
310	83.3M	if (secondByte >= 0) {
311	83.1M	it->charValue = (it->charValue << 8) \| secondByte;
312	83.1M	}
313		// else we'll handle the error later.
314
315	83.3M	if (firstByte >= 0xA1 && firstByte <= 0xFE) {
316		// Two byte Char
317	47.6M	if (secondByte < 0xA1) {
318	5.15M	it->error = true;
319	5.15M	}
320
321	47.6M	return true;
322	47.6M	}
323
324	35.7M	if (firstByte == 0x8E) {
325		// Code Set 2.
326		// In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
327		// In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
328		// We don't know which we've got.
329		// Treat it like EUC-JP. If the data really was EUC-TW, the following two
330		// bytes will look like a well formed 2 byte char.
331	21.9k	if (secondByte < 0xA1) {
332	19.0k	it->error = true;
333	19.0k	}
334
335	21.9k	return true;
336	21.9k	}
337
338	35.7M	if (firstByte == 0x8F) {
339		// Code set 3.
340		// Three byte total char size, two bytes of actual char value.
341	732k	thirdByte = it->nextByte(det);
342	732k	it->charValue = (it->charValue << 8) \| thirdByte;
343
344	732k	if (thirdByte < 0xa1) {
345		// Bad second byte or ran off the end of the input data with a non-ASCII first byte.
346	11.4k	it->error = true;
347	11.4k	}
348	732k	}
349
350	35.7M	return true;
351
352	35.7M	}
353
354		CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
355		{
356		// nothing to do
357		}
358
359		const char *CharsetRecog_euc_jp::getName() const
360	659k	{
361	659k	return "EUC-JP";
362	659k	}
363
364		const char *CharsetRecog_euc_jp::getLanguage() const
365	659k	{
366	659k	return "ja";
367	659k	}
368
369		UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
370	659k	{
371	659k	int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
372	659k	results->set(det, this, confidence);
373	659k	return (confidence > 0);
374	659k	}
375
376		CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
377		{
378		// nothing to do
379		}
380
381		const char *CharsetRecog_euc_kr::getName() const
382	659k	{
383	659k	return "EUC-KR";
384	659k	}
385
386		const char *CharsetRecog_euc_kr::getLanguage() const
387	659k	{
388	659k	return "ko";
389	659k	}
390
391		UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
392	659k	{
393	659k	int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
394	659k	results->set(det, this, confidence);
395	659k	return (confidence > 0);
396	659k	}
397
398		CharsetRecog_big5::~CharsetRecog_big5()
399		{
400		// nothing to do
401		}
402
403		UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
404	214M	{
405	214M	int32_t firstByte;
406
407	214M	it->index = it->nextIndex;
408	214M	it->error = false;
409	214M	firstByte = it->charValue = it->nextByte(det);
410
411	214M	if (firstByte < 0) {
412	628k	return false;
413	628k	}
414
415	214M	if (firstByte <= 0x7F \|\| firstByte == 0xFF) {
416		// single byte character.
417	158M	return true;
418	158M	}
419
420	56.0M	int32_t secondByte = it->nextByte(det);
421	56.0M	if (secondByte >= 0) {
422	55.9M	it->charValue = (it->charValue << 8) \| secondByte;
423	55.9M	}
424		// else we'll handle the error later.
425
426	56.0M	if (secondByte < 0x40 \|\| secondByte == 0x7F \|\| secondByte == 0xFF) {
427	3.39M	it->error = true;
428	3.39M	}
429
430	56.0M	return true;
431	214M	}
432
433		const char *CharsetRecog_big5::getName() const
434	659k	{
435	659k	return "Big5";
436	659k	}
437
438		const char *CharsetRecog_big5::getLanguage() const
439	659k	{
440	659k	return "zh";
441	659k	}
442
443		UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
444	659k	{
445	659k	int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
446	659k	results->set(det, this, confidence);
447	659k	return (confidence > 0);
448	659k	}
449
450		CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
451		{
452		// nothing to do
453		}
454
455	347M	UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
456	347M	int32_t firstByte = 0;
457	347M	int32_t secondByte = 0;
458	347M	int32_t thirdByte = 0;
459	347M	int32_t fourthByte = 0;
460
461	347M	it->index = it->nextIndex;
462	347M	it->error = false;
463	347M	firstByte = it->charValue = it->nextByte(det);
464
465	347M	if (firstByte < 0) {
466		// Ran off the end of the input data
467	630k	return false;
468	630k	}
469
470	346M	if (firstByte <= 0x80) {
471		// single byte char
472	247M	return true;
473	247M	}
474
475	99.1M	secondByte = it->nextByte(det);
476	99.1M	if (secondByte >= 0) {
477	99.0M	it->charValue = (it->charValue << 8) \| secondByte;
478	99.0M	}
479		// else we'll handle the error later.
480
481	99.1M	if (firstByte >= 0x81 && firstByte <= 0xFE) {
482		// Two byte Char
483	87.2M	if ((secondByte >= 0x40 && secondByte <= 0x7E) \|\| (secondByte >=80 && secondByte <= 0xFE)) {
484	79.8M	return true;
485	79.8M	}
486
487		// Four byte char
488	7.36M	if (secondByte >= 0x30 && secondByte <= 0x39) {
489	2.40M	thirdByte = it->nextByte(det);
490
491	2.40M	if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
492	1.09M	fourthByte = it->nextByte(det);
493
494	1.09M	if (fourthByte >= 0x30 && fourthByte <= 0x39) {
495	111k	it->charValue = (it->charValue << 16) \| (thirdByte << 8) \| fourthByte;
496
497	111k	return true;
498	111k	}
499	1.09M	}
500	2.40M	}
501
502		// Something wasn't valid, or we ran out of data (-1).
503	7.25M	it->error = true;
504	7.25M	}
505
506	19.1M	return true;
507	99.1M	}
508
509		const char *CharsetRecog_gb_18030::getName() const
510	659k	{
511	659k	return "GB18030";
512	659k	}
513
514		const char *CharsetRecog_gb_18030::getLanguage() const
515	659k	{
516	659k	return "zh";
517	659k	}
518
519		UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
520	659k	{
521	659k	int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
522	659k	results->set(det, this, confidence);
523	659k	return (confidence > 0);
524	659k	}
525
526		U_NAMESPACE_END
527		#endif