/src/mozilla-central/intl/icu/source/i18n/csrmbcs.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "cmemory.h"
#include "csmatch.h"
#include "csrmbcs.h"

#include <math.h>

U_NAMESPACE_BEGIN

// Function-like minimum of two values.  Each argument may be evaluated more
// than once, so only side-effect-free expressions should be passed.
#define min(x,y) (((x)<(y))?(x):(y))

// Table of double-byte code values that CharsetRecog_sjis::match() hands to
// match_mbcs() as the "common characters" for Shift_JIS.  match_mbcs() looks
// values up with binarySearch(), so the entries must stay in ascending order.
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};

// Table of double-byte code values that CharsetRecog_euc_jp::match() hands to
// match_mbcs() as the "common characters" for EUC-JP.  match_mbcs() looks
// values up with binarySearch(), so the entries must stay in ascending order.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};

// Table of double-byte code values that CharsetRecog_euc_kr::match() hands to
// match_mbcs() as the "common characters" for EUC-KR.  match_mbcs() looks
// values up with binarySearch(), so the entries must stay in ascending order.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

// Table of double-byte code values that CharsetRecog_big5::match() hands to
// match_mbcs() as the "common characters" for Big5.  match_mbcs() looks
// values up with binarySearch(), so the entries must stay in ascending order.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

// Table of double-byte code values that CharsetRecog_gb_18030::match() hands
// to match_mbcs() as the "common characters" for GB18030.  match_mbcs() looks
// values up with binarySearch(), so the entries must stay in ascending order.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};

// Binary search over a table of 16-bit code values.
//
//   array  table to search; must be sorted in ascending order
//   len    number of entries in array
//   value  code value to look for
//
// Returns the index of an entry equal to value, or -1 if there is none.
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low  = 0;
    int32_t high = len - 1;

    while (low <= high) {
        const int32_t  probe     = (low + high) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            // Target, if present, lies above the probe.
            low = probe + 1;
        } else {
            // Target, if present, lies below the probe.
            high = probe - 1;
        }
    }

    return -1;
}

// Default constructor: charValue starts at 0, index at -1, nextIndex at 0,
// and both the error and done flags start out FALSE.
IteratedChar::IteratedChar()
    : charValue(0),
      index(-1),
      nextIndex(0),
      error(FALSE),
      done(FALSE)
{
}

/*void IteratedChar::reset()
{
    charValue = 0;
    index     = -1;
    nextIndex = 0;
    error     = FALSE;
    done      = FALSE;
}*/

// Returns the raw input byte at nextIndex and advances nextIndex past it.
// Once nextIndex has reached det->fRawLength, sets done to TRUE and returns
// -1 instead, leaving nextIndex where it is.
int32_t IteratedChar::nextByte(InputText *det)
{
    if (nextIndex < det->fRawLength) {
        return det->fRawInput[nextIndex++];
    }

    // Nothing left to read.
    done = TRUE;
    return -1;
}

// Destructor; the body is intentionally empty.
CharsetRecog_mbcs::~CharsetRecog_mbcs() {}

// Scores how well the bytes in det fit this multi-byte encoding.
//
// Walks the input with nextChar(), counting characters whose value is above
// 0xFF ("double byte"), malformed sequences, and double-byte characters found
// in commonChars, then turns those counts into a confidence value.
//
//   det             input text to examine
//   commonChars     ascending-sorted table of frequently occurring 16-bit
//                   code values, or 0 when no frequency data is available
//   commonCharsLen  number of entries in commonChars
//
// Returns a confidence in the range 0..100; 0 means "no match".
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    int32_t doubleByteCharCount = 0;
    int32_t commonCharCount     = 0;
    int32_t badCharCount        = 0;
    int32_t totalCharCount      = 0;
    int32_t confidence          = 0;
    IteratedChar iter;

    while (nextChar(&iter, det)) {
        totalCharCount++;

        if (iter.error) {
            badCharCount++;
        } else if (iter.charValue > 0xFF) {
            doubleByteCharCount++;

            // commonChars holds 16-bit values only.  Characters assembled from three or
            // four bytes must not be narrowed to uint16_t for the lookup; otherwise their
            // low 16 bits could be mistaken for an unrelated common double-byte character.
            if (commonChars != 0 && iter.charValue <= 0xFFFF &&
                binarySearch(commonChars, commonCharsLen, (uint16_t)iter.charValue) >= 0) {
                commonCharCount += 1;
            }
        }

        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
            // Bail out early if the byte data is not matching the encoding scheme.
            // confidence is still 0 at this point.
            return confidence;
        }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount == 0 && totalCharCount < 10) {
            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
            // We don't have enough data to have any confidence.
            // Statistical analysis of single byte non-ASCII characters would probably help here.
            confidence = 0;
        }
        else {
            //   ASCII or ISO file?  It's probably not our encoding,
            //   but is not incompatible with our encoding, so don't give it a zero.
            confidence = 10;
        }

        return confidence;
    }

    //
    //  No match if there are too many characters that don't fit the encoding scheme.
    //    (should we have zero tolerance for these?)
    //
    if (doubleByteCharCount < 20*badCharCount) {
        confidence = 0;

        return confidence;
    }

    if (commonChars == 0) {
        // We have no statistics on frequently occurring characters.
        //  Assess confidence purely on having a reasonable number of
        //  multi-byte characters (the more the better)
        confidence = 30 + doubleByteCharCount - 20*badCharCount;

        if (confidence > 100) {
            confidence = 100;
        }
    } else {
        //
        // Frequency of occurrence statistics exist.
        //
        // doubleByteCharCount is at least 11 here (smaller counts either returned
        // above or failed the 20*badCharCount test), so maxVal is positive.
        //

        double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
        double scaleFactor = 90.0 / maxVal;
        confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);

        confidence = min(confidence, 100);
    }

    if (confidence < 0) {
        confidence = 0;
    }

    return confidence;
}

// Destructor; the body is intentionally empty.
CharsetRecog_sjis::~CharsetRecog_sjis() {}

// Reads the next Shift_JIS character from det into it.
//
// Returns FALSE when the input is exhausted, TRUE otherwise.  A first byte up
// to 0x7F or in 0xA1..0xDF is returned as a single-byte character.  Any other
// first byte is combined with the following byte, and it->error is set when
// that second byte is missing or lies outside 0x40..0xFE.
UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    it->error = FALSE;
    it->index = it->nextIndex;

    int32_t firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data.
        return FALSE;
    }

    if (firstByte <= 0x7F) {
        return TRUE;
    }

    if (firstByte > 0xA0 && firstByte <= 0xDF) {
        return TRUE;
    }

    const int32_t secondByte = it->nextByte(det);

    if (secondByte < 0x40 || secondByte > 0xFE) {
        // Missing (-1) or illegal second byte value.
        it->error = TRUE;
    }

    if (secondByte >= 0) {
        it->charValue = (firstByte << 8) | secondByte;
    }

    return TRUE;
}

// Runs match_mbcs() with the Shift_JIS common-character table, records the
// resulting confidence in results, and reports whether it is greater than 0.
UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));

    results->set(det, this, score);

    return score > 0;
}

// Returns the charset name string for this recognizer.
const char *CharsetRecog_sjis::getName() const {
    return "Shift_JIS";
}

// Returns the language code string for this recognizer.
const char *CharsetRecog_sjis::getLanguage() const {
    return "ja";
}

// Destructor; the body is intentionally empty.
CharsetRecog_euc::~CharsetRecog_euc() {}

// Reads the next EUC character from det into it.
//
// Returns FALSE when the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the bytes that were read for the character, most
// significant first, and it->error reports whether the sequence was found to
// be malformed or truncated.
UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;

    it->index = it->nextIndex;
    it->error = FALSE;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (firstByte <= 0x8D) {
        // single byte char
        return TRUE;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
        // Two byte Char
        if (secondByte < 0xA1) {
            it->error = TRUE;
        }

        return TRUE;
    }

    if (firstByte == 0x8E) {
        // Code Set 2.
        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
        // We don't know which we've got.
        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
        //   bytes will look like a well formed 2 byte char.
        if (secondByte < 0xA1) {
            it->error = TRUE;
        }

        return TRUE;
    }

    if (firstByte == 0x8F) {
        // Code set 3.
        // Three byte total char size, two bytes of actual char value.
        thirdByte = it->nextByte(det);
        if (thirdByte >= 0) {
            // Only merge a byte that was actually read; the end-of-input marker (-1)
            // must not be OR-ed into the character value.
            it->charValue = (it->charValue << 8) | thirdByte;
        }

        if (thirdByte < 0xa1) {
            // Bad third byte or ran off the end of the input data with a non-ASCII first byte.
            it->error = TRUE;
        }
    }

    // Any other first byte (0x90..0xA0 or 0xFF) ends up here with the second
    // byte already consumed and no error flagged.
    return TRUE;

}

// Destructor; the body is intentionally empty.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp() {}

// Returns the charset name string for this recognizer.
const char *CharsetRecog_euc_jp::getName() const {
    return "EUC-JP";
}

// Returns the language code string for this recognizer.
const char *CharsetRecog_euc_jp::getLanguage() const {
    return "ja";
}

// Runs match_mbcs() with the EUC-JP common-character table, records the
// resulting confidence in results, and reports whether it is greater than 0.
UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));

    results->set(det, this, score);

    return score > 0;
}

// Destructor; the body is intentionally empty.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr() {}

// Returns the charset name string for this recognizer.
const char *CharsetRecog_euc_kr::getName() const {
    return "EUC-KR";
}

// Returns the language code string for this recognizer.
const char *CharsetRecog_euc_kr::getLanguage() const {
    return "ko";
}

// Runs match_mbcs() with the EUC-KR common-character table, records the
// resulting confidence in results, and reports whether it is greater than 0.
UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));

    results->set(det, this, score);

    return score > 0;
}

// Destructor; the body is intentionally empty.
CharsetRecog_big5::~CharsetRecog_big5() {}

// Reads the next Big5 character from det into it.
//
// Returns FALSE when the input is exhausted, TRUE otherwise.  A first byte up
// to 0x7F, or equal to 0xFF, is returned as a single-byte character.  Any
// other first byte is combined with the following byte, and it->error is set
// when that second byte is missing, below 0x40, or equal to 0x7F or 0xFF.
UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
{
    it->error = FALSE;
    it->index = it->nextIndex;

    int32_t firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data.
        return FALSE;
    }

    if (firstByte == 0xFF || firstByte <= 0x7F) {
        // single byte character.
        return TRUE;
    }

    const int32_t secondByte = it->nextByte(det);

    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }

    // A missing second byte (-1) fails the first test below.
    if (!(secondByte >= 0x40 && secondByte != 0x7F && secondByte != 0xFF)) {
        it->error = TRUE;
    }

    return TRUE;
}

// Returns the charset name string for this recognizer.
const char *CharsetRecog_big5::getName() const {
    return "Big5";
}

// Returns the language code string for this recognizer.
const char *CharsetRecog_big5::getLanguage() const {
    return "zh";
}

// Runs match_mbcs() with the Big5 common-character table, records the
// resulting confidence in results, and reports whether it is greater than 0.
UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));

    results->set(det, this, score);

    return score > 0;
}

// Destructor; the body is intentionally empty.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030() {}

// Reads the next GB18030 character from det into it.
//
// Returns FALSE when the input is exhausted, TRUE otherwise.  A first byte up
// to 0x80 is returned as a single-byte character.  For a first byte in
// 0x81..0xFE the character is either two bytes (second byte 0x40..0x7E or
// 0x80..0xFE) or four bytes (second byte 0x30..0x39, third 0x81..0xFE, fourth
// 0x30..0x39); anything else sets it->error.
UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;
    int32_t fourthByte = 0;

    it->index = it->nextIndex;
    it->error = FALSE;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (firstByte <= 0x80) {
        // single byte char
        return TRUE;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char.  The upper range starts at 0x80 (hex): 0x7F is not a
        // legal second byte and must fall through to the error path.
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
            return TRUE;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
            thirdByte = it->nextByte(det);

            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                fourthByte = it->nextByte(det);

                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;

                    return TRUE;
                }
            }
        }

        // Something wasn't valid, or we ran out of data (-1).
        it->error = TRUE;
    }

    // A first byte of 0xFF ends up here with the second byte already consumed
    // and no error flagged.
    return TRUE;
}

// Returns the charset name string for this recognizer.
const char *CharsetRecog_gb_18030::getName() const {
    return "GB18030";
}

// Returns the language code string for this recognizer.
const char *CharsetRecog_gb_18030::getLanguage() const {
    return "zh";
}

// Runs match_mbcs() with the GB18030 common-character table, records the
// resulting confidence in results, and reports whether it is greater than 0.
UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));

    results->set(det, this, score);

    return score > 0;
}

U_NAMESPACE_END
#endif

Coverage Report

Created: 2018-09-25 14:53

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		**********************************************************************
5		* Copyright (C) 2005-2016, International Business Machines
6		* Corporation and others. All Rights Reserved.
7		**********************************************************************
8		*/
9
10		#include "unicode/utypes.h"
11
12		#if !UCONFIG_NO_CONVERSION
13
14		#include "cmemory.h"
15		#include "csmatch.h"
16		#include "csrmbcs.h"
17
18		#include <math.h>
19
20		U_NAMESPACE_BEGIN
21
22	0	#define min(x,y) (((x)<(y))?(x):(y))
23
24		static const uint16_t commonChars_sjis [] = {
25		// TODO: This set of data comes from the character frequency-
26		// of-occurence analysis tool. The data needs to be moved
27		// into a resource and loaded from there.
28		0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29		0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30		0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31		0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32		0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33		0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
35		static const uint16_t commonChars_euc_jp[] = {
36		// TODO: This set of data comes from the character frequency-
37		// of-occurence analysis tool. The data needs to be moved
38		// into a resource and loaded from there.
39		0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40		0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41		0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42		0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43		0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44		0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45		0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46		0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47		0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48		0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
50		static const uint16_t commonChars_euc_kr[] = {
51		// TODO: This set of data comes from the character frequency-
52		// of-occurence analysis tool. The data needs to be moved
53		// into a resource and loaded from there.
54		0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55		0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56		0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57		0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58		0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59		0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60		0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61		0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62		0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63		0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
65		static const uint16_t commonChars_big5[] = {
66		// TODO: This set of data comes from the character frequency-
67		// of-occurence analysis tool. The data needs to be moved
68		// into a resource and loaded from there.
69		0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70		0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71		0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72		0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73		0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74		0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75		0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76		0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77		0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78		0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
80		static const uint16_t commonChars_gb_18030[] = {
81		// TODO: This set of data comes from the character frequency-
82		// of-occurence analysis tool. The data needs to be moved
83		// into a resource and loaded from there.
84		0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85		0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86		0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87		0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88		0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89		0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90		0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91		0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92		0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93		0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
95		static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
96	0	{
97	0	int32_t start = 0, end = len-1;
98	0	int32_t mid = (start+end)/2;
99	0
100	0	while(start <= end) {
101	0	if(array[mid] == value) {
102	0	return mid;
103	0	}
104	0
105	0	if(array[mid] < value){
106	0	start = mid+1;
107	0	} else {
108	0	end = mid-1;
109	0	}
110	0
111	0	mid = (start+end)/2;
112	0	}
113	0
114	0	return -1;
115	0	}
116
117		IteratedChar::IteratedChar() :
118		charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
119	0	{
120	0	// nothing else to do.
121	0	}
122
123		/*void IteratedChar::reset()
124		{
125		charValue = 0;
126		index = -1;
127		nextIndex = 0;
128		error = FALSE;
129		done = FALSE;
130		}*/
131
132		int32_t IteratedChar::nextByte(InputText *det)
133	0	{
134	0	if (nextIndex >= det->fRawLength) {
135	0	done = TRUE;
136	0
137	0	return -1;
138	0	}
139	0
140	0	return det->fRawInput[nextIndex++];
141	0	}
142
143		CharsetRecog_mbcs::~CharsetRecog_mbcs()
144	0	{
145	0	// nothing to do.
146	0	}
147
148	0	int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149	0	int32_t singleByteCharCount = 0;
150	0	int32_t doubleByteCharCount = 0;
151	0	int32_t commonCharCount = 0;
152	0	int32_t badCharCount = 0;
153	0	int32_t totalCharCount = 0;
154	0	int32_t confidence = 0;
155	0	IteratedChar iter;
156	0
157	0	while (nextChar(&iter, det)) {
158	0	totalCharCount++;
159	0
160	0	if (iter.error) {
161	0	badCharCount++;
162	0	} else {
163	0	if (iter.charValue <= 0xFF) {
164	0	singleByteCharCount++;
165	0	} else {
166	0	doubleByteCharCount++;
167	0
168	0	if (commonChars != 0) {
169	0	if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
170	0	commonCharCount += 1;
171	0	}
172	0	}
173	0	}
174	0	}
175	0
176	0
177	0	if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
178	0	// Bail out early if the byte data is not matching the encoding scheme.
179	0	// break detectBlock;
180	0	return confidence;
181	0	}
182	0	}
183	0
184	0	if (doubleByteCharCount <= 10 && badCharCount == 0) {
185	0	// Not many multi-byte chars.
186	0	if (doubleByteCharCount == 0 && totalCharCount < 10) {
187	0	// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
188	0	// We don't have enough data to have any confidence.
189	0	// Statistical analysis of single byte non-ASCII charcters would probably help here.
190	0	confidence = 0;
191	0	}
192	0	else {
193	0	// ASCII or ISO file? It's probably not our encoding,
194	0	// but is not incompatible with our encoding, so don't give it a zero.
195	0	confidence = 10;
196	0	}
197	0
198	0	return confidence;
199	0	}
200	0
201	0	//
202	0	// No match if there are too many characters that don't fit the encoding scheme.
203	0	// (should we have zero tolerance for these?)
204	0	//
205	0	if (doubleByteCharCount < 20*badCharCount) {
206	0	confidence = 0;
207	0
208	0	return confidence;
209	0	}
210	0
211	0	if (commonChars == 0) {
212	0	// We have no statistics on frequently occuring characters.
213	0	// Assess confidence purely on having a reasonable number of
214	0	// multi-byte characters (the more the better)
215	0	confidence = 30 + doubleByteCharCount - 20*badCharCount;
216	0
217	0	if (confidence > 100) {
218	0	confidence = 100;
219	0	}
220	0	} else {
221	0	//
222	0	// Frequency of occurence statistics exist.
223	0	//
224	0
225	0	double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
226	0	double scaleFactor = 90.0 / maxVal;
227	0	confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
228	0
229	0	confidence = min(confidence, 100);
230	0	}
231	0
232	0	if (confidence < 0) {
233	0	confidence = 0;
234	0	}
235	0
236	0	return confidence;
237	0	}
238
239		CharsetRecog_sjis::~CharsetRecog_sjis()
240		{
241		// nothing to do
242		}
243
244	0	UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
245	0	it->index = it->nextIndex;
246	0	it->error = FALSE;
247	0
248	0	int32_t firstByte = it->charValue = it->nextByte(det);
249	0
250	0	if (firstByte < 0) {
251	0	return FALSE;
252	0	}
253	0
254	0	if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
255	0	return TRUE;
256	0	}
257	0
258	0	int32_t secondByte = it->nextByte(det);
259	0	if (secondByte >= 0) {
260	0	it->charValue = (firstByte << 8) | secondByte;
261	0	}
262	0	// else we'll handle the error later.
263	0
264	0	if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
265	0	// Illegal second byte value.
266	0	it->error = TRUE;
267	0	}
268	0
269	0	return TRUE;
270	0	}
271
272	0	UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
273	0	int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
274	0	results->set(det, this, confidence);
275	0	return (confidence > 0);
276	0	}
277
278		const char *CharsetRecog_sjis::getName() const
279	0	{
280	0	return "Shift_JIS";
281	0	}
282
283		const char *CharsetRecog_sjis::getLanguage() const
284	0	{
285	0	return "ja";
286	0	}
287
288		CharsetRecog_euc::~CharsetRecog_euc()
289		{
290		// nothing to do
291		}
292
293	0	UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
294	0	int32_t firstByte = 0;
295	0	int32_t secondByte = 0;
296	0	int32_t thirdByte = 0;
297	0
298	0	it->index = it->nextIndex;
299	0	it->error = FALSE;
300	0	firstByte = it->charValue = it->nextByte(det);
301	0
302	0	if (firstByte < 0) {
303	0	// Ran off the end of the input data
304	0	return FALSE;
305	0	}
306	0
307	0	if (firstByte <= 0x8D) {
308	0	// single byte char
309	0	return TRUE;
310	0	}
311	0
312	0	secondByte = it->nextByte(det);
313	0	if (secondByte >= 0) {
314	0	it->charValue = (it->charValue << 8) | secondByte;
315	0	}
316	0	// else we'll handle the error later.
317	0
318	0	if (firstByte >= 0xA1 && firstByte <= 0xFE) {
319	0	// Two byte Char
320	0	if (secondByte < 0xA1) {
321	0	it->error = TRUE;
322	0	}
323	0
324	0	return TRUE;
325	0	}
326	0
327	0	if (firstByte == 0x8E) {
328	0	// Code Set 2.
329	0	// In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
330	0	// In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
331	0	// We don't know which we've got.
332	0	// Treat it like EUC-JP. If the data really was EUC-TW, the following two
333	0	// bytes will look like a well formed 2 byte char.
334	0	if (secondByte < 0xA1) {
335	0	it->error = TRUE;
336	0	}
337	0
338	0	return TRUE;
339	0	}
340	0
341	0	if (firstByte == 0x8F) {
342	0	// Code set 3.
343	0	// Three byte total char size, two bytes of actual char value.
344	0	thirdByte = it->nextByte(det);
345	0	it->charValue = (it->charValue << 8) | thirdByte;
346	0
347	0	if (thirdByte < 0xa1) {
348	0	// Bad second byte or ran off the end of the input data with a non-ASCII first byte.
349	0	it->error = TRUE;
350	0	}
351	0	}
352	0
353	0	return TRUE;
354	0
355	0	}
356
357		CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
358		{
359		// nothing to do
360		}
361
362		const char *CharsetRecog_euc_jp::getName() const
363	0	{
364	0	return "EUC-JP";
365	0	}
366
367		const char *CharsetRecog_euc_jp::getLanguage() const
368	0	{
369	0	return "ja";
370	0	}
371
372		UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
373	0	{
374	0	int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
375	0	results->set(det, this, confidence);
376	0	return (confidence > 0);
377	0	}
378
379		CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
380		{
381		// nothing to do
382		}
383
384		const char *CharsetRecog_euc_kr::getName() const
385	0	{
386	0	return "EUC-KR";
387	0	}
388
389		const char *CharsetRecog_euc_kr::getLanguage() const
390	0	{
391	0	return "ko";
392	0	}
393
394		UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
395	0	{
396	0	int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
397	0	results->set(det, this, confidence);
398	0	return (confidence > 0);
399	0	}
400
401		CharsetRecog_big5::~CharsetRecog_big5()
402		{
403		// nothing to do
404		}
405
406		UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
407	0	{
408	0	int32_t firstByte;
409	0
410	0	it->index = it->nextIndex;
411	0	it->error = FALSE;
412	0	firstByte = it->charValue = it->nextByte(det);
413	0
414	0	if (firstByte < 0) {
415	0	return FALSE;
416	0	}
417	0
418	0	if (firstByte <= 0x7F || firstByte == 0xFF) {
419	0	// single byte character.
420	0	return TRUE;
421	0	}
422	0
423	0	int32_t secondByte = it->nextByte(det);
424	0	if (secondByte >= 0) {
425	0	it->charValue = (it->charValue << 8) | secondByte;
426	0	}
427	0	// else we'll handle the error later.
428	0
429	0	if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
430	0	it->error = TRUE;
431	0	}
432	0
433	0	return TRUE;
434	0	}
435
436		const char *CharsetRecog_big5::getName() const
437	0	{
438	0	return "Big5";
439	0	}
440
441		const char *CharsetRecog_big5::getLanguage() const
442	0	{
443	0	return "zh";
444	0	}
445
446		UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
447	0	{
448	0	int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
449	0	results->set(det, this, confidence);
450	0	return (confidence > 0);
451	0	}
452
453		CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
454		{
455		// nothing to do
456		}
457
458	0	UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
459	0	int32_t firstByte = 0;
460	0	int32_t secondByte = 0;
461	0	int32_t thirdByte = 0;
462	0	int32_t fourthByte = 0;
463	0
464	0	it->index = it->nextIndex;
465	0	it->error = FALSE;
466	0	firstByte = it->charValue = it->nextByte(det);
467	0
468	0	if (firstByte < 0) {
469	0	// Ran off the end of the input data
470	0	return FALSE;
471	0	}
472	0
473	0	if (firstByte <= 0x80) {
474	0	// single byte char
475	0	return TRUE;
476	0	}
477	0
478	0	secondByte = it->nextByte(det);
479	0	if (secondByte >= 0) {
480	0	it->charValue = (it->charValue << 8) | secondByte;
481	0	}
482	0	// else we'll handle the error later.
483	0
484	0	if (firstByte >= 0x81 && firstByte <= 0xFE) {
485	0	// Two byte Char
486	0	if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
487	0	return TRUE;
488	0	}
489	0
490	0	// Four byte char
491	0	if (secondByte >= 0x30 && secondByte <= 0x39) {
492	0	thirdByte = it->nextByte(det);
493	0
494	0	if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
495	0	fourthByte = it->nextByte(det);
496	0
497	0	if (fourthByte >= 0x30 && fourthByte <= 0x39) {
498	0	it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
499	0
500	0	return TRUE;
501	0	}
502	0	}
503	0	}
504	0
505	0	// Something wasn't valid, or we ran out of data (-1).
506	0	it->error = TRUE;
507	0	}
508	0
509	0	return TRUE;
510	0	}
511
512		const char *CharsetRecog_gb_18030::getName() const
513	0	{
514	0	return "GB18030";
515	0	}
516
517		const char *CharsetRecog_gb_18030::getLanguage() const
518	0	{
519	0	return "zh";
520	0	}
521
522		UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
523	0	{
524	0	int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
525	0	results->set(det, this, confidence);
526	0	return (confidence > 0);
527	0	}
528
529		U_NAMESPACE_END
530		#endif