/src/icu/source/i18n/inputext.cpp

Source
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "inputext.h"

#include "cmemory.h"
#include "cstring.h"

#include <string.h>

U_NAMESPACE_BEGIN

// Number of bytes allocated for fInputBytes. MungeInput() never stores more
// than this many bytes of input for analysis.
#define BUFFER_SIZE 8192

// Allocates room for `count` objects of `type` through uprv_malloc() and casts
// the result to `type *`. The result must be checked for NULL by the caller.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Releases a pointer through uprv_free(); counterpart of NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))

/**
 * Constructs an InputText that has no raw text and no declared encoding.
 *
 * Allocates the BUFFER_SIZE-byte working buffer (fInputBytes) and the
 * 256-entry byte statistics table (fByteStats).
 *
 * @param status Set to U_MEMORY_ALLOCATION_ERROR if either allocation
 *               failed; not modified otherwise.
 */
InputText::InputText(UErrorCode &status)
    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
                                                 //   removed if appropriate.
      fByteStats(NEW_ARRAY(int16_t, 256)),       // Occurrence count of each byte value in fInputBytes
                                                 //   (filled in by MungeInput()).
      fDeclaredEncoding(0),
      fRawInput(0),
      fRawLength(0)
{
    // These two members are otherwise first written by setText(). Give them
    // defined values here so that they are never read uninitialized. They are
    // assigned in the body rather than the initializer list because the
    // member declaration order is not visible from this file.
    fInputLen = 0;
    fC1Bytes  = FALSE;

    if (fInputBytes == NULL || fByteStats == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}

/**
 * Releases the buffers this object allocated: the working copy of the input,
 * the byte statistics table, and the copy of the declared encoding name.
 * The raw input pointer (fRawInput) is not freed here.
 */
InputText::~InputText()
{
    // Buffers allocated by the constructor.
    DELETE_ARRAY(fInputBytes);
    DELETE_ARRAY(fByteStats);

    // Allocated by setDeclaredEncoding(); still 0 if it was never called.
    DELETE_ARRAY(fDeclaredEncoding);
}

/**
 * Records a new block of raw input to be analyzed. The bytes are not copied;
 * only the pointer and length are stored, and any previously munged state
 * (fInputLen, fC1Bytes) is reset.
 *
 * @param in  The raw input bytes. May be NULL, in which case the input is
 *            treated as empty and isSet() will report FALSE.
 * @param len Number of bytes at `in`, or -1 if `in` is NUL-terminated.
 *            Any other negative value is treated as 0.
 */
void InputText::setText(const char *in, int32_t len)
{
    fInputLen  = 0;
    fC1Bytes   = FALSE;
    fRawInput  = (const uint8_t *) in;

    if (in == NULL) {
        // Never call strlen() on, or later index through, a NULL pointer.
        fRawLength = 0;
    } else if (len == -1) {
        fRawLength = (int32_t)uprv_strlen(in);
    } else if (len < 0) {
        // Not a valid byte count; treat as empty rather than storing a
        // negative length.
        fRawLength = 0;
    } else {
        fRawLength = len;
    }
}

/**
 * Stores a private, NUL-terminated copy of the declared encoding name,
 * replacing any previously stored name.
 *
 * @param encoding The encoding name. If NULL the call is a no-op and the
 *                 current declared encoding is kept.
 * @param len      Number of chars in `encoding`, or -1 if it is
 *                 NUL-terminated. Any other negative value is ignored
 *                 (the call is a no-op).
 *
 * If the copy cannot be allocated, fDeclaredEncoding is left NULL.
 */
void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
{
    if (encoding == NULL) {
        return;
    }

    if (len == -1) {
        len = (int32_t)uprv_strlen(encoding);
    } else if (len < 0) {
        // Not a usable length; do not disturb the current value.
        return;
    }

    uprv_free(fDeclaredEncoding);

    // One extra char for the terminating \0. The size is computed in size_t
    // so that len + 1 cannot overflow int32_t.
    fDeclaredEncoding = NEW_ARRAY(char, (size_t)len + 1);
    if (fDeclaredEncoding == NULL) {
        return;
    }

    // Copy at most `len` chars so that nothing beyond the caller-supplied
    // length is read, then terminate explicitly: strncpy() does not append
    // a \0 when the source has no NUL within the first `len` chars.
    uprv_strncpy(fDeclaredEncoding, encoding, len);
    fDeclaredEncoding[len] = 0;
}

/**
 * Reports whether a raw input pointer is currently stored.
 *
 * @return non-zero if fRawInput is not NULL, zero otherwise.
 */
UBool InputText::isSet() const
{
    return !(fRawInput == NULL);
}

/**
*  MungeInput - after getting a set of raw input data to be analyzed, preprocess
*               it by removing what appears to be html markup.
*
*  On return:
*    - fInputBytes / fInputLen hold at most BUFFER_SIZE bytes of the text to be
*      analyzed: either the raw input with everything between '<' and '>'
*      removed, or, when stripping is disabled or abandoned, a plain copy of
*      the start of the raw input.
*    - fByteStats[b] is the number of times byte value b occurs in fInputBytes.
*    - fC1Bytes is TRUE if and only if any byte in the range 0x80..0x9F occurs
*      in fInputBytes.
*
*  @param fStripTags TRUE to attempt markup stripping, FALSE to always use the
*                    raw input as is.
*
* @internal
*/
void InputText::MungeInput(UBool fStripTags) {
    int32_t srci = 0;
    int32_t dsti = 0;
    UBool   inMarkup = FALSE;
    int32_t openTags = 0;   // total number of '<' seen
    int32_t badTags  = 0;   // number of '<' seen while already inside a tag

    //
    //  html / xml markup stripping.
    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
    //     discard everything within < brackets >
    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
    //     guess as to whether the input was actually marked up at all.
    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
    if (fStripTags) {
        // Stop at the end of the raw input or when the output buffer is full,
        // whichever comes first.
        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
            const uint8_t b = fRawInput[srci];

            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
                if (inMarkup) {
                    badTags += 1;
                }

                inMarkup = TRUE;
                openTags += 1;
            }

            if (! inMarkup) {
                fInputBytes[dsti++] = b;
            }

            // Tested after the copy above so that the closing '>' itself is
            // discarded along with the rest of the tag.
            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
                inMarkup = FALSE;
            }
        }

        fInputLen = dsti;
    }

    //
    //  If it looks like this input wasn't marked up, or if it looks like it's
    //    essentially nothing but markup abandon the markup stripping.
    //    Detection will have to work on the unstripped input.
    //
    //  This branch is always taken when fStripTags is FALSE, because
    //  openTags is still 0 in that case.
    //
    if (openTags<5 || openTags/5 < badTags ||
        (fInputLen < 100 && fRawLength>600))
    {
        int32_t limit = fRawLength;

        if (limit > BUFFER_SIZE) {
            limit = BUFFER_SIZE;
        }

        for (srci=0; srci<limit; srci++) {
            fInputBytes[srci] = fRawInput[srci];
        }

        fInputLen = srci;
    }

    //
    // Tally up the byte occurrence statistics.
    // These are available for use by the various detectors.
    //

    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);

    for (srci = 0; srci < fInputLen; srci += 1) {
        fByteStats[fInputBytes[srci]] += 1;
    }

    // Recompute the C1 flag from scratch. Without this reset, a second call on
    // the same text (with no setText() in between) would keep a TRUE left over
    // from an earlier call even if the bytes now in fInputBytes contain no
    // C1 values.
    fC1Bytes = FALSE;

    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
        if (fByteStats[i] != 0) {
            fC1Bytes = TRUE;
            break;
        }
    }
}

U_NAMESPACE_END
#endif


Line	Count	Source
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		**********************************************************************
5		* Copyright (C) 2005-2016, International Business Machines
6		* Corporation and others. All Rights Reserved.
7		**********************************************************************
8		*/
9
10		#include "unicode/utypes.h"
11
12		#if !UCONFIG_NO_CONVERSION
13
14		#include "inputext.h"
15
16		#include "cmemory.h"
17		#include "cstring.h"
18
19		#include <string.h>
20
21		U_NAMESPACE_BEGIN
22
23	218k	#define BUFFER_SIZE 8192
24
25	62.6k	#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
26	94.0k	#define DELETE_ARRAY(array) uprv_free((void *) (array))
27
28		InputText::InputText(UErrorCode &status)
29	31.3k	: fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
30		// removed if appropriate.
31	31.3k	fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
32		// Value is percent, not absolute.
33	31.3k	fDeclaredEncoding(0),
34	31.3k	fRawInput(0),
35	31.3k	fRawLength(0)
36	31.3k	{
37	31.3k	if (fInputBytes == NULL || fByteStats == NULL) {
38	0	status = U_MEMORY_ALLOCATION_ERROR;
39	0	}
40	31.3k	}
41
42		InputText::~InputText()
43	31.3k	{
44	31.3k	DELETE_ARRAY(fDeclaredEncoding);
45	31.3k	DELETE_ARRAY(fByteStats);
46	31.3k	DELETE_ARRAY(fInputBytes);
47	31.3k	}
48
49		void InputText::setText(const char *in, int32_t len)
50	31.2k	{
51	31.2k	fInputLen = 0;
52	31.2k	fC1Bytes = FALSE;
53	31.2k	fRawInput = (const uint8_t *) in;
54	31.2k	fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
55	31.2k	}
56
57		void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
58	0	{
59	0	if(encoding) {
60	0	if (len == -1) {
61	0	len = (int32_t)uprv_strlen(encoding);
62	0	}
63
64	0	len += 1; // to make place for the \0 at the end.
65	0	uprv_free(fDeclaredEncoding);
66	0	fDeclaredEncoding = NEW_ARRAY(char, len);
67	0	uprv_strncpy(fDeclaredEncoding, encoding, len);
68	0	}
69	0	}
70
71		UBool InputText::isSet() const
72	31.2k	{
73	31.2k	return fRawInput != NULL;
74	31.2k	}
75
76		/**
77		* MungeInput - after getting a set of raw input data to be analyzed, preprocess
78		* it by removing what appears to be html markup.
79		*
80		* @internal
81		*/
82	31.2k	void InputText::MungeInput(UBool fStripTags) {
83	31.2k	int srci = 0;
84	31.2k	int dsti = 0;
85	31.2k	uint8_t b;
86	31.2k	bool inMarkup = FALSE;
87	31.2k	int32_t openTags = 0;
88	31.2k	int32_t badTags = 0;
89
90		//
91		// html / xml markup stripping.
92		// quick and dirty, not 100% accurate, but hopefully good enough, statistically.
93		// discard everything within < brackets >
94		// Count how many total '<' and illegal (nested) '<' occur, so we can make some
95		// guess as to whether the input was actually marked up at all.
96		// TODO: Think about how this interacts with EBCDIC charsets that are detected.
97	31.2k	if (fStripTags) {
98	187k	for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
99	185k	b = fRawInput[srci];
100
101	185k	if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
102	0	if (inMarkup) {
103	0	badTags += 1;
104	0	}
105
106	0	inMarkup = TRUE;
107	0	openTags += 1;
108	0	}
109
110	185k	if (! inMarkup) {
111	185k	fInputBytes[dsti++] = b;
112	185k	}
113
114	185k	if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
115	0	inMarkup = FALSE;
116	0	}
117	185k	}
118
119	2.35k	fInputLen = dsti;
120	2.35k	}
121
122		//
123		// If it looks like this input wasn't marked up, or if it looks like it's
124		// essentially nothing but markup abandon the markup stripping.
125		// Detection will have to work on the unstripped input.
126		//
127	31.2k	if (openTags<5 || openTags/5 < badTags ||
128	0	(fInputLen < 100 && fRawLength>600))
129	31.2k	{
130	31.2k	int32_t limit = fRawLength;
131
132	31.2k	if (limit > BUFFER_SIZE) {
133	1.47k	limit = BUFFER_SIZE;
134	1.47k	}
135
136	23.6M	for (srci=0; srci<limit; srci++) {
137	23.6M	fInputBytes[srci] = fRawInput[srci];
138	23.6M	}
139
140	31.2k	fInputLen = srci;
141	31.2k	}
142
143		//
144		// Tally up the byte occurence statistics.
145		// These are available for use by the various detectors.
146		//
147
148	31.2k	uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
149
150	23.6M	for (srci = 0; srci < fInputLen; srci += 1) {
151	23.6M	fByteStats[fInputBytes[srci]] += 1;
152	23.6M	}
153
154	402k	for (int32_t i = 0x80; i <= 0x9F; i += 1) {
155	394k	if (fByteStats[i] != 0) {
156	23.4k	fC1Bytes = TRUE;
157	23.4k	break;
158	23.4k	}
159	394k	}
160	31.2k	}
161
162		U_NAMESPACE_END
163		#endif
164

Coverage Report

Created: 2026-03-12 06:42