/src/icu/source/i18n/inputext.cpp
Line | Count | Source |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ********************************************************************** |
5 | | * Copyright (C) 2005-2016, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ********************************************************************** |
8 | | */ |
9 | | |
10 | | #include "unicode/utypes.h" |
11 | | |
12 | | #if !UCONFIG_NO_CONVERSION |
13 | | |
14 | | #include "inputext.h" |
15 | | |
16 | | #include "cmemory.h" |
17 | | #include "cstring.h" |
18 | | |
19 | | #include <string.h> |
20 | | |
21 | | U_NAMESPACE_BEGIN |
22 | | |
// Capacity, in bytes, of the fInputBytes working buffer. At most this many
// bytes of the raw input are copied for analysis.
#define BUFFER_SIZE 8192

// Allocate uninitialized storage for `count` objects of `type` through
// uprv_malloc(). The expression yields a `type *`; callers in this file
// compare the result against NULL.
#define NEW_ARRAY(type,count) (type *) uprv_malloc(sizeof(type) * (count))

// Hand storage obtained with NEW_ARRAY back to uprv_free().
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27 | | |
28 | | InputText::InputText(UErrorCode &status) |
29 | 31.3k | : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been |
30 | | // removed if appropriate. |
31 | 31.3k | fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. |
32 | | // Value is percent, not absolute. |
33 | 31.3k | fDeclaredEncoding(0), |
34 | 31.3k | fRawInput(0), |
35 | 31.3k | fRawLength(0) |
36 | 31.3k | { |
37 | 31.3k | if (fInputBytes == NULL || fByteStats == NULL) { |
38 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
39 | 0 | } |
40 | 31.3k | } |
41 | | |
42 | | InputText::~InputText() |
43 | 31.3k | { |
44 | 31.3k | DELETE_ARRAY(fDeclaredEncoding); |
45 | 31.3k | DELETE_ARRAY(fByteStats); |
46 | 31.3k | DELETE_ARRAY(fInputBytes); |
47 | 31.3k | } |
48 | | |
49 | | void InputText::setText(const char *in, int32_t len) |
50 | 31.2k | { |
51 | 31.2k | fInputLen = 0; |
52 | 31.2k | fC1Bytes = FALSE; |
53 | 31.2k | fRawInput = (const uint8_t *) in; |
54 | 31.2k | fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; |
55 | 31.2k | } |
56 | | |
57 | | void InputText::setDeclaredEncoding(const char* encoding, int32_t len) |
58 | 0 | { |
59 | 0 | if(encoding) { |
60 | 0 | if (len == -1) { |
61 | 0 | len = (int32_t)uprv_strlen(encoding); |
62 | 0 | } |
63 | |
|
64 | 0 | len += 1; // to make place for the \0 at the end. |
65 | 0 | uprv_free(fDeclaredEncoding); |
66 | 0 | fDeclaredEncoding = NEW_ARRAY(char, len); |
67 | 0 | uprv_strncpy(fDeclaredEncoding, encoding, len); |
68 | 0 | } |
69 | 0 | } |
70 | | |
71 | | UBool InputText::isSet() const |
72 | 31.2k | { |
73 | 31.2k | return fRawInput != NULL; |
74 | 31.2k | } |
75 | | |
76 | | /** |
77 | | * MungeInput - after getting a set of raw input data to be analyzed, preprocess |
78 | | * it by removing what appears to be html markup. |
79 | | * |
80 | | * @internal |
81 | | */ |
82 | 31.2k | void InputText::MungeInput(UBool fStripTags) { |
83 | 31.2k | int srci = 0; |
84 | 31.2k | int dsti = 0; |
85 | 31.2k | uint8_t b; |
86 | 31.2k | bool inMarkup = FALSE; |
87 | 31.2k | int32_t openTags = 0; |
88 | 31.2k | int32_t badTags = 0; |
89 | | |
90 | | // |
91 | | // html / xml markup stripping. |
92 | | // quick and dirty, not 100% accurate, but hopefully good enough, statistically. |
93 | | // discard everything within < brackets > |
94 | | // Count how many total '<' and illegal (nested) '<' occur, so we can make some |
95 | | // guess as to whether the input was actually marked up at all. |
96 | | // TODO: Think about how this interacts with EBCDIC charsets that are detected. |
97 | 31.2k | if (fStripTags) { |
98 | 187k | for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { |
99 | 185k | b = fRawInput[srci]; |
100 | | |
101 | 185k | if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ |
102 | 0 | if (inMarkup) { |
103 | 0 | badTags += 1; |
104 | 0 | } |
105 | |
|
106 | 0 | inMarkup = TRUE; |
107 | 0 | openTags += 1; |
108 | 0 | } |
109 | | |
110 | 185k | if (! inMarkup) { |
111 | 185k | fInputBytes[dsti++] = b; |
112 | 185k | } |
113 | | |
114 | 185k | if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ |
115 | 0 | inMarkup = FALSE; |
116 | 0 | } |
117 | 185k | } |
118 | | |
119 | 2.35k | fInputLen = dsti; |
120 | 2.35k | } |
121 | | |
122 | | // |
123 | | // If it looks like this input wasn't marked up, or if it looks like it's |
124 | | // essentially nothing but markup abandon the markup stripping. |
125 | | // Detection will have to work on the unstripped input. |
126 | | // |
127 | 31.2k | if (openTags<5 || openTags/5 < badTags || |
128 | 0 | (fInputLen < 100 && fRawLength>600)) |
129 | 31.2k | { |
130 | 31.2k | int32_t limit = fRawLength; |
131 | | |
132 | 31.2k | if (limit > BUFFER_SIZE) { |
133 | 1.47k | limit = BUFFER_SIZE; |
134 | 1.47k | } |
135 | | |
136 | 23.6M | for (srci=0; srci<limit; srci++) { |
137 | 23.6M | fInputBytes[srci] = fRawInput[srci]; |
138 | 23.6M | } |
139 | | |
140 | 31.2k | fInputLen = srci; |
141 | 31.2k | } |
142 | | |
143 | | // |
144 | | // Tally up the byte occurence statistics. |
145 | | // These are available for use by the various detectors. |
146 | | // |
147 | | |
148 | 31.2k | uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); |
149 | | |
150 | 23.6M | for (srci = 0; srci < fInputLen; srci += 1) { |
151 | 23.6M | fByteStats[fInputBytes[srci]] += 1; |
152 | 23.6M | } |
153 | | |
154 | 402k | for (int32_t i = 0x80; i <= 0x9F; i += 1) { |
155 | 394k | if (fByteStats[i] != 0) { |
156 | 23.4k | fC1Bytes = TRUE; |
157 | 23.4k | break; |
158 | 23.4k | } |
159 | 394k | } |
160 | 31.2k | } |
161 | | |
162 | | U_NAMESPACE_END |
163 | | #endif |
164 | | |