/src/tesseract/src/ccutil/unichar.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: unichar.cpp |
3 | | // Description: Unicode character/ligature class. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #include <tesseract/unichar.h> |
20 | | #include "errcode.h" |
21 | | #include "tprintf.h" |
22 | | |
23 | 0 | #define UNI_MAX_LEGAL_UTF32 0x0010FFFF |
24 | | |
25 | | namespace tesseract { |
26 | | |
27 | | // Construct from a utf8 string. If len<0 then the string is null terminated. |
28 | | // If the string is too long to fit in the UNICHAR then it takes only what |
29 | | // will fit. Checks for illegal input and stops at an illegal sequence. |
30 | | // The resulting UNICHAR may be empty. |
31 | 93.4M | UNICHAR::UNICHAR(const char *utf8_str, int len) { |
32 | 93.4M | int total_len = 0; |
33 | 93.4M | int step = 0; |
34 | 93.4M | if (len < 0) { |
35 | 217M | for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) { |
36 | 124M | ; |
37 | 124M | } |
38 | 93.4M | } |
39 | 187M | for (total_len = 0; total_len < len; total_len += step) { |
40 | 93.5M | step = utf8_step(utf8_str + total_len); |
41 | 93.5M | if (total_len + step > UNICHAR_LEN) { |
42 | 0 | break; // Too long. |
43 | 0 | } |
44 | 93.5M | if (step == 0) { |
45 | 0 | break; // Illegal first byte. |
46 | 0 | } |
47 | 93.5M | int i; |
48 | 124M | for (i = 1; i < step; ++i) { |
49 | 30.7M | if ((utf8_str[total_len + i] & 0xc0) != 0x80) { |
50 | 0 | break; |
51 | 0 | } |
52 | 30.7M | } |
53 | 93.5M | if (i < step) { |
54 | 0 | break; // Illegal surrogate |
55 | 0 | } |
56 | 93.5M | } |
57 | 93.4M | memcpy(chars, utf8_str, total_len); |
58 | 93.4M | if (total_len < UNICHAR_LEN) { |
59 | 93.4M | chars[UNICHAR_LEN - 1] = total_len; |
60 | 2.67G | while (total_len < UNICHAR_LEN - 1) { |
61 | 2.58G | chars[total_len++] = 0; |
62 | 2.58G | } |
63 | 93.4M | } |
64 | 93.4M | } |
65 | | |
66 | | // Construct from a single UCS4 character. Illegal values are ignored, |
67 | | // resulting in an empty UNICHAR. |
68 | 0 | UNICHAR::UNICHAR(int unicode) { |
69 | 0 | const int bytemask = 0xBF; |
70 | 0 | const int bytemark = 0x80; |
71 | |
|
72 | 0 | if (unicode < 0x80) { |
73 | 0 | chars[UNICHAR_LEN - 1] = 1; |
74 | 0 | chars[2] = 0; |
75 | 0 | chars[1] = 0; |
76 | 0 | chars[0] = static_cast<char>(unicode); |
77 | 0 | } else if (unicode < 0x800) { |
78 | 0 | chars[UNICHAR_LEN - 1] = 2; |
79 | 0 | chars[2] = 0; |
80 | 0 | chars[1] = static_cast<char>((unicode | bytemark) & bytemask); |
81 | 0 | unicode >>= 6; |
82 | 0 | chars[0] = static_cast<char>(unicode | 0xc0); |
83 | 0 | } else if (unicode < 0x10000) { |
84 | 0 | chars[UNICHAR_LEN - 1] = 3; |
85 | 0 | chars[2] = static_cast<char>((unicode | bytemark) & bytemask); |
86 | 0 | unicode >>= 6; |
87 | 0 | chars[1] = static_cast<char>((unicode | bytemark) & bytemask); |
88 | 0 | unicode >>= 6; |
89 | 0 | chars[0] = static_cast<char>(unicode | 0xe0); |
90 | 0 | } else if (unicode <= UNI_MAX_LEGAL_UTF32) { |
91 | 0 | chars[UNICHAR_LEN - 1] = 4; |
92 | 0 | chars[3] = static_cast<char>((unicode | bytemark) & bytemask); |
93 | 0 | unicode >>= 6; |
94 | 0 | chars[2] = static_cast<char>((unicode | bytemark) & bytemask); |
95 | 0 | unicode >>= 6; |
96 | 0 | chars[1] = static_cast<char>((unicode | bytemark) & bytemask); |
97 | 0 | unicode >>= 6; |
98 | 0 | chars[0] = static_cast<char>(unicode | 0xf0); |
99 | 0 | } else { |
100 | 0 | memset(chars, 0, UNICHAR_LEN); |
101 | 0 | } |
102 | 0 | } |
103 | | |
104 | | // Get the first character as UCS-4. |
105 | 93.4M | int UNICHAR::first_uni() const { |
106 | 93.4M | static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080}; |
107 | 93.4M | int uni = 0; |
108 | 93.4M | int len = utf8_step(chars); |
109 | 93.4M | const char *src = chars; |
110 | | |
111 | 93.4M | switch (len) { |
112 | 0 | default: |
113 | 0 | break; |
114 | 0 | case 4: |
115 | 0 | uni += static_cast<unsigned char>(*src++); |
116 | 0 | uni <<= 6; |
117 | | // Fall through. |
118 | 14.4M | case 3: |
119 | 14.4M | uni += static_cast<unsigned char>(*src++); |
120 | 14.4M | uni <<= 6; |
121 | | // Fall through. |
122 | 16.3M | case 2: |
123 | 16.3M | uni += static_cast<unsigned char>(*src++); |
124 | 16.3M | uni <<= 6; |
125 | | // Fall through. |
126 | 93.4M | case 1: |
127 | 93.4M | uni += static_cast<unsigned char>(*src++); |
128 | 93.4M | } |
129 | 93.4M | uni -= utf8_offsets[len]; |
130 | 93.4M | return uni; |
131 | 93.4M | } |
132 | | |
133 | | // Get a terminated UTF8 string: Must delete[] it after use. |
134 | 0 | char *UNICHAR::utf8_str() const { |
135 | 0 | int len = utf8_len(); |
136 | 0 | char *str = new char[len + 1]; |
137 | 0 | memcpy(str, chars, len); |
138 | 0 | str[len] = 0; |
139 | 0 | return str; |
140 | 0 | } |
141 | | |
142 | | // Get the number of bytes in the first character of the given utf8 string. |
143 | 187M | int UNICHAR::utf8_step(const char *utf8_str) { |
144 | 187M | static const char utf8_bytes[256] = { |
145 | 187M | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
146 | 187M | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
147 | 187M | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
148 | 187M | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
149 | 187M | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
150 | 187M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
151 | 187M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
152 | 187M | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, |
153 | 187M | 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0}; |
154 | | |
155 | 187M | return utf8_bytes[static_cast<unsigned char>(*utf8_str)]; |
156 | 187M | } |
157 | | |
158 | 0 | UNICHAR::const_iterator &UNICHAR::const_iterator::operator++() { |
159 | 0 | ASSERT_HOST(it_ != nullptr); |
160 | 0 | int step = utf8_step(it_); |
161 | 0 | if (step == 0) { |
162 | 0 | tprintf("ERROR: Illegal UTF8 encountered.\n"); |
163 | 0 | for (int i = 0; i < 5 && it_[i] != '\0'; ++i) { |
164 | 0 | tprintf("Index %d char = 0x%x\n", i, it_[i]); |
165 | 0 | } |
166 | 0 | step = 1; |
167 | 0 | } |
168 | 0 | it_ += step; |
169 | 0 | return *this; |
170 | 0 | } |
171 | | |
172 | 0 | int UNICHAR::const_iterator::operator*() const { |
173 | 0 | ASSERT_HOST(it_ != nullptr); |
174 | 0 | const int len = utf8_step(it_); |
175 | 0 | if (len == 0) { |
176 | 0 | tprintf("WARNING: Illegal UTF8 encountered\n"); |
177 | 0 | return ' '; |
178 | 0 | } |
179 | 0 | UNICHAR uch(it_, len); |
180 | 0 | return uch.first_uni(); |
181 | 0 | } |
182 | | |
183 | 0 | int UNICHAR::const_iterator::get_utf8(char *utf8_output) const { |
184 | 0 | ASSERT_HOST(it_ != nullptr); |
185 | 0 | const int len = utf8_step(it_); |
186 | 0 | if (len == 0) { |
187 | 0 | tprintf("WARNING: Illegal UTF8 encountered\n"); |
188 | 0 | utf8_output[0] = ' '; |
189 | 0 | return 1; |
190 | 0 | } |
191 | 0 | strncpy(utf8_output, it_, len); |
192 | 0 | return len; |
193 | 0 | } |
194 | | |
195 | 0 | int UNICHAR::const_iterator::utf8_len() const { |
196 | 0 | ASSERT_HOST(it_ != nullptr); |
197 | 0 | const int len = utf8_step(it_); |
198 | 0 | if (len == 0) { |
199 | 0 | tprintf("WARNING: Illegal UTF8 encountered\n"); |
200 | 0 | return 1; |
201 | 0 | } |
202 | 0 | return len; |
203 | 0 | } |
204 | | |
205 | 0 | bool UNICHAR::const_iterator::is_legal() const { |
206 | 0 | return utf8_step(it_) > 0; |
207 | 0 | } |
208 | | |
209 | 0 | UNICHAR::const_iterator UNICHAR::begin(const char *utf8_str, int len) { |
210 | 0 | return UNICHAR::const_iterator(utf8_str); |
211 | 0 | } |
212 | | |
213 | 0 | UNICHAR::const_iterator UNICHAR::end(const char *utf8_str, int len) { |
214 | 0 | return UNICHAR::const_iterator(utf8_str + len); |
215 | 0 | } |
216 | | |
217 | | // Converts a utf-8 string to a vector of unicodes. |
218 | | // Returns an empty vector if the input contains invalid UTF-8. |
219 | | /* static */ |
220 | 0 | std::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) { |
221 | 0 | const int utf8_length = strlen(utf8_str); |
222 | 0 | std::vector<char32> unicodes; |
223 | 0 | unicodes.reserve(utf8_length); |
224 | 0 | const_iterator end_it(end(utf8_str, utf8_length)); |
225 | 0 | for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) { |
226 | 0 | if (it.is_legal()) { |
227 | 0 | unicodes.push_back(*it); |
228 | 0 | } else { |
229 | 0 | unicodes.clear(); |
230 | 0 | return unicodes; |
231 | 0 | } |
232 | 0 | } |
233 | 0 | return unicodes; |
234 | 0 | } |
235 | | |
236 | | // Returns an empty string if the input contains an invalid unicode. |
237 | 0 | std::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) { |
238 | 0 | std::string utf8_str; |
239 | 0 | for (char32 ch : str32) { |
240 | 0 | UNICHAR uni_ch(ch); |
241 | 0 | int step; |
242 | 0 | if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) { |
243 | 0 | utf8_str.append(uni_ch.utf8(), step); |
244 | 0 | } else { |
245 | 0 | return ""; |
246 | 0 | } |
247 | 0 | } |
248 | 0 | return utf8_str; |
249 | 0 | } |
250 | | |
251 | | } // namespace tesseract |