/src/tesseract/src/ccutil/unicharset.h
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: unicharset.h |
3 | | // Description: Unicode character/ligature set class. |
4 | | // Author: Thomas Kielbus |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #ifndef TESSERACT_CCUTIL_UNICHARSET_H_ |
20 | | #define TESSERACT_CCUTIL_UNICHARSET_H_ |
21 | | |
22 | | #include "errcode.h" |
23 | | #include "unicharmap.h" |
24 | | |
25 | | #include <tesseract/unichar.h> |
26 | | #include "helpers.h" |
27 | | #include "serialis.h" |
28 | | |
29 | | #include <functional> // for std::function |
30 | | |
31 | | namespace tesseract { |
32 | | |
// Reserved unichar_id values. Every unicharset contains these entries.
// Warning! The order here must stay in sync with kSpecialUnicharCodes.
enum SpecialUnicharCodes {
  UNICHAR_SPACE = 0,
  UNICHAR_JOINED = 1,
  UNICHAR_BROKEN = 2,

  // Number of special codes above; follows directly after the last code.
  SPECIAL_UNICHAR_CODES_COUNT
};
42 | | |
// Two-state flag accepted by unichar_insert. The name is deliberately a bit
// of a double negative so that the default value can be false.
enum class OldUncleanUnichars {
  kFalse = 0,
  kTrue = 1,
};
49 | | |
50 | | class TESS_API CHAR_FRAGMENT { |
51 | | public: |
52 | | // Minimum number of characters used for fragment representation. |
53 | | static const int kMinLen = 6; |
54 | | // Maximum number of characters used for fragment representation. |
55 | | static const int kMaxLen = 3 + UNICHAR_LEN + 2; |
56 | | // Maximum number of fragments per character. |
57 | | static const int kMaxChunks = 5; |
58 | | |
59 | | // Setters and Getters. |
60 | 570 | inline void set_all(const char *unichar, int pos, int total, bool natural) { |
61 | 570 | set_unichar(unichar); |
62 | 570 | set_pos(pos); |
63 | 570 | set_total(total); |
64 | 570 | set_natural(natural); |
65 | 570 | } |
66 | 570 | inline void set_unichar(const char *uch) { |
67 | 570 | strncpy(this->unichar, uch, sizeof(this->unichar)); |
68 | 570 | this->unichar[UNICHAR_LEN] = '\0'; |
69 | 570 | } |
70 | 570 | inline void set_pos(int p) { |
71 | 570 | this->pos = p; |
72 | 570 | } |
73 | 570 | inline void set_total(int t) { |
74 | 570 | this->total = t; |
75 | 570 | } |
76 | 828k | inline const char *get_unichar() const { |
77 | 828k | return this->unichar; |
78 | 828k | } |
79 | 260k | inline int get_pos() const { |
80 | 260k | return this->pos; |
81 | 260k | } |
82 | 260k | inline int get_total() const { |
83 | 260k | return this->total; |
84 | 260k | } |
85 | | |
86 | | // Returns the string that represents a fragment |
87 | | // with the given unichar, pos and total. |
88 | | static std::string to_string(const char *unichar, int pos, int total, |
89 | | bool natural); |
90 | | // Returns the string that represents this fragment. |
91 | 0 | std::string to_string() const { |
92 | 0 | return to_string(unichar, pos, total, natural); |
93 | 0 | } |
94 | | |
95 | | // Checks whether a fragment has the same unichar, |
96 | | // position and total as the given inputs. |
97 | | inline bool equals(const char *other_unichar, int other_pos, |
98 | 0 | int other_total) const { |
99 | 0 | return (strcmp(this->unichar, other_unichar) == 0 && |
100 | 0 | this->pos == other_pos && this->total == other_total); |
101 | 0 | } |
102 | 0 | inline bool equals(const CHAR_FRAGMENT *other) const { |
103 | 0 | return this->equals(other->get_unichar(), other->get_pos(), |
104 | 0 | other->get_total()); |
105 | 0 | } |
106 | | |
107 | | // Checks whether a given fragment is a continuation of this fragment. |
108 | | // Assumes that the given fragment pointer is not nullptr. |
109 | 709k | inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { |
110 | 709k | return (strcmp(this->unichar, fragment->get_unichar()) == 0 && |
111 | 709k | this->total == fragment->get_total() && |
112 | 709k | this->pos == fragment->get_pos() + 1); |
113 | 709k | } |
114 | | |
115 | | // Returns true if this fragment is a beginning fragment. |
116 | 117k | inline bool is_beginning() const { |
117 | 117k | return this->pos == 0; |
118 | 117k | } |
119 | | |
120 | | // Returns true if this fragment is an ending fragment. |
121 | 145k | inline bool is_ending() const { |
122 | 145k | return this->pos == this->total - 1; |
123 | 145k | } |
124 | | |
125 | | // Returns true if the fragment was a separate component to begin with, |
126 | | // ie did not need chopping to be isolated, but may have been separated |
127 | | // out from a multi-outline blob. |
128 | 0 | inline bool is_natural() const { |
129 | 0 | return natural; |
130 | 0 | } |
131 | 570 | void set_natural(bool value) { |
132 | 570 | natural = value; |
133 | 570 | } |
134 | | |
135 | | // Parses the string to see whether it represents a character fragment |
136 | | // (rather than a regular character). If so, allocates memory for a new |
137 | | // CHAR_FRAGMENT instance and fills it in with the corresponding fragment |
138 | | // information. Fragments are of the form: |
139 | | // |m|1|2, meaning chunk 1 of 2 of character m, or |
140 | | // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed |
141 | | // to divide the parts, as they were already separate connected components. |
142 | | // |
143 | | // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT |
144 | | // instance, otherwise (if the string does not represent a fragment or it |
145 | | // looks like it does, but parsing it as a fragment fails) returns nullptr. |
146 | | // |
147 | | // Note: The caller is responsible for deallocating memory |
148 | | // associated with the returned pointer. |
149 | | static CHAR_FRAGMENT *parse_from_string(const char *str); |
150 | | |
151 | | private: |
152 | | char unichar[UNICHAR_LEN + 1]; |
153 | | // True if the fragment was a separate component to begin with, |
154 | | // ie did not need chopping to be isolated, but may have been separated |
155 | | // out from a multi-outline blob. |
156 | | bool natural; |
157 | | int16_t pos; // fragment position in the character |
158 | | int16_t total; // total number of fragments in the character |
159 | | }; |
160 | | |
161 | | // The UNICHARSET class is a utility class for Tesseract that holds the |
162 | | // set of characters that are used by the engine. Each character is identified |
163 | | // by a unique number, from 0 to (size - 1). |
164 | | class TESS_API UNICHARSET { |
165 | | public: |
166 | | // Custom list of characters and their ligature forms (UTF8) |
167 | | // These map to unicode values in the private use area (PUC) and are supported |
168 | | // by only few font families (eg. Wyld, Adobe Caslon Pro). |
169 | | static const char *kCustomLigatures[][2]; |
170 | | |
171 | | // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. |
172 | | static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]; |
173 | | |
// Bidirectional character classes. The names and numeric values mirror the
// ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h); the
// trailing comments give the usual Unicode bidi class abbreviation.
enum Direction {
  U_LEFT_TO_RIGHT = 0,               // L
  U_RIGHT_TO_LEFT = 1,               // R
  U_EUROPEAN_NUMBER = 2,             // EN
  U_EUROPEAN_NUMBER_SEPARATOR = 3,   // ES
  U_EUROPEAN_NUMBER_TERMINATOR = 4,  // ET
  U_ARABIC_NUMBER = 5,               // AN
  U_COMMON_NUMBER_SEPARATOR = 6,     // CS
  U_BLOCK_SEPARATOR = 7,             // B
  U_SEGMENT_SEPARATOR = 8,           // S
  U_WHITE_SPACE_NEUTRAL = 9,         // WS
  U_OTHER_NEUTRAL = 10,              // ON
  U_LEFT_TO_RIGHT_EMBEDDING = 11,    // LRE
  U_LEFT_TO_RIGHT_OVERRIDE = 12,     // LRO
  U_RIGHT_TO_LEFT_ARABIC = 13,       // AL
  U_RIGHT_TO_LEFT_EMBEDDING = 14,    // RLE
  U_RIGHT_TO_LEFT_OVERRIDE = 15,     // RLO
  U_POP_DIRECTIONAL_FORMAT = 16,     // PDF
  U_DIR_NON_SPACING_MARK = 17,       // NSM
  U_BOUNDARY_NEUTRAL = 18,           // BN
  U_FIRST_STRONG_ISOLATE = 19,       // FSI
  U_LEFT_TO_RIGHT_ISOLATE = 20,      // LRI
  U_RIGHT_TO_LEFT_ISOLATE = 21,      // RLI
  U_POP_DIRECTIONAL_ISOLATE = 22,    // PDI
#ifndef U_HIDE_DEPRECATED_API
  // One past the last class; only present when the deprecated API is visible.
  U_CHAR_DIRECTION_COUNT
#endif // U_HIDE_DEPRECATED_API
};
203 | | |
204 | | // Create an empty UNICHARSET |
205 | | UNICHARSET(); |
206 | | |
207 | | ~UNICHARSET(); |
208 | | |
209 | | // Return the UNICHAR_ID of a given unichar representation within the |
210 | | // UNICHARSET. |
211 | | UNICHAR_ID unichar_to_id(const char *const unichar_repr) const; |
212 | | |
213 | | // Return the UNICHAR_ID of a given unichar representation within the |
214 | | // UNICHARSET. Only the first length characters from unichar_repr are used. |
215 | | UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const; |
216 | | |
217 | | // Return the minimum number of bytes that matches a legal UNICHAR_ID, |
218 | | // while leaving the rest of the string encodable. Returns 0 if the |
219 | | // beginning of the string is not encodable. |
220 | | // WARNING: this function now encodes the whole string for precision. |
221 | | // Use encode_string in preference to repeatedly calling step. |
222 | | int step(const char *str) const; |
223 | | |
224 | | // Returns true if the given UTF-8 string is encodable with this UNICHARSET. |
225 | | // If not encodable, write the first byte offset which cannot be converted |
226 | | // into the second (return) argument. |
227 | | bool encodable_string(const char *str, unsigned *first_bad_position) const; |
228 | | |
229 | | // Encodes the given UTF-8 string with this UNICHARSET. |
230 | | // Any part of the string that cannot be encoded (because the utf8 can't |
231 | | // be broken up into pieces that are in the unicharset) then: |
232 | | // if give_up_on_failure, stops and returns a partial encoding, |
233 | | // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding. |
234 | | // Returns true if the encoding succeeds completely, false if there is at |
235 | | // least one failure. |
236 | | // If lengths is not nullptr, then it is filled with the corresponding |
237 | | // byte length of each encoded UNICHAR_ID. |
238 | | // If encoded_length is not nullptr then on return it contains the length of |
239 | | // str that was encoded. (if give_up_on_failure the location of the first |
240 | | // failure, otherwise strlen(str).) |
241 | | // WARNING: Caller must guarantee that str has already been cleaned of codes |
242 | | // that do not belong in the unicharset, or encoding may fail. |
243 | | // Use CleanupString to perform the cleaning. |
244 | | bool encode_string(const char *str, bool give_up_on_failure, |
245 | | std::vector<UNICHAR_ID> *encoding, |
246 | | std::vector<char> *lengths, |
247 | | unsigned *encoded_length) const; |
248 | | |
249 | | // Return the unichar representation corresponding to the given UNICHAR_ID |
250 | | // within the UNICHARSET. |
251 | | const char *id_to_unichar(UNICHAR_ID id) const; |
252 | | |
253 | | // Return the UTF8 representation corresponding to the given UNICHAR_ID after |
254 | | // resolving any private encodings internal to Tesseract. This method is |
255 | | // preferable to id_to_unichar for outputting text that will be visible to |
256 | | // external applications. |
257 | | const char *id_to_unichar_ext(UNICHAR_ID id) const; |
258 | | |
259 | | // Return a string that reformats the utf8 str into the str followed |
260 | | // by its hex unicodes. |
261 | | static std::string debug_utf8_str(const char *str); |
262 | | |
263 | | // Removes/replaces content that belongs in rendered text, but not in the |
264 | | // unicharset. |
265 | 1.31k | static std::string CleanupString(const char *utf8_str) { |
266 | 1.31k | return CleanupString(utf8_str, strlen(utf8_str)); |
267 | 1.31k | } |
268 | | static std::string CleanupString(const char *utf8_str, size_t length); |
269 | | |
270 | | // Return a string containing debug information on the unichar, including |
271 | | // the id_to_unichar, its hex unicodes and the properties. |
272 | | std::string debug_str(UNICHAR_ID id) const; |
273 | 0 | std::string debug_str(const char *unichar_repr) const { |
274 | 0 | return debug_str(unichar_to_id(unichar_repr)); |
275 | 0 | } |
276 | | |
277 | | // Adds a unichar representation to the set. If old_style is true, then |
278 | | // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL |
279 | | // characters are ignored/skipped as if they don't exist and n-grams that |
280 | | // can already be encoded are not added. |
281 | | void unichar_insert(const char *const unichar_repr, |
282 | | OldUncleanUnichars old_style); |
283 | 22 | void unichar_insert(const char *const unichar_repr) { |
284 | 22 | unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); |
285 | 22 | } |
286 | | // Adds a unichar representation to the set. Avoids setting old_style to true, |
287 | | // unless it is necessary to make the new unichar get added. |
288 | 672 | void unichar_insert_backwards_compatible(const char *const unichar_repr) { |
289 | 672 | std::string cleaned = CleanupString(unichar_repr); |
290 | 672 | if (cleaned != unichar_repr) { |
291 | 8 | unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); |
292 | 664 | } else { |
293 | 664 | auto old_size = size(); |
294 | 664 | unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); |
295 | 664 | if (size() == old_size) { |
296 | 0 | unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); |
297 | 0 | } |
298 | 664 | } |
299 | 672 | } |
300 | | |
301 | | // Return true if the given unichar id exists within the set. |
302 | | // Relies on the fact that unichar ids are contiguous in the unicharset. |
303 | 516M | bool contains_unichar_id(UNICHAR_ID unichar_id) const { |
304 | 516M | return static_cast<size_t>(unichar_id) < unichars.size(); |
305 | 516M | } |
306 | | |
307 | | // Return true if the given unichar representation exists within the set. |
308 | | bool contains_unichar(const char *const unichar_repr) const; |
309 | | bool contains_unichar(const char *const unichar_repr, int length) const; |
310 | | |
311 | | // Return true if the given unichar representation corresponds to the given |
312 | | // UNICHAR_ID within the set. |
313 | | bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const; |
314 | | |
315 | | // Delete CHAR_FRAGMENTs stored in properties of unichars array. |
316 | 14 | void delete_pointers_in_unichars() { |
317 | 244 | for (auto &unichar : unichars) { |
318 | 244 | delete unichar.properties.fragment; |
319 | 244 | unichar.properties.fragment = nullptr; |
320 | 244 | } |
321 | 14 | } |
322 | | |
323 | | // Clear the UNICHARSET (all the previous data is lost). |
324 | 14 | void clear() { |
325 | 14 | if (script_table != nullptr) { |
326 | 20 | for (int i = 0; i < script_table_size_used; ++i) { |
327 | 12 | delete[] script_table[i]; |
328 | 12 | } |
329 | 8 | delete[] script_table; |
330 | 8 | script_table = nullptr; |
331 | 8 | script_table_size_used = 0; |
332 | 8 | } |
333 | 14 | script_table_size_reserved = 0; |
334 | 14 | delete_pointers_in_unichars(); |
335 | 14 | unichars.clear(); |
336 | 14 | ids.clear(); |
337 | 14 | top_bottom_set_ = false; |
338 | 14 | script_has_upper_lower_ = false; |
339 | 14 | script_has_xheight_ = false; |
340 | 14 | old_style_included_ = false; |
341 | 14 | null_sid_ = 0; |
342 | 14 | common_sid_ = 0; |
343 | 14 | latin_sid_ = 0; |
344 | 14 | cyrillic_sid_ = 0; |
345 | 14 | greek_sid_ = 0; |
346 | 14 | han_sid_ = 0; |
347 | 14 | hiragana_sid_ = 0; |
348 | 14 | katakana_sid_ = 0; |
349 | 14 | thai_sid_ = 0; |
350 | 14 | hangul_sid_ = 0; |
351 | 14 | default_sid_ = 0; |
352 | 14 | } |
353 | | |
354 | | // Return the size of the set (the number of different UNICHAR it holds). |
355 | 652M | size_t size() const { |
356 | 652M | return unichars.size(); |
357 | 652M | } |
358 | | |
359 | | // Opens the file indicated by filename and saves unicharset to that file. |
360 | | // Returns true if the operation is successful. |
361 | 0 | bool save_to_file(const char *const filename) const { |
362 | 0 | FILE *file = fopen(filename, "w+b"); |
363 | 0 | if (file == nullptr) { |
364 | 0 | return false; |
365 | 0 | } |
366 | 0 | bool result = save_to_file(file); |
367 | 0 | fclose(file); |
368 | 0 | return result; |
369 | 0 | } |
370 | | |
371 | | // Saves the content of the UNICHARSET to the given file. |
372 | | // Returns true if the operation is successful. |
373 | 0 | bool save_to_file(FILE *file) const { |
374 | 0 | std::string str; |
375 | 0 | return save_to_string(str) && |
376 | 0 | tesseract::Serialize(file, &str[0], str.length()); |
377 | 0 | } |
378 | | |
379 | 0 | bool save_to_file(tesseract::TFile *file) const { |
380 | 0 | std::string str; |
381 | 0 | return save_to_string(str) && file->Serialize(&str[0], str.length()); |
382 | 0 | } |
383 | | |
384 | | // Saves the content of the UNICHARSET to the given string. |
385 | | // Returns true if the operation is successful. |
386 | | bool save_to_string(std::string &str) const; |
387 | | |
388 | | // Opens the file indicated by filename and loads the UNICHARSET |
389 | | // from the given file. The previous data is lost. |
390 | | // Returns true if the operation is successful. |
391 | 0 | bool load_from_file(const char *const filename, bool skip_fragments) { |
392 | 0 | FILE *file = fopen(filename, "rb"); |
393 | 0 | if (file == nullptr) { |
394 | 0 | return false; |
395 | 0 | } |
396 | 0 | bool result = load_from_file(file, skip_fragments); |
397 | 0 | fclose(file); |
398 | 0 | return result; |
399 | 0 | } |
400 | | // returns true if the operation is successful. |
401 | 0 | bool load_from_file(const char *const filename) { |
402 | 0 | return load_from_file(filename, false); |
403 | 0 | } |
404 | | |
405 | | // Loads the UNICHARSET from the given file. The previous data is lost. |
406 | | // Returns true if the operation is successful. |
407 | | bool load_from_file(FILE *file, bool skip_fragments); |
408 | 0 | bool load_from_file(FILE *file) { |
409 | 0 | return load_from_file(file, false); |
410 | 0 | } |
411 | | bool load_from_file(tesseract::TFile *file, bool skip_fragments); |
412 | | |
413 | | // Sets up internal data after loading the file, based on the char |
414 | | // properties. Called from load_from_file, but also needs to be run |
415 | | // during set_unicharset_properties. |
416 | | void post_load_setup(); |
417 | | |
418 | | // Returns true if right_to_left scripts are significant in the unicharset, |
419 | | // but without being so sensitive that "universal" unicharsets containing |
420 | | // characters from many scripts, like orientation and script detection, |
421 | | // look like they are right_to_left. |
422 | | bool major_right_to_left() const; |
423 | | |
424 | | // Set a whitelist and/or blacklist of characters to recognize. |
425 | | // An empty or nullptr whitelist enables everything (minus any blacklist). |
426 | | // An empty or nullptr blacklist disables nothing. |
427 | | // An empty or nullptr unblacklist has no effect. |
428 | | // The blacklist overrides the whitelist. |
429 | | // The unblacklist overrides the blacklist. |
430 | | // Each list is a string of utf8 character strings. Boundaries between |
431 | | // unicharset units are worked out automatically, and characters not in |
432 | | // the unicharset are silently ignored. |
433 | | void set_black_and_whitelist(const char *blacklist, const char *whitelist, |
434 | | const char *unblacklist); |
435 | | |
436 | | // Set the isalpha property of the given unichar to the given value. |
437 | 450 | void set_isalpha(UNICHAR_ID unichar_id, bool value) { |
438 | 450 | unichars[unichar_id].properties.isalpha = value; |
439 | 450 | } |
440 | | |
441 | | // Set the islower property of the given unichar to the given value. |
442 | 450 | void set_islower(UNICHAR_ID unichar_id, bool value) { |
443 | 450 | unichars[unichar_id].properties.islower = value; |
444 | 450 | } |
445 | | |
446 | | // Set the isupper property of the given unichar to the given value. |
447 | 450 | void set_isupper(UNICHAR_ID unichar_id, bool value) { |
448 | 450 | unichars[unichar_id].properties.isupper = value; |
449 | 450 | } |
450 | | |
451 | | // Set the isdigit property of the given unichar to the given value. |
452 | 450 | void set_isdigit(UNICHAR_ID unichar_id, bool value) { |
453 | 450 | unichars[unichar_id].properties.isdigit = value; |
454 | 450 | } |
455 | | |
456 | | // Set the ispunctuation property of the given unichar to the given value. |
457 | 450 | void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { |
458 | 450 | unichars[unichar_id].properties.ispunctuation = value; |
459 | 450 | } |
460 | | |
461 | | // Set the isngram property of the given unichar to the given value. |
462 | 38.3k | void set_isngram(UNICHAR_ID unichar_id, bool value) { |
463 | 38.3k | unichars[unichar_id].properties.isngram = value; |
464 | 38.3k | } |
465 | | |
466 | | // Set the script name of the given unichar to the given value. |
467 | | // Value is copied and thus can be a temporary; |
468 | 1.87k | void set_script(UNICHAR_ID unichar_id, const char *value) { |
469 | 1.87k | unichars[unichar_id].properties.script_id = add_script(value); |
470 | 1.87k | } |
471 | | |
472 | | // Set other_case unichar id in the properties for the given unichar id. |
473 | 450 | void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { |
474 | 450 | unichars[unichar_id].properties.other_case = other_case; |
475 | 450 | } |
476 | | |
477 | | // Set the direction property of the given unichar to the given value. |
478 | 450 | void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) { |
479 | 450 | unichars[unichar_id].properties.direction = value; |
480 | 450 | } |
481 | | |
482 | | // Set mirror unichar id in the properties for the given unichar id. |
483 | 450 | void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) { |
484 | 450 | unichars[unichar_id].properties.mirror = mirror; |
485 | 450 | } |
486 | | |
487 | | // Record normalized version of unichar with the given unichar_id. |
488 | 450 | void set_normed(UNICHAR_ID unichar_id, const char *normed) { |
489 | 450 | unichars[unichar_id].properties.normed = normed; |
490 | 450 | unichars[unichar_id].properties.normed_ids.clear(); |
491 | 450 | } |
492 | | // Sets the normed_ids vector from the normed string. normed_ids is not |
493 | | // stored in the file, and needs to be set when the UNICHARSET is loaded. |
494 | | void set_normed_ids(UNICHAR_ID unichar_id); |
495 | | |
496 | | // Return the isalpha property of the given unichar. |
497 | 86.5M | bool get_isalpha(UNICHAR_ID unichar_id) const { |
498 | 86.5M | if (INVALID_UNICHAR_ID == unichar_id) { |
499 | 0 | return false; |
500 | 0 | } |
501 | 86.5M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
502 | 86.5M | return unichars[unichar_id].properties.isalpha; |
503 | 86.5M | } |
504 | | |
505 | | // Return the islower property of the given unichar. |
506 | 48.0M | bool get_islower(UNICHAR_ID unichar_id) const { |
507 | 48.0M | if (INVALID_UNICHAR_ID == unichar_id) { |
508 | 0 | return false; |
509 | 0 | } |
510 | 48.0M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
511 | 48.0M | return unichars[unichar_id].properties.islower; |
512 | 48.0M | } |
513 | | |
514 | | // Return the isupper property of the given unichar. |
515 | 27.7M | bool get_isupper(UNICHAR_ID unichar_id) const { |
516 | 27.7M | if (INVALID_UNICHAR_ID == unichar_id) { |
517 | 0 | return false; |
518 | 0 | } |
519 | 27.7M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
520 | 27.7M | return unichars[unichar_id].properties.isupper; |
521 | 27.7M | } |
522 | | |
523 | | // Return the isdigit property of the given unichar. |
524 | 71.4M | bool get_isdigit(UNICHAR_ID unichar_id) const { |
525 | 71.4M | if (INVALID_UNICHAR_ID == unichar_id) { |
526 | 0 | return false; |
527 | 0 | } |
528 | 71.4M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
529 | 71.4M | return unichars[unichar_id].properties.isdigit; |
530 | 71.4M | } |
531 | | |
532 | | // Return the ispunctuation property of the given unichar. |
533 | 24.8M | bool get_ispunctuation(UNICHAR_ID unichar_id) const { |
534 | 24.8M | if (INVALID_UNICHAR_ID == unichar_id) { |
535 | 0 | return false; |
536 | 0 | } |
537 | 24.8M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
538 | 24.8M | return unichars[unichar_id].properties.ispunctuation; |
539 | 24.8M | } |
540 | | |
541 | | // Return the isngram property of the given unichar. |
542 | 2.60M | bool get_isngram(UNICHAR_ID unichar_id) const { |
543 | 2.60M | if (INVALID_UNICHAR_ID == unichar_id) { |
544 | 0 | return false; |
545 | 0 | } |
546 | 2.60M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
547 | 2.60M | return unichars[unichar_id].properties.isngram; |
548 | 2.60M | } |
549 | | |
550 | | // Returns whether the unichar id represents a unicode value in the private |
551 | | // use area. |
552 | | bool get_isprivate(UNICHAR_ID unichar_id) const; |
553 | | |
554 | | // Returns true if the ids have useful min/max top/bottom values. |
555 | 3.64M | bool top_bottom_useful() const { |
556 | 3.64M | return top_bottom_set_; |
557 | 3.64M | } |
558 | | // Sets all ranges to empty, so they can be expanded to set the values. |
559 | | void set_ranges_empty(); |
560 | | // Sets all the properties for this unicharset given a src_unicharset with |
561 | | // everything set. The unicharsets don't have to be the same, and graphemes |
562 | | // are correctly accounted for. |
563 | 0 | void SetPropertiesFromOther(const UNICHARSET &src) { |
564 | 0 | PartialSetPropertiesFromOther(0, src); |
565 | 0 | } |
566 | | // Sets properties from Other, starting only at the given index. |
567 | | void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src); |
568 | | // Expands the tops and bottoms and widths for this unicharset given a |
569 | | // src_unicharset with ranges in it. The unicharsets don't have to be the |
570 | | // same, and graphemes are correctly accounted for. |
571 | | void ExpandRangesFromOther(const UNICHARSET &src); |
572 | | // Makes this a copy of src. Clears this completely first, so the automatic |
573 | | // ids will not be present in this if not in src. |
574 | | void CopyFrom(const UNICHARSET &src); |
575 | | // For each id in src, if it does not occur in this, add it, as in |
576 | | // SetPropertiesFromOther, otherwise expand the ranges, as in |
577 | | // ExpandRangesFromOther. |
578 | | void AppendOtherUnicharset(const UNICHARSET &src); |
579 | | // Returns true if the acceptable ranges of the tops of the characters do |
580 | | // not overlap, making their x-height calculations distinct. |
581 | | bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const; |
582 | | // Returns the min and max bottom and top of the given unichar in |
583 | | // baseline-normalized coordinates, ie, where the baseline is |
584 | | // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight |
585 | | // (See normalis.h for the definitions). |
586 | | void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, |
587 | 6.93M | int *min_top, int *max_top) const { |
588 | 6.93M | if (INVALID_UNICHAR_ID == unichar_id) { |
589 | 0 | *min_bottom = *min_top = 0; |
590 | 0 | *max_bottom = *max_top = 256; // kBlnCellHeight |
591 | 0 | return; |
592 | 0 | } |
593 | 6.93M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
594 | 6.93M | *min_bottom = unichars[unichar_id].properties.min_bottom; |
595 | 6.93M | *max_bottom = unichars[unichar_id].properties.max_bottom; |
596 | 6.93M | *min_top = unichars[unichar_id].properties.min_top; |
597 | 6.93M | *max_top = unichars[unichar_id].properties.max_top; |
598 | 6.93M | } |
599 | | void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, |
600 | 450 | int min_top, int max_top) { |
601 | 450 | unichars[unichar_id].properties.min_bottom = |
602 | 450 | ClipToRange<int>(min_bottom, 0, UINT8_MAX); |
603 | 450 | unichars[unichar_id].properties.max_bottom = |
604 | 450 | ClipToRange<int>(max_bottom, 0, UINT8_MAX); |
605 | 450 | unichars[unichar_id].properties.min_top = |
606 | 450 | ClipToRange<int>(min_top, 0, UINT8_MAX); |
607 | 450 | unichars[unichar_id].properties.max_top = |
608 | 450 | ClipToRange<int>(max_top, 0, UINT8_MAX); |
609 | 450 | } |
610 | | // Returns the width stats (as mean, sd) of the given unichar relative to the |
611 | | // median advance of all characters in the character set. |
612 | | void get_width_stats(UNICHAR_ID unichar_id, float *width, |
613 | 0 | float *width_sd) const { |
614 | 0 | if (INVALID_UNICHAR_ID == unichar_id) { |
615 | 0 | *width = 0.0f; |
616 | 0 | *width_sd = 0.0f; |
617 | 0 | return; |
618 | 0 | } |
619 | 0 | ASSERT_HOST(contains_unichar_id(unichar_id)); |
620 | 0 | *width = unichars[unichar_id].properties.width; |
621 | 0 | *width_sd = unichars[unichar_id].properties.width_sd; |
622 | 0 | } |
623 | 450 | void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) { |
624 | 450 | unichars[unichar_id].properties.width = width; |
625 | 450 | unichars[unichar_id].properties.width_sd = width_sd; |
626 | 450 | } |
627 | | // Returns the stats of the x-bearing (as mean, sd) of the given unichar |
628 | | // relative to the median advance of all characters in the character set. |
629 | | void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, |
630 | 0 | float *bearing_sd) const { |
631 | 0 | if (INVALID_UNICHAR_ID == unichar_id) { |
632 | 0 | *bearing = *bearing_sd = 0.0f; |
633 | 0 | return; |
634 | 0 | } |
635 | 0 | ASSERT_HOST(contains_unichar_id(unichar_id)); |
636 | 0 | *bearing = unichars[unichar_id].properties.bearing; |
637 | 0 | *bearing_sd = unichars[unichar_id].properties.bearing_sd; |
638 | 0 | } |
639 | | void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, |
640 | 450 | float bearing_sd) { |
641 | 450 | unichars[unichar_id].properties.bearing = bearing; |
642 | 450 | unichars[unichar_id].properties.bearing_sd = bearing_sd; |
643 | 450 | } |
644 | | // Returns the stats of the x-advance of the given unichar (as mean, sd) |
645 | | // relative to the median advance of all characters in the character set. |
646 | | void get_advance_stats(UNICHAR_ID unichar_id, float *advance, |
647 | 0 | float *advance_sd) const { |
648 | 0 | if (INVALID_UNICHAR_ID == unichar_id) { |
649 | 0 | *advance = *advance_sd = 0; |
650 | 0 | return; |
651 | 0 | } |
652 | 0 | ASSERT_HOST(contains_unichar_id(unichar_id)); |
653 | 0 | *advance = unichars[unichar_id].properties.advance; |
654 | 0 | *advance_sd = unichars[unichar_id].properties.advance_sd; |
655 | 0 | } |
656 | | void set_advance_stats(UNICHAR_ID unichar_id, float advance, |
657 | 450 | float advance_sd) { |
658 | 450 | unichars[unichar_id].properties.advance = advance; |
659 | 450 | unichars[unichar_id].properties.advance_sd = advance_sd; |
660 | 450 | } |
661 | | // Returns true if the font metrics properties are empty. |
662 | 0 | bool PropertiesIncomplete(UNICHAR_ID unichar_id) const { |
663 | 0 | return unichars[unichar_id].properties.AnyRangeEmpty(); |
664 | 0 | } |
665 | | |
666 | | // Returns true if the script of the given id is space delimited. |
667 | | // Returns false for Han and Thai scripts. |
668 | 3.76M | bool IsSpaceDelimited(UNICHAR_ID unichar_id) const { |
669 | 3.76M | if (INVALID_UNICHAR_ID == unichar_id) { |
670 | 781k | return true; |
671 | 781k | } |
672 | 2.98M | int script_id = get_script(unichar_id); |
673 | 2.98M | return script_id != han_sid_ && script_id != thai_sid_ && |
674 | 2.98M | script_id != hangul_sid_ && script_id != hiragana_sid_ && |
675 | 2.98M | script_id != katakana_sid_; |
676 | 3.76M | } |
677 | | |
678 | | // Return the script name of the given unichar. |
679 | | // The returned pointer will always be the same for the same script, it's |
680 | | // managed by unicharset and thus MUST NOT be deleted |
681 | 15.4M | int get_script(UNICHAR_ID unichar_id) const { |
682 | 15.4M | if (INVALID_UNICHAR_ID == unichar_id) { |
683 | 0 | return null_sid_; |
684 | 0 | } |
685 | 15.4M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
686 | 15.4M | return unichars[unichar_id].properties.script_id; |
687 | 15.4M | } |
688 | | |
689 | | // Returns the character properties (alpha/upper/lower/digit/punct) of the
690 | | // given unichar id, packed into a bit field held in an unsigned int.
691 | | unsigned int get_properties(UNICHAR_ID unichar_id) const;
692 | | |
693 | | // Returns the main character property of the given unichar id as a single
694 | | // char. If a character has multiple attributes, the main property is chosen by the following order:
695 | | // upper_case : 'A'
696 | | // lower_case : 'a'
697 | | // alpha : 'x'
698 | | // digit : '0'
699 | | // punctuation: 'p'
700 | | char get_chartype(UNICHAR_ID unichar_id) const;
701 | | |
702 | | // Get other_case unichar id in the properties for the given unichar id. |
703 | 18.7M | UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { |
704 | 18.7M | if (INVALID_UNICHAR_ID == unichar_id) { |
705 | 0 | return INVALID_UNICHAR_ID; |
706 | 0 | } |
707 | 18.7M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
708 | 18.7M | return unichars[unichar_id].properties.other_case; |
709 | 18.7M | } |
710 | | |
711 | | // Returns the direction property of the given unichar. |
712 | 4.08M | Direction get_direction(UNICHAR_ID unichar_id) const { |
713 | 4.08M | if (INVALID_UNICHAR_ID == unichar_id) { |
714 | 0 | return UNICHARSET::U_OTHER_NEUTRAL; |
715 | 0 | } |
716 | 4.08M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
717 | 4.08M | return unichars[unichar_id].properties.direction; |
718 | 4.08M | } |
719 | | |
720 | | // Get mirror unichar id in the properties for the given unichar id. |
721 | 230k | UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const { |
722 | 230k | if (INVALID_UNICHAR_ID == unichar_id) { |
723 | 0 | return INVALID_UNICHAR_ID; |
724 | 0 | } |
725 | 230k | ASSERT_HOST(contains_unichar_id(unichar_id)); |
726 | 230k | return unichars[unichar_id].properties.mirror; |
727 | 230k | } |
728 | | |
729 | | // Returns UNICHAR_ID of the corresponding lower-case unichar. |
730 | 88 | UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { |
731 | 88 | if (INVALID_UNICHAR_ID == unichar_id) { |
732 | 0 | return INVALID_UNICHAR_ID; |
733 | 0 | } |
734 | 88 | ASSERT_HOST(contains_unichar_id(unichar_id)); |
735 | 88 | if (unichars[unichar_id].properties.islower) { |
736 | 84 | return unichar_id; |
737 | 84 | } |
738 | 4 | return unichars[unichar_id].properties.other_case; |
739 | 88 | } |
740 | | |
741 | | // Returns UNICHAR_ID of the corresponding upper-case unichar. |
742 | 0 | UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { |
743 | 0 | if (INVALID_UNICHAR_ID == unichar_id) { |
744 | 0 | return INVALID_UNICHAR_ID; |
745 | 0 | } |
746 | 0 | ASSERT_HOST(contains_unichar_id(unichar_id)); |
747 | 0 | if (unichars[unichar_id].properties.isupper) { |
748 | 0 | return unichar_id; |
749 | 0 | } |
750 | 0 | return unichars[unichar_id].properties.other_case; |
751 | 0 | } |
752 | | |
753 | | // Returns true if this UNICHARSET has the special codes in |
754 | | // SpecialUnicharCodes available. If false then there are normal unichars |
755 | | // at these codes and they should not be used. |
756 | 0 | bool has_special_codes() const { |
757 | 0 | return get_fragment(UNICHAR_BROKEN) != nullptr && |
758 | 0 | strcmp(id_to_unichar(UNICHAR_BROKEN), |
759 | 0 | kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0; |
760 | 0 | } |
761 | | |
762 | | // Returns true if the normalized text of any unichar-id in the unicharset
763 | | // contains any repeated unicodes.
764 | | bool AnyRepeatedUnicodes() const;
765 | | |
766 | | // Return a pointer to the CHAR_FRAGMENT class if the given |
767 | | // unichar id represents a character fragment. |
768 | 34.5M | const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { |
769 | 34.5M | if (INVALID_UNICHAR_ID == unichar_id) { |
770 | 0 | return nullptr; |
771 | 0 | } |
772 | 34.5M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
773 | 34.5M | return unichars[unichar_id].properties.fragment; |
774 | 34.5M | } |
775 | | |
776 | | // Return the isalpha property of the given unichar representation. |
777 | 0 | bool get_isalpha(const char *const unichar_repr) const { |
778 | 0 | return get_isalpha(unichar_to_id(unichar_repr)); |
779 | 0 | } |
780 | | |
781 | | // Return the islower property of the given unichar representation. |
782 | 0 | bool get_islower(const char *const unichar_repr) const { |
783 | 0 | return get_islower(unichar_to_id(unichar_repr)); |
784 | 0 | } |
785 | | |
786 | | // Return the isupper property of the given unichar representation. |
787 | 0 | bool get_isupper(const char *const unichar_repr) const { |
788 | 0 | return get_isupper(unichar_to_id(unichar_repr)); |
789 | 0 | } |
790 | | |
791 | | // Return the isdigit property of the given unichar representation. |
792 | 0 | bool get_isdigit(const char *const unichar_repr) const { |
793 | 0 | return get_isdigit(unichar_to_id(unichar_repr)); |
794 | 0 | } |
795 | | |
796 | | // Return the ispunctuation property of the given unichar representation. |
797 | 0 | bool get_ispunctuation(const char *const unichar_repr) const { |
798 | 0 | return get_ispunctuation(unichar_to_id(unichar_repr)); |
799 | 0 | } |
800 | | |
801 | | // Return the character properties, eg. alpha/upper/lower/digit/punct, |
802 | | // of the given unichar representation |
803 | 0 | unsigned int get_properties(const char *const unichar_repr) const { |
804 | 0 | return get_properties(unichar_to_id(unichar_repr)); |
805 | 0 | } |
806 | | |
807 | 0 | char get_chartype(const char *const unichar_repr) const { |
808 | 0 | return get_chartype(unichar_to_id(unichar_repr)); |
809 | 0 | } |
810 | | |
811 | | // Return the script name of the given unichar representation. |
812 | | // The returned pointer will always be the same for the same script, it's |
813 | | // managed by unicharset and thus MUST NOT be deleted |
814 | 558 | int get_script(const char *const unichar_repr) const { |
815 | 558 | return get_script(unichar_to_id(unichar_repr)); |
816 | 558 | } |
817 | | |
818 | | // Return a pointer to the CHAR_FRAGMENT class struct if the given |
819 | | // unichar representation represents a character fragment. |
820 | 0 | const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const { |
821 | 0 | if (unichar_repr == nullptr || unichar_repr[0] == '\0' || |
822 | 0 | !ids.contains(unichar_repr, false)) { |
823 | 0 | return nullptr; |
824 | 0 | } |
825 | 0 | return get_fragment(unichar_to_id(unichar_repr)); |
826 | 0 | } |
827 | | |
828 | | // Return the isalpha property of the given unichar representation. |
829 | | // Only the first length characters from unichar_repr are used. |
830 | 17.8k | bool get_isalpha(const char *const unichar_repr, int length) const { |
831 | 17.8k | return get_isalpha(unichar_to_id(unichar_repr, length)); |
832 | 17.8k | } |
833 | | |
834 | | // Return the islower property of the given unichar representation. |
835 | | // Only the first length characters from unichar_repr are used. |
836 | 0 | bool get_islower(const char *const unichar_repr, int length) const { |
837 | 0 | return get_islower(unichar_to_id(unichar_repr, length)); |
838 | 0 | } |
839 | | |
840 | | // Return the isupper property of the given unichar representation. |
841 | | // Only the first length characters from unichar_repr are used. |
842 | 0 | bool get_isupper(const char *const unichar_repr, int length) const { |
843 | 0 | return get_isupper(unichar_to_id(unichar_repr, length)); |
844 | 0 | } |
845 | | |
846 | | // Return the isdigit property of the given unichar representation. |
847 | | // Only the first length characters from unichar_repr are used. |
848 | 9 | bool get_isdigit(const char *const unichar_repr, int length) const { |
849 | 9 | return get_isdigit(unichar_to_id(unichar_repr, length)); |
850 | 9 | } |
851 | | |
852 | | // Return the ispunctuation property of the given unichar representation. |
853 | | // Only the first length characters from unichar_repr are used. |
854 | 0 | bool get_ispunctuation(const char *const unichar_repr, int length) const { |
855 | 0 | return get_ispunctuation(unichar_to_id(unichar_repr, length)); |
856 | 0 | } |
857 | | |
858 | | // Returns normalized version of unichar with the given unichar_id. |
859 | 0 | const char *get_normed_unichar(UNICHAR_ID unichar_id) const { |
860 | 0 | if (unichar_id == UNICHAR_SPACE) { |
861 | 0 | return " "; |
862 | 0 | } |
863 | 0 | return unichars[unichar_id].properties.normed.c_str(); |
864 | 0 | } |
865 | | // Returns a vector of UNICHAR_IDs that represent the ids of the normalized |
866 | | // version of the given id. There may be more than one UNICHAR_ID in the |
867 | | // vector if unichar_id represents a ligature. |
868 | 9.88M | const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const { |
869 | 9.88M | return unichars[unichar_id].properties.normed_ids; |
870 | 9.88M | } |
871 | | |
872 | | // Return the script name of the given unichar representation. |
873 | | // Only the first length characters from unichar_repr are used. |
874 | | // The returned pointer will always be the same for the same script, it's |
875 | | // managed by unicharset and thus MUST NOT be deleted |
876 | 0 | int get_script(const char *const unichar_repr, int length) const { |
877 | 0 | return get_script(unichar_to_id(unichar_repr, length)); |
878 | 0 | } |
879 | | |
880 | | // Return the (current) number of scripts in the script table |
881 | 0 | int get_script_table_size() const { |
882 | 0 | return script_table_size_used; |
883 | 0 | } |
884 | | |
885 | | // Return the script string from its id |
886 | 226 | const char *get_script_from_script_id(int id) const { |
887 | 226 | if (id >= script_table_size_used || id < 0) { |
888 | 0 | return null_script; |
889 | 0 | } |
890 | 226 | return script_table[id]; |
891 | 226 | } |
892 | | |
893 | | // Returns the id of the script with the given name, or 0 if the script is
894 | | // not found. Note that this is an expensive operation, since it involves
895 | | // iteratively comparing strings in the script table (no hash is used).
896 | | // Instead of calling this repeatedly, the calling function can use it to
897 | | // look up and save the ID for relevant scripts for fast comparisons later.
898 | | int get_script_id_from_name(const char *script_name) const;
899 | | |
900 | | // Return true if the given script is the null script |
901 | 0 | bool is_null_script(const char *script) const { |
902 | 0 | return script == null_script; |
903 | 0 | } |
904 | | |
905 | | // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
906 | | // then the returned value will be the same. NOTE(review): the return type is int, presumably the script id, although this comment used to say "pointer" - confirm against the definition.
907 | | // The script parameter is copied and thus can be a temporary.
908 | | int add_script(const char *script);
909 | | |
910 | | // Return the enabled property of the given unichar. |
911 | 163M | bool get_enabled(UNICHAR_ID unichar_id) const { |
912 | 163M | ASSERT_HOST(contains_unichar_id(unichar_id)); |
913 | 163M | return unichars[unichar_id].properties.enabled; |
914 | 163M | } |
915 | | |
916 | 9.12M | int null_sid() const { |
917 | 9.12M | return null_sid_; |
918 | 9.12M | } |
919 | 15.9M | int common_sid() const { |
920 | 15.9M | return common_sid_; |
921 | 15.9M | } |
922 | 138k | int latin_sid() const { |
923 | 138k | return latin_sid_; |
924 | 138k | } |
925 | 0 | int cyrillic_sid() const { |
926 | 0 | return cyrillic_sid_; |
927 | 0 | } |
928 | 0 | int greek_sid() const { |
929 | 0 | return greek_sid_; |
930 | 0 | } |
931 | 9.12M | int han_sid() const { |
932 | 9.12M | return han_sid_; |
933 | 9.12M | } |
934 | 0 | int hiragana_sid() const { |
935 | 0 | return hiragana_sid_; |
936 | 0 | } |
937 | 1 | int katakana_sid() const { |
938 | 1 | return katakana_sid_; |
939 | 1 | } |
940 | 1 | int thai_sid() const { |
941 | 1 | return thai_sid_; |
942 | 1 | } |
943 | 0 | int hangul_sid() const { |
944 | 0 | return hangul_sid_; |
945 | 0 | } |
946 | 138k | int default_sid() const { |
947 | 138k | return default_sid_; |
948 | 138k | } |
949 | | |
950 | | // Returns true if the unicharset has the concept of upper/lower case. |
951 | 3.62M | bool script_has_upper_lower() const { |
952 | 3.62M | return script_has_upper_lower_; |
953 | 3.62M | } |
954 | | // Returns true if the unicharset has the concept of x-height. |
955 | | // script_has_xheight can be true even if script_has_upper_lower is not, |
956 | | // when the script has a sufficiently predominant top line with ascenders, |
957 | | // such as Devanagari and Thai. |
958 | 152k | bool script_has_xheight() const { |
959 | 152k | return script_has_xheight_; |
960 | 152k | } |
961 | | |
962 | | private: |
963 | | struct TESS_API UNICHAR_PROPERTIES {
964 | | UNICHAR_PROPERTIES();
965 | | // Initializes all properties to sensible default values.
966 | | void Init();
967 | | // Sets all ranges wide open. This is the initialization default in case
968 | | // there are no useful values available.
969 | | void SetRangesOpen();
970 | | // Sets all ranges to empty. Used before expanding with font-based data.
971 | | void SetRangesEmpty();
972 | | // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
973 | | // is empty.
974 | | bool AnyRangeEmpty() const;
975 | | // Expands the ranges with the ranges from the src properties.
976 | | void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);
977 | | // Copies the properties from src into this.
978 | | void CopyFrom(const UNICHAR_PROPERTIES &src);
979 | |
980 | | bool isalpha;
981 | | bool islower; // When set, to_lower() returns the unichar id unchanged.
982 | | bool isupper; // When set, to_upper() returns the unichar id unchanged.
983 | | bool isdigit;
984 | | bool ispunctuation;
985 | | bool isngram;
986 | | bool enabled; // Value returned by get_enabled().
987 | | // Possible limits of the top and bottom of the bounding box in
988 | | // baseline-normalized coordinates, ie, where the baseline is
989 | | // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
990 | | // (See normalis.h for the definitions).
991 | | uint8_t min_bottom;
992 | | uint8_t max_bottom;
993 | | uint8_t min_top;
994 | | uint8_t max_top;
995 | | // Statistics of the widths of bounding box, relative to the median advance.
996 | | float width;
997 | | float width_sd;
998 | | // Stats of the x-bearing and advance, also relative to the median advance.
999 | | float bearing;
1000 | | float bearing_sd;
1001 | | float advance;
1002 | | float advance_sd;
1003 | | int script_id; // Value returned by get_script().
1004 | | UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
1005 | | Direction direction; // direction of this unichar
1006 | | // Mirror property is useful for reverse DAWG lookup for words in
1007 | | // right-to-left languages, e.g. "(word)" would be
1008 | | // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
1009 | | // However, what we want in our DAWG is
1010 | | // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
1011 | | // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
1012 | | UNICHAR_ID mirror;
1013 | | // The unichar_ids that represent the corresponding normed string.
1014 | | // For awkward characters like em-dash, this gives hyphen.
1015 | | // For ligatures, this gives the ids of the normal unichars.
1016 | | std::vector<UNICHAR_ID> normed_ids;
1017 | | std::string normed; // normalized version of this unichar
1018 | | // Contains meta information about the fragment if a unichar represents
1019 | | // a fragment of a character, otherwise should be set to nullptr.
1020 | | // It is assumed that character fragments are added to the unicharset
1021 | | // after the corresponding 'base' characters.
1022 | | CHAR_FRAGMENT *fragment;
1023 | | };
1024 | | |
1025 | | struct UNICHAR_SLOT { // Element type of the unichars vector, which is indexed by unichar id.
1026 | | char representation[UNICHAR_LEN + 1]; // NOTE(review): presumably the NUL-terminated string form of the unichar (hence the + 1) - confirm.
1027 | | UNICHAR_PROPERTIES properties; // Per-unichar properties read and written by the accessors of this class.
1028 | | };
1029 | | |
1030 | | // Internal recursive version of encode_string above.
1031 | | // str is the start of the whole string.
1032 | | // str_index is the current position in str.
1033 | | // str_length is the length of str.
1034 | | // encoding is a working encoding of str.
1035 | | // lengths is a working set of the lengths of each element of encoding.
1036 | | // best_total_length is the greatest length of str that has been
1037 | | // successfully encoded so far.
1038 | | // On return:
1039 | | // best_encoding contains the encoding that used the longest part of str.
1040 | | // best_lengths (may be null) contains the lengths of best_encoding.
1041 | | void encode_string(const char *str, int str_index, int str_length,
1042 | | std::vector<UNICHAR_ID> *encoding,
1043 | | std::vector<char> *lengths, unsigned *best_total_length,
1044 | | std::vector<UNICHAR_ID> *best_encoding,
1045 | | std::vector<char> *best_lengths) const;
1046 | | |
1047 | | // Gets the properties for the grapheme string utf8_str into props, combining
1048 | | // the properties of multiple characters in a meaningful way where possible.
1049 | | // Returns false if no valid match was found in the unicharset.
1050 | | // NOTE that script_id, mirror, and other_case refer to this unicharset on
1051 | | // return and will need redirecting if the target unicharset is different.
1052 | | bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;
1053 | | |
1054 | | // Load ourselves from a "file" where our only interface to the file is
1055 | | // fgets_cb, an implementation of fgets() taking (char *, int). This is the parsing primitive accessed by
1056 | | // the public routines load_from_file(). NOTE(review): skip_fragments presumably makes the loader leave out fragment entries - confirm against the definition.
1057 | | bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,
1058 | | bool skip_fragments);
1059 | | |
1060 | | // List of mappings to make when ingesting strings from the outside.
1061 | | // The substitutions clean up text that should exist for rendering of
1062 | | // synthetic data, but not in the recognition set.
1063 | | static const char *kCleanupMaps[][2];
1064 | | static const char *null_script; // Returned by get_script_from_script_id() for out-of-range ids; is_null_script() compares against it by address.
1065 | | |
1066 | | std::vector<UNICHAR_SLOT> unichars; // One slot per unichar, indexed by unichar id.
1067 | | UNICHARMAP ids; // Queried with unichar string representations (see ids.contains() in get_fragment).
1068 | | char **script_table; // Script strings, indexed by script id.
1069 | | int script_table_size_used; // Number of scripts in script_table; ids at or above it are out of range.
1070 | | int script_table_size_reserved; // NOTE(review): presumably the allocated capacity of script_table - confirm.
1071 | | // True if the unichars have their tops/bottoms set.
1072 | | bool top_bottom_set_;
1073 | | // True if the unicharset has significant upper/lower case chars.
1074 | | bool script_has_upper_lower_;
1075 | | // True if the unicharset has a significant mean-line with significant
1076 | | // ascenders above that.
1077 | | bool script_has_xheight_;
1078 | | // True if the set contains chars that would be changed by the cleanup.
1079 | | bool old_style_included_;
1080 | |
1081 | | // A few convenient script name-to-id mappings, kept without using a hash.
1082 | | // These are initialized when the unicharset file is loaded. Anything
1083 | | // missing from this list can be looked up using get_script_id_from_name.
1084 | | int null_sid_;
1085 | | int common_sid_;
1086 | | int latin_sid_;
1087 | | int cyrillic_sid_;
1088 | | int greek_sid_;
1089 | | int han_sid_;
1090 | | int hiragana_sid_;
1091 | | int katakana_sid_;
1092 | | int thai_sid_;
1093 | | int hangul_sid_;
1094 | | // The most frequently occurring script in the charset.
1095 | | int default_sid_;
1096 | | }; |
1097 | | |
1098 | | } // namespace tesseract |
1099 | | |
1100 | | #endif // TESSERACT_CCUTIL_UNICHARSET_H_ |