/src/tesseract/src/ccutil/unicharset.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | ///////////////////////////////////////////////////////////////////////  | 
2  |  | // File:        unicharset.h  | 
3  |  | // Description: Unicode character/ligature set class.  | 
4  |  | // Author:      Thomas Kielbus  | 
5  |  | //  | 
6  |  | // (C) Copyright 2006, Google Inc.  | 
7  |  | // Licensed under the Apache License, Version 2.0 (the "License");  | 
8  |  | // you may not use this file except in compliance with the License.  | 
9  |  | // You may obtain a copy of the License at  | 
10  |  | // http://www.apache.org/licenses/LICENSE-2.0  | 
11  |  | // Unless required by applicable law or agreed to in writing, software  | 
12  |  | // distributed under the License is distributed on an "AS IS" BASIS,  | 
13  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
14  |  | // See the License for the specific language governing permissions and  | 
15  |  | // limitations under the License.  | 
16  |  | //  | 
17  |  | ///////////////////////////////////////////////////////////////////////  | 
18  |  |  | 
19  |  | #ifndef TESSERACT_CCUTIL_UNICHARSET_H_  | 
20  |  | #define TESSERACT_CCUTIL_UNICHARSET_H_  | 
21  |  |  | 
22  |  | #include "errcode.h"  | 
23  |  | #include "unicharmap.h"  | 
24  |  |  | 
25  |  | #include <tesseract/unichar.h>  | 
26  |  | #include "helpers.h"  | 
27  |  | #include "serialis.h"  | 
28  |  |  | 
29  |  | #include <functional> // for std::function  | 
30  |  |  | 
31  |  | namespace tesseract { | 
32  |  |  | 
33  |  | // Enum holding special values of unichar_id. Every unicharset has these.  | 
34  |  | // Warning! Keep in sync with kSpecialUnicharCodes.  | 
enum SpecialUnicharCodes {
  // The numeric values are spelled out because they index
  // kSpecialUnicharCodes; the two lists must stay in the same order.
  UNICHAR_SPACE = 0,
  UNICHAR_JOINED = 1,
  UNICHAR_BROKEN = 2,

  // Number of special codes above; not a code itself.
  SPECIAL_UNICHAR_CODES_COUNT = 3
};
42  |  |  | 
43  |  | // Boolean flag for unichar_insert. It's a bit of a double negative to allow  | 
44  |  | // the default value to be false.  | 
enum class OldUncleanUnichars {
  // Zero-valued enumerator, so a value-initialized flag means "not old style".
  kFalse = 0,
  // Request the old (unclean) insertion behaviour.
  kTrue = 1,
};
49  |  |  | 
50  |  | class TESS_API CHAR_FRAGMENT { | 
51  |  | public:  | 
52  |  |   // Minimum number of characters used for fragment representation.  | 
53  |  |   static const int kMinLen = 6;  | 
54  |  |   // Maximum number of characters used for fragment representation.  | 
55  |  |   static const int kMaxLen = 3 + UNICHAR_LEN + 2;  | 
56  |  |   // Maximum number of fragments per character.  | 
57  |  |   static const int kMaxChunks = 5;  | 
58  |  |  | 
59  |  |   // Setters and Getters.  | 
60  | 1.14k  |   inline void set_all(const char *unichar, int pos, int total, bool natural) { | 
61  | 1.14k  |     set_unichar(unichar);  | 
62  | 1.14k  |     set_pos(pos);  | 
63  | 1.14k  |     set_total(total);  | 
64  | 1.14k  |     set_natural(natural);  | 
65  | 1.14k  |   }  | 
66  | 1.14k  |   inline void set_unichar(const char *uch) { | 
67  | 1.14k  |     strncpy(this->unichar, uch, sizeof(this->unichar));  | 
68  | 1.14k  |     this->unichar[UNICHAR_LEN] = '\0';  | 
69  | 1.14k  |   }  | 
70  | 1.14k  |   inline void set_pos(int p) { | 
71  | 1.14k  |     this->pos = p;  | 
72  | 1.14k  |   }  | 
73  | 1.14k  |   inline void set_total(int t) { | 
74  | 1.14k  |     this->total = t;  | 
75  | 1.14k  |   }  | 
76  | 2.80M  |   inline const char *get_unichar() const { | 
77  | 2.80M  |     return this->unichar;  | 
78  | 2.80M  |   }  | 
79  | 876k  |   inline int get_pos() const { | 
80  | 876k  |     return this->pos;  | 
81  | 876k  |   }  | 
82  | 876k  |   inline int get_total() const { | 
83  | 876k  |     return this->total;  | 
84  | 876k  |   }  | 
85  |  |  | 
86  |  |   // Returns the string that represents a fragment  | 
87  |  |   // with the given unichar, pos and total.  | 
88  |  |   static std::string to_string(const char *unichar, int pos, int total,  | 
89  |  |                                bool natural);  | 
90  |  |   // Returns the string that represents this fragment.  | 
91  | 0  |   std::string to_string() const { | 
92  | 0  |     return to_string(unichar, pos, total, natural);  | 
93  | 0  |   }  | 
94  |  |  | 
95  |  |   // Checks whether a fragment has the same unichar,  | 
96  |  |   // position and total as the given inputs.  | 
97  |  |   inline bool equals(const char *other_unichar, int other_pos,  | 
98  | 0  |                      int other_total) const { | 
99  | 0  |     return (strcmp(this->unichar, other_unichar) == 0 &&  | 
100  | 0  |             this->pos == other_pos && this->total == other_total);  | 
101  | 0  |   }  | 
102  | 0  |   inline bool equals(const CHAR_FRAGMENT *other) const { | 
103  | 0  |     return this->equals(other->get_unichar(), other->get_pos(),  | 
104  | 0  |                         other->get_total());  | 
105  | 0  |   }  | 
106  |  |  | 
107  |  |   // Checks whether a given fragment is a continuation of this fragment.  | 
108  |  |   // Assumes that the given fragment pointer is not nullptr.  | 
109  | 2.41M  |   inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { | 
110  | 2.41M  |     return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&  | 
111  | 2.41M  |             this->total == fragment->get_total() &&  | 
112  | 2.41M  |             this->pos == fragment->get_pos() + 1);  | 
113  | 2.41M  |   }  | 
114  |  |  | 
115  |  |   // Returns true if this fragment is a beginning fragment.  | 
116  | 381k  |   inline bool is_beginning() const { | 
117  | 381k  |     return this->pos == 0;  | 
118  | 381k  |   }  | 
119  |  |  | 
120  |  |   // Returns true if this fragment is an ending fragment.  | 
121  | 478k  |   inline bool is_ending() const { | 
122  | 478k  |     return this->pos == this->total - 1;  | 
123  | 478k  |   }  | 
124  |  |  | 
125  |  |   // Returns true if the fragment was a separate component to begin with,  | 
126  |  |   // ie did not need chopping to be isolated, but may have been separated  | 
127  |  |   // out from a multi-outline blob.  | 
128  | 0  |   inline bool is_natural() const { | 
129  | 0  |     return natural;  | 
130  | 0  |   }  | 
131  | 1.14k  |   void set_natural(bool value) { | 
132  | 1.14k  |     natural = value;  | 
133  | 1.14k  |   }  | 
134  |  |  | 
135  |  |   // Parses the string to see whether it represents a character fragment  | 
136  |  |   // (rather than a regular character). If so, allocates memory for a new  | 
137  |  |   // CHAR_FRAGMENT instance and fills it in with the corresponding fragment  | 
138  |  |   // information. Fragments are of the form:  | 
139  |  |   // |m|1|2, meaning chunk 1 of 2 of character m, or  | 
140  |  |   // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed  | 
141  |  |   // to divide the parts, as they were already separate connected components.  | 
142  |  |   //  | 
143  |  |   // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT  | 
144  |  |   // instance, otherwise (if the string does not represent a fragment or it  | 
145  |  |   // looks like it does, but parsing it as a fragment fails) returns nullptr.  | 
146  |  |   //  | 
147  |  |   // Note: The caller is responsible for deallocating memory  | 
148  |  |   // associated with the returned pointer.  | 
149  |  |   static CHAR_FRAGMENT *parse_from_string(const char *str);  | 
150  |  |  | 
151  |  | private:  | 
152  |  |   char unichar[UNICHAR_LEN + 1];  | 
153  |  |   // True if the fragment was a separate component to begin with,  | 
154  |  |   // ie did not need chopping to be isolated, but may have been separated  | 
155  |  |   // out from a multi-outline blob.  | 
156  |  |   bool natural;  | 
157  |  |   int16_t pos;   // fragment position in the character  | 
158  |  |   int16_t total; // total number of fragments in the character  | 
159  |  | };  | 
160  |  |  | 
161  |  | // The UNICHARSET class is a utility class for Tesseract that holds the  | 
162  |  | // set of characters that are used by the engine. Each character is identified  | 
163  |  | // by a unique number, from 0 to (size - 1).  | 
164  |  | class TESS_API UNICHARSET { | 
165  |  | public:  | 
166  |  |   // Custom list of characters and their ligature forms (UTF8)  | 
167  |  |   // These map to unicode values in the private use area (PUC) and are supported  | 
168  |  |   // by only a few font families (e.g. Wyld, Adobe Caslon Pro).  | 
169  |  |   static const char *kCustomLigatures[][2];  | 
170  |  |  | 
171  |  |   // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.  | 
172  |  |   static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];  | 
173  |  |  | 
174  |  |   // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)  | 
enum Direction {
  // Every enumerator has an explicit value, so the numbers do not depend on
  // declaration order.
  U_LEFT_TO_RIGHT = 0,
  U_RIGHT_TO_LEFT = 1,
  U_EUROPEAN_NUMBER = 2,
  U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4,
  U_ARABIC_NUMBER = 5,
  U_COMMON_NUMBER_SEPARATOR = 6,
  U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8,
  U_WHITE_SPACE_NEUTRAL = 9,
  U_OTHER_NEUTRAL = 10,
  U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12,
  U_RIGHT_TO_LEFT_ARABIC = 13,
  U_RIGHT_TO_LEFT_EMBEDDING = 14,
  U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16,
  U_DIR_NON_SPACING_MARK = 17,
  U_BOUNDARY_NEUTRAL = 18,
  U_FIRST_STRONG_ISOLATE = 19,
  U_LEFT_TO_RIGHT_ISOLATE = 20,
  U_RIGHT_TO_LEFT_ISOLATE = 21,
  U_POP_DIRECTIONAL_ISOLATE = 22,
  // The count enumerator takes the next value (23) and is only declared when
  // U_HIDE_DEPRECATED_API is not defined.
#ifndef U_HIDE_DEPRECATED_API
  U_CHAR_DIRECTION_COUNT
#endif // U_HIDE_DEPRECATED_API
};
203  |  |  | 
204  |  |   // Create an empty UNICHARSET  | 
205  |  |   UNICHARSET();  | 
206  |  |  | 
207  |  |   ~UNICHARSET();  | 
208  |  |  | 
209  |  |   // Return the UNICHAR_ID of a given unichar representation within the  | 
210  |  |   // UNICHARSET.  | 
211  |  |   UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;  | 
212  |  |  | 
213  |  |   // Return the UNICHAR_ID of a given unichar representation within the  | 
214  |  |   // UNICHARSET. Only the first length characters from unichar_repr are used.  | 
215  |  |   UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;  | 
216  |  |  | 
217  |  |   // Return the minimum number of bytes that matches a legal UNICHAR_ID,  | 
218  |  |   // while leaving the rest of the string encodable. Returns 0 if the  | 
219  |  |   // beginning of the string is not encodable.  | 
220  |  |   // WARNING: this function now encodes the whole string for precision.  | 
221  |  |   // Use encode_string in preference to repeatedly calling step.  | 
222  |  |   int step(const char *str) const;  | 
223  |  |  | 
224  |  |   // Returns true if the given UTF-8 string is encodable with this UNICHARSET.  | 
225  |  |   // If not encodable, write the first byte offset which cannot be converted  | 
226  |  |   // into the second (return) argument.  | 
227  |  |   bool encodable_string(const char *str, unsigned *first_bad_position) const;  | 
228  |  |  | 
229  |  |   // Encodes the given UTF-8 string with this UNICHARSET.  | 
230  |  |   // Any part of the string that cannot be encoded (because the utf8 can't  | 
231  |  |   // be broken up into pieces that are in the unicharset) then:  | 
232  |  |   // if give_up_on_failure, stops and returns a partial encoding,  | 
233  |  |   // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.  | 
234  |  |   // Returns true if the encoding succeeds completely, false if there is at  | 
235  |  |   // least one failure.  | 
236  |  |   // If lengths is not nullptr, then it is filled with the corresponding  | 
237  |  |   // byte length of each encoded UNICHAR_ID.  | 
238  |  |   // If encoded_length is not nullptr then on return it contains the length of  | 
239  |  |   // str that was encoded. (if give_up_on_failure the location of the first  | 
240  |  |   // failure, otherwise strlen(str).)  | 
241  |  |   // WARNING: Caller must guarantee that str has already been cleaned of codes  | 
242  |  |   // that do not belong in the unicharset, or encoding may fail.  | 
243  |  |   // Use CleanupString to perform the cleaning.  | 
244  |  |   bool encode_string(const char *str, bool give_up_on_failure,  | 
245  |  |                      std::vector<UNICHAR_ID> *encoding,  | 
246  |  |                      std::vector<char> *lengths,  | 
247  |  |                      unsigned *encoded_length) const;  | 
248  |  |  | 
249  |  |   // Return the unichar representation corresponding to the given UNICHAR_ID  | 
250  |  |   // within the UNICHARSET.  | 
251  |  |   const char *id_to_unichar(UNICHAR_ID id) const;  | 
252  |  |  | 
253  |  |   // Return the UTF8 representation corresponding to the given UNICHAR_ID after  | 
254  |  |   // resolving any private encodings internal to Tesseract. This method is  | 
255  |  |   // preferable to id_to_unichar for outputting text that will be visible to  | 
256  |  |   // external applications.  | 
257  |  |   const char *id_to_unichar_ext(UNICHAR_ID id) const;  | 
258  |  |  | 
259  |  |   // Return a string that reformats the utf8 str into the str followed  | 
260  |  |   // by its hex unicodes.  | 
261  |  |   static std::string debug_utf8_str(const char *str);  | 
262  |  |  | 
263  |  |   // Removes/replaces content that belongs in rendered text, but not in the  | 
264  |  |   // unicharset.  | 
265  | 2.62k  |   static std::string CleanupString(const char *utf8_str) { | 
266  | 2.62k  |     return CleanupString(utf8_str, strlen(utf8_str));  | 
267  | 2.62k  |   }  | 
268  |  |   static std::string CleanupString(const char *utf8_str, size_t length);  | 
269  |  |  | 
270  |  |   // Return a string containing debug information on the unichar, including  | 
271  |  |   // the id_to_unichar, its hex unicodes and the properties.  | 
272  |  |   std::string debug_str(UNICHAR_ID id) const;  | 
273  | 0  |   std::string debug_str(const char *unichar_repr) const { | 
274  | 0  |     return debug_str(unichar_to_id(unichar_repr));  | 
275  | 0  |   }  | 
276  |  |  | 
277  |  |   // Adds a unichar representation to the set. If old_style is true, then  | 
278  |  |   // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL  | 
279  |  |   // characters are ignored/skipped as if they don't exist and n-grams that  | 
280  |  |   // can already be encoded are not added.  | 
281  |  |   void unichar_insert(const char *const unichar_repr,  | 
282  |  |                       OldUncleanUnichars old_style);  | 
283  | 44  |   void unichar_insert(const char *const unichar_repr) { | 
284  | 44  |     unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);  | 
285  | 44  |   }  | 
286  |  |   // Adds a unichar representation to the set. Avoids setting old_style to true,  | 
287  |  |   // unless it is necessary to make the new unichar get added.  | 
288  | 1.34k  |   void unichar_insert_backwards_compatible(const char *const unichar_repr) { | 
289  | 1.34k  |     std::string cleaned = CleanupString(unichar_repr);  | 
290  | 1.34k  |     if (cleaned != unichar_repr) { | 
291  | 16  |       unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);  | 
292  | 1.32k  |     } else { | 
293  | 1.32k  |       auto old_size = size();  | 
294  | 1.32k  |       unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);  | 
295  | 1.32k  |       if (size() == old_size) { | 
296  | 0  |         unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);  | 
297  | 0  |       }  | 
298  | 1.32k  |     }  | 
299  | 1.34k  |   }  | 
300  |  |  | 
301  |  |   // Return true if the given unichar id exists within the set.  | 
302  |  |   // Relies on the fact that unichar ids are contiguous in the unicharset.  | 
303  | 1.11G  |   bool contains_unichar_id(UNICHAR_ID unichar_id) const { | 
304  | 1.11G  |     return static_cast<size_t>(unichar_id) < unichars.size();  | 
305  | 1.11G  |   }  | 
306  |  |  | 
307  |  |   // Return true if the given unichar representation exists within the set.  | 
308  |  |   bool contains_unichar(const char *const unichar_repr) const;  | 
309  |  |   bool contains_unichar(const char *const unichar_repr, int length) const;  | 
310  |  |  | 
311  |  |   // Return true if the given unichar representation corresponds to the given  | 
312  |  |   // UNICHAR_ID within the set.  | 
313  |  |   bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const;  | 
314  |  |  | 
315  |  |   // Delete CHAR_FRAGMENTs stored in properties of unichars array.  | 
316  | 28  |   void delete_pointers_in_unichars() { | 
317  | 488  |     for (auto &unichar : unichars) { | 
318  | 488  |       delete unichar.properties.fragment;  | 
319  | 488  |       unichar.properties.fragment = nullptr;  | 
320  | 488  |     }  | 
321  | 28  |   }  | 
322  |  |  | 
323  |  |   // Clear the UNICHARSET (all the previous data is lost).  | 
324  | 28  |   void clear() { | 
325  | 28  |     if (script_table != nullptr) { | 
326  | 40  |       for (int i = 0; i < script_table_size_used; ++i) { | 
327  | 24  |         delete[] script_table[i];  | 
328  | 24  |       }  | 
329  | 16  |       delete[] script_table;  | 
330  | 16  |       script_table = nullptr;  | 
331  | 16  |       script_table_size_used = 0;  | 
332  | 16  |     }  | 
333  | 28  |     script_table_size_reserved = 0;  | 
334  | 28  |     delete_pointers_in_unichars();  | 
335  | 28  |     unichars.clear();  | 
336  | 28  |     ids.clear();  | 
337  | 28  |     top_bottom_set_ = false;  | 
338  | 28  |     script_has_upper_lower_ = false;  | 
339  | 28  |     script_has_xheight_ = false;  | 
340  | 28  |     old_style_included_ = false;  | 
341  | 28  |     null_sid_ = 0;  | 
342  | 28  |     common_sid_ = 0;  | 
343  | 28  |     latin_sid_ = 0;  | 
344  | 28  |     cyrillic_sid_ = 0;  | 
345  | 28  |     greek_sid_ = 0;  | 
346  | 28  |     han_sid_ = 0;  | 
347  | 28  |     hiragana_sid_ = 0;  | 
348  | 28  |     katakana_sid_ = 0;  | 
349  | 28  |     thai_sid_ = 0;  | 
350  | 28  |     hangul_sid_ = 0;  | 
351  | 28  |     default_sid_ = 0;  | 
352  | 28  |   }  | 
353  |  |  | 
354  |  |   // Return the size of the set (the number of different UNICHAR it holds).  | 
355  | 1.54G  |   size_t size() const { | 
356  | 1.54G  |     return unichars.size();  | 
357  | 1.54G  |   }  | 
358  |  |  | 
359  |  |   // Opens the file indicated by filename and saves unicharset to that file.  | 
360  |  |   // Returns true if the operation is successful.  | 
361  | 0  |   bool save_to_file(const char *const filename) const { | 
362  | 0  |     FILE *file = fopen(filename, "w+b");  | 
363  | 0  |     if (file == nullptr) { | 
364  | 0  |       return false;  | 
365  | 0  |     }  | 
366  | 0  |     bool result = save_to_file(file);  | 
367  | 0  |     fclose(file);  | 
368  | 0  |     return result;  | 
369  | 0  |   }  | 
370  |  |  | 
371  |  |   // Saves the content of the UNICHARSET to the given file.  | 
372  |  |   // Returns true if the operation is successful.  | 
373  | 0  |   bool save_to_file(FILE *file) const { | 
374  | 0  |     std::string str;  | 
375  | 0  |     return save_to_string(str) &&  | 
376  | 0  |            tesseract::Serialize(file, &str[0], str.length());  | 
377  | 0  |   }  | 
378  |  |  | 
379  | 0  |   bool save_to_file(tesseract::TFile *file) const { | 
380  | 0  |     std::string str;  | 
381  | 0  |     return save_to_string(str) && file->Serialize(&str[0], str.length());  | 
382  | 0  |   }  | 
383  |  |  | 
384  |  |   // Saves the content of the UNICHARSET to the given string.  | 
385  |  |   // Returns true if the operation is successful.  | 
386  |  |   bool save_to_string(std::string &str) const;  | 
387  |  |  | 
388  |  |   // Opens the file indicated by filename and loads the UNICHARSET  | 
389  |  |   // from the given file. The previous data is lost.  | 
390  |  |   // Returns true if the operation is successful.  | 
391  | 0  |   bool load_from_file(const char *const filename, bool skip_fragments) { | 
392  | 0  |     FILE *file = fopen(filename, "rb");  | 
393  | 0  |     if (file == nullptr) { | 
394  | 0  |       return false;  | 
395  | 0  |     }  | 
396  | 0  |     bool result = load_from_file(file, skip_fragments);  | 
397  | 0  |     fclose(file);  | 
398  | 0  |     return result;  | 
399  | 0  |   }  | 
400  |  |   // returns true if the operation is successful.  | 
401  | 0  |   bool load_from_file(const char *const filename) { | 
402  | 0  |     return load_from_file(filename, false);  | 
403  | 0  |   }  | 
404  |  |  | 
405  |  |   // Loads the UNICHARSET from the given file. The previous data is lost.  | 
406  |  |   // Returns true if the operation is successful.  | 
407  |  |   bool load_from_file(FILE *file, bool skip_fragments);  | 
408  | 0  |   bool load_from_file(FILE *file) { | 
409  | 0  |     return load_from_file(file, false);  | 
410  | 0  |   }  | 
411  |  |   bool load_from_file(tesseract::TFile *file, bool skip_fragments);  | 
412  |  |  | 
413  |  |   // Sets up internal data after loading the file, based on the char  | 
414  |  |   // properties. Called from load_from_file, but also needs to be run  | 
415  |  |   // during set_unicharset_properties.  | 
416  |  |   void post_load_setup();  | 
417  |  |  | 
418  |  |   // Returns true if right_to_left scripts are significant in the unicharset,  | 
419  |  |   // but without being so sensitive that "universal" unicharsets containing  | 
420  |  |   // characters from many scripts, like orientation and script detection,  | 
421  |  |   // look like they are right_to_left.  | 
422  |  |   bool major_right_to_left() const;  | 
423  |  |  | 
424  |  |   // Set a whitelist and/or blacklist of characters to recognize.  | 
425  |  |   // An empty or nullptr whitelist enables everything (minus any blacklist).  | 
426  |  |   // An empty or nullptr blacklist disables nothing.  | 
427  |  |   // An empty or nullptr unblacklist has no effect.  | 
428  |  |   // The blacklist overrides the whitelist.  | 
429  |  |   // The unblacklist overrides the blacklist.  | 
430  |  |   // Each list is a string of utf8 character strings. Boundaries between  | 
431  |  |   // unicharset units are worked out automatically, and characters not in  | 
432  |  |   // the unicharset are silently ignored.  | 
433  |  |   void set_black_and_whitelist(const char *blacklist, const char *whitelist,  | 
434  |  |                                const char *unblacklist);  | 
435  |  |  | 
436  |  |   // Set the isalpha property of the given unichar to the given value.  | 
437  | 900  |   void set_isalpha(UNICHAR_ID unichar_id, bool value) { | 
438  | 900  |     unichars[unichar_id].properties.isalpha = value;  | 
439  | 900  |   }  | 
440  |  |  | 
441  |  |   // Set the islower property of the given unichar to the given value.  | 
442  | 900  |   void set_islower(UNICHAR_ID unichar_id, bool value) { | 
443  | 900  |     unichars[unichar_id].properties.islower = value;  | 
444  | 900  |   }  | 
445  |  |  | 
446  |  |   // Set the isupper property of the given unichar to the given value.  | 
447  | 900  |   void set_isupper(UNICHAR_ID unichar_id, bool value) { | 
448  | 900  |     unichars[unichar_id].properties.isupper = value;  | 
449  | 900  |   }  | 
450  |  |  | 
451  |  |   // Set the isdigit property of the given unichar to the given value.  | 
452  | 900  |   void set_isdigit(UNICHAR_ID unichar_id, bool value) { | 
453  | 900  |     unichars[unichar_id].properties.isdigit = value;  | 
454  | 900  |   }  | 
455  |  |  | 
456  |  |   // Set the ispunctuation property of the given unichar to the given value.  | 
457  | 900  |   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { | 
458  | 900  |     unichars[unichar_id].properties.ispunctuation = value;  | 
459  | 900  |   }  | 
460  |  |  | 
461  |  |   // Set the isngram property of the given unichar to the given value.  | 
462  | 76.7k  |   void set_isngram(UNICHAR_ID unichar_id, bool value) { | 
463  | 76.7k  |     unichars[unichar_id].properties.isngram = value;  | 
464  | 76.7k  |   }  | 
465  |  |  | 
466  |  |   // Set the script name of the given unichar to the given value.  | 
467  |  |   // Value is copied and thus can be a temporary;  | 
468  | 3.75k  |   void set_script(UNICHAR_ID unichar_id, const char *value) { | 
469  | 3.75k  |     unichars[unichar_id].properties.script_id = add_script(value);  | 
470  | 3.75k  |   }  | 
471  |  |  | 
472  |  |   // Set other_case unichar id in the properties for the given unichar id.  | 
473  | 900  |   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { | 
474  | 900  |     unichars[unichar_id].properties.other_case = other_case;  | 
475  | 900  |   }  | 
476  |  |  | 
477  |  |   // Set the direction property of the given unichar to the given value.  | 
478  | 900  |   void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) { | 
479  | 900  |     unichars[unichar_id].properties.direction = value;  | 
480  | 900  |   }  | 
481  |  |  | 
482  |  |   // Set mirror unichar id in the properties for the given unichar id.  | 
483  | 900  |   void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) { | 
484  | 900  |     unichars[unichar_id].properties.mirror = mirror;  | 
485  | 900  |   }  | 
486  |  |  | 
487  |  |   // Record normalized version of unichar with the given unichar_id.  | 
488  | 900  |   void set_normed(UNICHAR_ID unichar_id, const char *normed) { | 
489  | 900  |     unichars[unichar_id].properties.normed = normed;  | 
490  | 900  |     unichars[unichar_id].properties.normed_ids.clear();  | 
491  | 900  |   }  | 
492  |  |   // Sets the normed_ids vector from the normed string. normed_ids is not  | 
493  |  |   // stored in the file, and needs to be set when the UNICHARSET is loaded.  | 
494  |  |   void set_normed_ids(UNICHAR_ID unichar_id);  | 
495  |  |  | 
496  |  |   // Return the isalpha property of the given unichar.  | 
497  | 161M  |   bool get_isalpha(UNICHAR_ID unichar_id) const { | 
498  | 161M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
499  | 0  |       return false;  | 
500  | 0  |     }  | 
501  | 161M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
502  | 161M  |     return unichars[unichar_id].properties.isalpha;  | 
503  | 161M  |   }  | 
504  |  |  | 
505  |  |   // Return the islower property of the given unichar.  | 
506  | 92.9M  |   bool get_islower(UNICHAR_ID unichar_id) const { | 
507  | 92.9M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
508  | 0  |       return false;  | 
509  | 0  |     }  | 
510  | 92.9M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
511  | 92.9M  |     return unichars[unichar_id].properties.islower;  | 
512  | 92.9M  |   }  | 
513  |  |  | 
514  |  |   // Return the isupper property of the given unichar.  | 
515  | 50.7M  |   bool get_isupper(UNICHAR_ID unichar_id) const { | 
516  | 50.7M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
517  | 0  |       return false;  | 
518  | 0  |     }  | 
519  | 50.7M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
520  | 50.7M  |     return unichars[unichar_id].properties.isupper;  | 
521  | 50.7M  |   }  | 
522  |  |  | 
523  |  |   // Return the isdigit property of the given unichar.  | 
524  | 135M  |   bool get_isdigit(UNICHAR_ID unichar_id) const { | 
525  | 135M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
526  | 0  |       return false;  | 
527  | 0  |     }  | 
528  | 135M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
529  | 135M  |     return unichars[unichar_id].properties.isdigit;  | 
530  | 135M  |   }  | 
531  |  |  | 
532  |  |   // Return the ispunctuation property of the given unichar.  | 
533  | 50.2M  |   bool get_ispunctuation(UNICHAR_ID unichar_id) const { | 
534  | 50.2M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
535  | 0  |       return false;  | 
536  | 0  |     }  | 
537  | 50.2M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
538  | 50.2M  |     return unichars[unichar_id].properties.ispunctuation;  | 
539  | 50.2M  |   }  | 
540  |  |  | 
541  |  |   // Return the isngram property of the given unichar.  | 
542  | 5.83M  |   bool get_isngram(UNICHAR_ID unichar_id) const { | 
543  | 5.83M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
544  | 0  |       return false;  | 
545  | 0  |     }  | 
546  | 5.83M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
547  | 5.83M  |     return unichars[unichar_id].properties.isngram;  | 
548  | 5.83M  |   }  | 
549  |  |  | 
550  |  |   // Returns whether the unichar id represents a unicode value in the private  | 
551  |  |   // use area.  | 
552  |  |   bool get_isprivate(UNICHAR_ID unichar_id) const;  | 
553  |  |  | 
554  |  |   // Returns true if the ids have useful min/max top/bottom values.  | 
555  | 8.40M  |   bool top_bottom_useful() const { | 
556  | 8.40M  |     return top_bottom_set_;  | 
557  | 8.40M  |   }  | 
558  |  |   // Sets all ranges to empty, so they can be expanded to set the values.  | 
559  |  |   void set_ranges_empty();  | 
560  |  |   // Sets all the properties for this unicharset given a src_unicharset with  | 
561  |  |   // everything set. The unicharsets don't have to be the same, and graphemes  | 
562  |  |   // are correctly accounted for.  | 
563  | 0  |   void SetPropertiesFromOther(const UNICHARSET &src) { | 
564  | 0  |     PartialSetPropertiesFromOther(0, src);  | 
565  | 0  |   }  | 
566  |  |   // Sets properties from Other, starting only at the given index.  | 
567  |  |   void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);  | 
568  |  |   // Expands the tops and bottoms and widths for this unicharset given a  | 
569  |  |   // src_unicharset with ranges in it. The unicharsets don't have to be the  | 
570  |  |   // same, and graphemes are correctly accounted for.  | 
571  |  |   void ExpandRangesFromOther(const UNICHARSET &src);  | 
572  |  |   // Makes this a copy of src. Clears this completely first, so the automatic  | 
573  |  |   // ids will not be present in this if not in src.  | 
574  |  |   void CopyFrom(const UNICHARSET &src);  | 
575  |  |   // For each id in src, if it does not occur in this, add it, as in  | 
576  |  |   // SetPropertiesFromOther, otherwise expand the ranges, as in  | 
577  |  |   // ExpandRangesFromOther.  | 
578  |  |   void AppendOtherUnicharset(const UNICHARSET &src);  | 
579  |  |   // Returns true if the acceptable ranges of the tops of the characters do  | 
580  |  |   // not overlap, making their x-height calculations distinct.  | 
581  |  |   bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;  | 
582  |  |   // Returns the min and max bottom and top of the given unichar in  | 
583  |  |   // baseline-normalized coordinates, ie, where the baseline is  | 
584  |  |   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight  | 
585  |  |   // (See normalis.h for the definitions).  | 
586  |  |   void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom,  | 
587  | 14.6M  |                       int *min_top, int *max_top) const { | 
588  | 14.6M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
589  | 0  |       *min_bottom = *min_top = 0;  | 
590  | 0  |       *max_bottom = *max_top = 256; // kBlnCellHeight  | 
591  | 0  |       return;  | 
592  | 0  |     }  | 
593  | 14.6M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
594  | 14.6M  |     *min_bottom = unichars[unichar_id].properties.min_bottom;  | 
595  | 14.6M  |     *max_bottom = unichars[unichar_id].properties.max_bottom;  | 
596  | 14.6M  |     *min_top = unichars[unichar_id].properties.min_top;  | 
597  | 14.6M  |     *max_top = unichars[unichar_id].properties.max_top;  | 
598  | 14.6M  |   }  | 
599  |  |   void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom,  | 
600  | 900  |                       int min_top, int max_top) { | 
601  | 900  |     unichars[unichar_id].properties.min_bottom =  | 
602  | 900  |         ClipToRange<int>(min_bottom, 0, UINT8_MAX);  | 
603  | 900  |     unichars[unichar_id].properties.max_bottom =  | 
604  | 900  |         ClipToRange<int>(max_bottom, 0, UINT8_MAX);  | 
605  | 900  |     unichars[unichar_id].properties.min_top =  | 
606  | 900  |         ClipToRange<int>(min_top, 0, UINT8_MAX);  | 
607  | 900  |     unichars[unichar_id].properties.max_top =  | 
608  | 900  |         ClipToRange<int>(max_top, 0, UINT8_MAX);  | 
609  | 900  |   }  | 
610  |  |   // Returns the width stats (as mean, sd) of the given unichar relative to the  | 
611  |  |   // median advance of all characters in the character set.  | 
612  |  |   void get_width_stats(UNICHAR_ID unichar_id, float *width,  | 
613  | 0  |                        float *width_sd) const { | 
614  | 0  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
615  | 0  |       *width = 0.0f;  | 
616  | 0  |       *width_sd = 0.0f;  | 
617  | 0  |       return;  | 
618  | 0  |     }  | 
619  | 0  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
620  | 0  |     *width = unichars[unichar_id].properties.width;  | 
621  | 0  |     *width_sd = unichars[unichar_id].properties.width_sd;  | 
622  | 0  |   }  | 
623  | 900  |   void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) { | 
624  | 900  |     unichars[unichar_id].properties.width = width;  | 
625  | 900  |     unichars[unichar_id].properties.width_sd = width_sd;  | 
626  | 900  |   }  | 
627  |  |   // Returns the stats of the x-bearing (as mean, sd) of the given unichar  | 
628  |  |   // relative to the median advance of all characters in the character set.  | 
629  |  |   void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing,  | 
630  | 0  |                          float *bearing_sd) const { | 
631  | 0  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
632  | 0  |       *bearing = *bearing_sd = 0.0f;  | 
633  | 0  |       return;  | 
634  | 0  |     }  | 
635  | 0  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
636  | 0  |     *bearing = unichars[unichar_id].properties.bearing;  | 
637  | 0  |     *bearing_sd = unichars[unichar_id].properties.bearing_sd;  | 
638  | 0  |   }  | 
639  |  |   void set_bearing_stats(UNICHAR_ID unichar_id, float bearing,  | 
640  | 900  |                          float bearing_sd) { | 
641  | 900  |     unichars[unichar_id].properties.bearing = bearing;  | 
642  | 900  |     unichars[unichar_id].properties.bearing_sd = bearing_sd;  | 
643  | 900  |   }  | 
644  |  |   // Returns the stats of the x-advance of the given unichar (as mean, sd)  | 
645  |  |   // relative to the median advance of all characters in the character set.  | 
646  |  |   void get_advance_stats(UNICHAR_ID unichar_id, float *advance,  | 
647  | 0  |                          float *advance_sd) const { | 
648  | 0  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
649  | 0  |       *advance = *advance_sd = 0;  | 
650  | 0  |       return;  | 
651  | 0  |     }  | 
652  | 0  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
653  | 0  |     *advance = unichars[unichar_id].properties.advance;  | 
654  | 0  |     *advance_sd = unichars[unichar_id].properties.advance_sd;  | 
655  | 0  |   }  | 
656  |  |   void set_advance_stats(UNICHAR_ID unichar_id, float advance,  | 
657  | 900  |                          float advance_sd) { | 
658  | 900  |     unichars[unichar_id].properties.advance = advance;  | 
659  | 900  |     unichars[unichar_id].properties.advance_sd = advance_sd;  | 
660  | 900  |   }  | 
661  |  |   // Returns true if the font metrics properties are empty.  | 
662  | 0  |   bool PropertiesIncomplete(UNICHAR_ID unichar_id) const { | 
663  | 0  |     return unichars[unichar_id].properties.AnyRangeEmpty();  | 
664  | 0  |   }  | 
665  |  |  | 
666  |  |   // Returns true if the script of the given id is space delimited.  | 
667  |  |   // Returns false for Han, Thai, Hangul, Hiragana and Katakana scripts.  | 
668  | 16.9M  |   bool IsSpaceDelimited(UNICHAR_ID unichar_id) const { | 
669  | 16.9M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
670  | 3.78M  |       return true;  | 
671  | 3.78M  |     }  | 
672  | 13.1M  |     int script_id = get_script(unichar_id);  | 
673  | 13.1M  |     return script_id != han_sid_ && script_id != thai_sid_ &&  | 
674  | 13.1M  |            script_id != hangul_sid_ && script_id != hiragana_sid_ &&  | 
675  | 13.1M  |            script_id != katakana_sid_;  | 
676  | 16.9M  |   }  | 
677  |  |  | 
678  |  |   // Return the script id of the given unichar, or null_sid_ if the id is  | 
679  |  |   // INVALID_UNICHAR_ID. The result is an int id, not a string;  | 
680  |  |   // get_script_from_script_id converts it to the script name.  | 
681  | 39.3M  |   int get_script(UNICHAR_ID unichar_id) const { | 
682  | 39.3M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
683  | 0  |       return null_sid_;  | 
684  | 0  |     }  | 
685  | 39.3M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
686  | 39.3M  |     return unichars[unichar_id].properties.script_id;  | 
687  | 39.3M  |   }  | 
688  |  |  | 
689  |  |   // Return the character properties, eg. alpha/upper/lower/digit/punct,  | 
690  |  |   // as a bit field of unsigned int.  | 
691  |  |   unsigned int get_properties(UNICHAR_ID unichar_id) const;  | 
692  |  |  | 
693  |  |   // Return the character property as a single char.  If a character has  | 
694  |  |   // multiple attributes, the main property is defined by the following order:  | 
695  |  |   //   upper_case : 'A'  | 
696  |  |   //   lower_case : 'a'  | 
697  |  |   //   alpha      : 'x'  | 
698  |  |   //   digit      : '0'  | 
699  |  |   //   punctuation: 'p'  | 
700  |  |   char get_chartype(UNICHAR_ID unichar_id) const;  | 
701  |  |  | 
702  |  |   // Get other_case unichar id in the properties for the given unichar id.  | 
703  | 34.3M  |   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { | 
704  | 34.3M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
705  | 0  |       return INVALID_UNICHAR_ID;  | 
706  | 0  |     }  | 
707  | 34.3M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
708  | 34.3M  |     return unichars[unichar_id].properties.other_case;  | 
709  | 34.3M  |   }  | 
710  |  |  | 
711  |  |   // Returns the direction property of the given unichar.  | 
712  | 15.8M  |   Direction get_direction(UNICHAR_ID unichar_id) const { | 
713  | 15.8M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
714  | 0  |       return UNICHARSET::U_OTHER_NEUTRAL;  | 
715  | 0  |     }  | 
716  | 15.8M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
717  | 15.8M  |     return unichars[unichar_id].properties.direction;  | 
718  | 15.8M  |   }  | 
719  |  |  | 
720  |  |   // Get mirror unichar id in the properties for the given unichar id.  | 
721  | 718k  |   UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const { | 
722  | 718k  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
723  | 0  |       return INVALID_UNICHAR_ID;  | 
724  | 0  |     }  | 
725  | 718k  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
726  | 718k  |     return unichars[unichar_id].properties.mirror;  | 
727  | 718k  |   }  | 
728  |  |  | 
729  |  |   // Returns UNICHAR_ID of the corresponding lower-case unichar.  | 
730  | 176  |   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { | 
731  | 176  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
732  | 0  |       return INVALID_UNICHAR_ID;  | 
733  | 0  |     }  | 
734  | 176  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
735  | 176  |     if (unichars[unichar_id].properties.islower) { | 
736  | 168  |       return unichar_id;  | 
737  | 168  |     }  | 
738  | 8  |     return unichars[unichar_id].properties.other_case;  | 
739  | 176  |   }  | 
740  |  |  | 
741  |  |   // Returns UNICHAR_ID of the corresponding upper-case unichar.  | 
742  | 0  |   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { | 
743  | 0  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
744  | 0  |       return INVALID_UNICHAR_ID;  | 
745  | 0  |     }  | 
746  | 0  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
747  | 0  |     if (unichars[unichar_id].properties.isupper) { | 
748  | 0  |       return unichar_id;  | 
749  | 0  |     }  | 
750  | 0  |     return unichars[unichar_id].properties.other_case;  | 
751  | 0  |   }  | 
752  |  |  | 
753  |  |   // Returns true if this UNICHARSET has the special codes in  | 
754  |  |   // SpecialUnicharCodes available. If false then there are normal unichars  | 
755  |  |   // at these codes and they should not be used.  | 
756  | 0  |   bool has_special_codes() const { | 
757  | 0  |     return get_fragment(UNICHAR_BROKEN) != nullptr &&  | 
758  | 0  |            strcmp(id_to_unichar(UNICHAR_BROKEN),  | 
759  | 0  |                   kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;  | 
760  | 0  |   }  | 
761  |  |  | 
762  |  |   // Returns true if there are any repeated unicodes in the normalized  | 
763  |  |   // text of any unichar-id in the unicharset.  | 
764  |  |   bool AnyRepeatedUnicodes() const;  | 
765  |  |  | 
766  |  |   // Return a pointer to the CHAR_FRAGMENT class if the given  | 
767  |  |   // unichar id represents a character fragment.  | 
768  | 75.8M  |   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { | 
769  | 75.8M  |     if (INVALID_UNICHAR_ID == unichar_id) { | 
770  | 0  |       return nullptr;  | 
771  | 0  |     }  | 
772  | 75.8M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
773  | 75.8M  |     return unichars[unichar_id].properties.fragment;  | 
774  | 75.8M  |   }  | 
775  |  |  | 
776  |  |   // Return the isalpha property of the given unichar representation.  | 
777  | 0  |   bool get_isalpha(const char *const unichar_repr) const { | 
778  | 0  |     return get_isalpha(unichar_to_id(unichar_repr));  | 
779  | 0  |   }  | 
780  |  |  | 
781  |  |   // Return the islower property of the given unichar representation.  | 
782  | 0  |   bool get_islower(const char *const unichar_repr) const { | 
783  | 0  |     return get_islower(unichar_to_id(unichar_repr));  | 
784  | 0  |   }  | 
785  |  |  | 
786  |  |   // Return the isupper property of the given unichar representation.  | 
787  | 0  |   bool get_isupper(const char *const unichar_repr) const { | 
788  | 0  |     return get_isupper(unichar_to_id(unichar_repr));  | 
789  | 0  |   }  | 
790  |  |  | 
791  |  |   // Return the isdigit property of the given unichar representation.  | 
792  | 0  |   bool get_isdigit(const char *const unichar_repr) const { | 
793  | 0  |     return get_isdigit(unichar_to_id(unichar_repr));  | 
794  | 0  |   }  | 
795  |  |  | 
796  |  |   // Return the ispunctuation property of the given unichar representation.  | 
797  | 0  |   bool get_ispunctuation(const char *const unichar_repr) const { | 
798  | 0  |     return get_ispunctuation(unichar_to_id(unichar_repr));  | 
799  | 0  |   }  | 
800  |  |  | 
801  |  |   // Return the character properties, eg. alpha/upper/lower/digit/punct,  | 
802  |  |   // of the given unichar representation  | 
803  | 0  |   unsigned int get_properties(const char *const unichar_repr) const { | 
804  | 0  |     return get_properties(unichar_to_id(unichar_repr));  | 
805  | 0  |   }  | 
806  |  |  | 
807  | 0  |   char get_chartype(const char *const unichar_repr) const { | 
808  | 0  |     return get_chartype(unichar_to_id(unichar_repr));  | 
809  | 0  |   }  | 
810  |  |  | 
811  |  |   // Return the script id of the given unichar representation.  | 
812  |  |   // The result is an int id, not a string; get_script_from_script_id  | 
813  |  |   // converts it to the script name.  | 
814  | 1.11k  |   int get_script(const char *const unichar_repr) const { | 
815  | 1.11k  |     return get_script(unichar_to_id(unichar_repr));  | 
816  | 1.11k  |   }  | 
817  |  |  | 
818  |  |   // Return a pointer to the CHAR_FRAGMENT class struct if the given  | 
819  |  |   // unichar representation represents a character fragment.  | 
820  | 0  |   const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const { | 
821  | 0  |     if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||  | 
822  | 0  |         !ids.contains(unichar_repr, false)) { | 
823  | 0  |       return nullptr;  | 
824  | 0  |     }  | 
825  | 0  |     return get_fragment(unichar_to_id(unichar_repr));  | 
826  | 0  |   }  | 
827  |  |  | 
828  |  |   // Return the isalpha property of the given unichar representation.  | 
829  |  |   // Only the first length characters from unichar_repr are used.  | 
830  | 58.1k  |   bool get_isalpha(const char *const unichar_repr, int length) const { | 
831  | 58.1k  |     return get_isalpha(unichar_to_id(unichar_repr, length));  | 
832  | 58.1k  |   }  | 
833  |  |  | 
834  |  |   // Return the islower property of the given unichar representation.  | 
835  |  |   // Only the first length characters from unichar_repr are used.  | 
836  | 0  |   bool get_islower(const char *const unichar_repr, int length) const { | 
837  | 0  |     return get_islower(unichar_to_id(unichar_repr, length));  | 
838  | 0  |   }  | 
839  |  |  | 
840  |  |   // Return the isupper property of the given unichar representation.  | 
841  |  |   // Only the first length characters from unichar_repr are used.  | 
842  | 0  |   bool get_isupper(const char *const unichar_repr, int length) const { | 
843  | 0  |     return get_isupper(unichar_to_id(unichar_repr, length));  | 
844  | 0  |   }  | 
845  |  |  | 
846  |  |   // Return the isdigit property of the given unichar representation.  | 
847  |  |   // Only the first length characters from unichar_repr are used.  | 
848  | 9  |   bool get_isdigit(const char *const unichar_repr, int length) const { | 
849  | 9  |     return get_isdigit(unichar_to_id(unichar_repr, length));  | 
850  | 9  |   }  | 
851  |  |  | 
852  |  |   // Return the ispunctuation property of the given unichar representation.  | 
853  |  |   // Only the first length characters from unichar_repr are used.  | 
854  | 0  |   bool get_ispunctuation(const char *const unichar_repr, int length) const { | 
855  | 0  |     return get_ispunctuation(unichar_to_id(unichar_repr, length));  | 
856  | 0  |   }  | 
857  |  |  | 
858  |  |   // Returns normalized version of unichar with the given unichar_id.  | 
859  | 0  |   const char *get_normed_unichar(UNICHAR_ID unichar_id) const { | 
860  | 0  |     if (unichar_id == UNICHAR_SPACE) { | 
861  | 0  |       return " ";  | 
862  | 0  |     }  | 
863  | 0  |     return unichars[unichar_id].properties.normed.c_str();  | 
864  | 0  |   }  | 
865  |  |   // Returns a vector of UNICHAR_IDs that represent the ids of the normalized  | 
866  |  |   // version of the given id. There may be more than one UNICHAR_ID in the  | 
867  |  |   // vector if unichar_id represents a ligature.  | 
868  | 20.7M  |   const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const { | 
869  | 20.7M  |     return unichars[unichar_id].properties.normed_ids;  | 
870  | 20.7M  |   }  | 
871  |  |  | 
872  |  |   // Return the script id of the given unichar representation.  | 
873  |  |   // Only the first length characters from unichar_repr are used.  | 
874  |  |   // The result is an int id, not a string; get_script_from_script_id  | 
875  |  |   // converts it to the script name.  | 
876  | 0  |   int get_script(const char *const unichar_repr, int length) const { | 
877  | 0  |     return get_script(unichar_to_id(unichar_repr, length));  | 
878  | 0  |   }  | 
879  |  |  | 
880  |  |   // Return the (current) number of scripts in the script table  | 
881  | 0  |   int get_script_table_size() const { | 
882  | 0  |     return script_table_size_used;  | 
883  | 0  |   }  | 
884  |  |  | 
885  |  |   // Return the script string from its id  | 
886  | 452  |   const char *get_script_from_script_id(int id) const { | 
887  | 452  |     if (id >= script_table_size_used || id < 0) { | 
888  | 0  |       return null_script;  | 
889  | 0  |     }  | 
890  | 452  |     return script_table[id];  | 
891  | 452  |   }  | 
892  |  |  | 
893  |  |   // Returns the id from the name of the script, or 0 if script is not found.  | 
894  |  |   // Note that this is an expensive operation since it involves iteratively  | 
895  |  |   // comparing strings in the script table.  To avoid dependency on STL, we  | 
896  |  |   // won't use a hash.  Instead, the calling function can use this to lookup  | 
897  |  |   // and save the ID for relevant scripts for fast comparisons later.  | 
898  |  |   int get_script_id_from_name(const char *script_name) const;  | 
899  |  |  | 
900  |  |   // Return true if the given script is the null script  | 
901  | 0  |   bool is_null_script(const char *script) const { | 
902  | 0  |     return script == null_script;  | 
903  | 0  |   }  | 
904  |  |  | 
905  |  |   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,  | 
906  |  |   // then the returned id will be the same.  | 
907  |  |   // The script parameter is copied and thus can be a temporary.  | 
908  |  |   int add_script(const char *script);  | 
909  |  |  | 
910  |  |   // Return the enabled property of the given unichar.  | 
911  | 407M  |   bool get_enabled(UNICHAR_ID unichar_id) const { | 
912  | 407M  |     ASSERT_HOST(contains_unichar_id(unichar_id));  | 
913  | 407M  |     return unichars[unichar_id].properties.enabled;  | 
914  | 407M  |   }  | 
915  |  |  | 
916  | 18.6M  |   int null_sid() const { | 
917  | 18.6M  |     return null_sid_;  | 
918  | 18.6M  |   }  | 
919  | 31.0M  |   int common_sid() const { | 
920  | 31.0M  |     return common_sid_;  | 
921  | 31.0M  |   }  | 
922  | 439k  |   int latin_sid() const { | 
923  | 439k  |     return latin_sid_;  | 
924  | 439k  |   }  | 
925  | 0  |   int cyrillic_sid() const { | 
926  | 0  |     return cyrillic_sid_;  | 
927  | 0  |   }  | 
928  | 0  |   int greek_sid() const { | 
929  | 0  |     return greek_sid_;  | 
930  | 0  |   }  | 
931  | 18.6M  |   int han_sid() const { | 
932  | 18.6M  |     return han_sid_;  | 
933  | 18.6M  |   }  | 
934  | 0  |   int hiragana_sid() const { | 
935  | 0  |     return hiragana_sid_;  | 
936  | 0  |   }  | 
937  | 2  |   int katakana_sid() const { | 
938  | 2  |     return katakana_sid_;  | 
939  | 2  |   }  | 
940  | 2  |   int thai_sid() const { | 
941  | 2  |     return thai_sid_;  | 
942  | 2  |   }  | 
943  | 0  |   int hangul_sid() const { | 
944  | 0  |     return hangul_sid_;  | 
945  | 0  |   }  | 
946  | 439k  |   int default_sid() const { | 
947  | 439k  |     return default_sid_;  | 
948  | 439k  |   }  | 
949  |  |  | 
950  |  |   // Returns true if the unicharset has the concept of upper/lower case.  | 
951  | 8.33M  |   bool script_has_upper_lower() const { | 
952  | 8.33M  |     return script_has_upper_lower_;  | 
953  | 8.33M  |   }  | 
954  |  |   // Returns true if the unicharset has the concept of x-height.  | 
955  |  |   // script_has_xheight can be true even if script_has_upper_lower is not,  | 
956  |  |   // when the script has a sufficiently predominant top line with ascenders,  | 
957  |  |   // such as Devanagari and Thai.  | 
958  | 482k  |   bool script_has_xheight() const { | 
959  | 482k  |     return script_has_xheight_;  | 
960  | 482k  |   }  | 
961  |  |  | 
962  |  | private:  | 
// Per-unichar property record; one is embedded in every UNICHAR_SLOT.
struct TESS_API UNICHAR_PROPERTIES {
  UNICHAR_PROPERTIES();
  // Initializes all properties to sensible default values.
  void Init();
  // Sets all ranges wide open. Initialization default in case there are
  // no useful values available.
  void SetRangesOpen();
  // Sets all ranges to empty. Used before expanding with font-based data.
  void SetRangesEmpty();
  // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
  // is empty.
  bool AnyRangeEmpty() const;
  // Expands the ranges with the ranges from the src properties.
  void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);
  // Copies the properties from src into this.
  void CopyFrom(const UNICHAR_PROPERTIES &src);

  // Character class flags.
  bool isalpha;
  bool islower;
  bool isupper;
  bool isdigit;
  bool ispunctuation;
  bool isngram;
  // Value returned by get_enabled().
  bool enabled;
  // Possible limits of the top and bottom of the bounding box in
  // baseline-normalized coordinates, ie, where the baseline is
  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
  // (See normalis.h for the definitions).
  // Stored as uint8_t, so set_top_bottom() clips values to [0, UINT8_MAX].
  uint8_t min_bottom;
  uint8_t max_bottom;
  uint8_t min_top;
  uint8_t max_top;
  // Statistics of the widths of bounding box, relative to the median advance.
  float width;
  float width_sd;
  // Stats of the x-bearing and advance, also relative to the median advance.
  float bearing;
  float bearing_sd;
  float advance;
  float advance_sd;
  // Script id of this unichar; the value returned by get_script().
  int script_id;
  UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
  Direction direction;   // direction of this unichar
  // Mirror property is useful for reverse DAWG lookup for words in
  // right-to-left languages. E.g. "(word)" would be
  // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
  // However, what we want in our DAWG is
  // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
  // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
  UNICHAR_ID mirror;
  // A string of unichar_ids that represent the corresponding normed string.
  // For awkward characters like em-dash, this gives hyphen.
  // For ligatures, this gives the string of normal unichars.
  std::vector<UNICHAR_ID> normed_ids;
  std::string normed; // normalized version of this unichar
  // Contains meta information about the fragment if a unichar represents
  // a fragment of a character, otherwise should be set to nullptr.
  // It is assumed that character fragments are added to the unicharset
  // after the corresponding 'base' characters.
  CHAR_FRAGMENT *fragment;
};
1024  |  |  | 
// Element type of the unichars vector, which is indexed by UNICHAR_ID.
struct UNICHAR_SLOT {
  // Fixed-size character buffer of UNICHAR_LEN + 1 bytes; presumably the
  // unichar's text plus a terminating NUL (confirm against unichar_insert).
  char representation[UNICHAR_LEN + 1];
  // Properties of this unichar, read and written by the get_*/set_* methods.
  UNICHAR_PROPERTIES properties;
};
1029  |  |  | 
1030  |  |   // Internal recursive version of encode_string above.  | 
1031  |  |   // str is the start of the whole string.  | 
1032  |  |   // str_index is the current position in str.  | 
1033  |  |   // str_length is the length of str.  | 
1034  |  |   // encoding is a working encoding of str.  | 
1035  |  |   // lengths is a working set of lengths of each element of encoding.  | 
1036  |  |   // best_total_length is the longest length of str that has been successfully  | 
1037  |  |   // encoded so far.  | 
1038  |  |   // On return:  | 
1039  |  |   // best_encoding contains the encoding that used the longest part of str.  | 
1040  |  |   // best_lengths (may be null) contains the lengths of best_encoding.  | 
1041  |  |   void encode_string(const char *str, int str_index, int str_length,  | 
1042  |  |                      std::vector<UNICHAR_ID> *encoding,  | 
1043  |  |                      std::vector<char> *lengths, unsigned *best_total_length,  | 
1044  |  |                      std::vector<UNICHAR_ID> *best_encoding,  | 
1045  |  |                      std::vector<char> *best_lengths) const;  | 
1046  |  |  | 
1047  |  |   // Gets the properties for a grapheme string, combining properties for  | 
1048  |  |   // multiple characters in a meaningful way where possible.  | 
1049  |  |   // Returns false if no valid match was found in the unicharset.  | 
1050  |  |   // NOTE that script_id, mirror, and other_case refer to this unicharset on  | 
1051  |  |   // return and will need redirecting if the target unicharset is different.  | 
1052  |  |   bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;  | 
1053  |  |  | 
1054  |  |   // Load ourselves from a "file" where our only interface to the file is  | 
1055  |  |   // an implementation of fgets().  This is the parsing primitive accessed by  | 
1056  |  |   // the public routines load_from_file().  | 
1057  |  |   bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,  | 
1058  |  |                       bool skip_fragments);  | 
1059  |  |  | 
1060  |  |   // List of mappings to make when ingesting strings from the outside.  | 
1061  |  |   // The substitutions clean up text that should exist for rendering of  | 
1062  |  |   // synthetic data, but not in the recognition set.  | 
1063  |  |   static const char *kCleanupMaps[][2];  | 
1064  |  |   static const char *null_script;  | 
1065  |  |  | 
1066  |  |   std::vector<UNICHAR_SLOT> unichars;  | 
1067  |  |   UNICHARMAP ids;  | 
1068  |  |   char **script_table;  | 
1069  |  |   int script_table_size_used;  | 
1070  |  |   int script_table_size_reserved;  | 
1071  |  |   // True if the unichars have their tops/bottoms set.  | 
1072  |  |   bool top_bottom_set_;  | 
1073  |  |   // True if the unicharset has significant upper/lower case chars.  | 
1074  |  |   bool script_has_upper_lower_;  | 
1075  |  |   // True if the unicharset has a significant mean-line with significant  | 
1076  |  |   // ascenders above that.  | 
1077  |  |   bool script_has_xheight_;  | 
1078  |  |   // True if the set contains chars that would be changed by the cleanup.  | 
1079  |  |   bool old_style_included_;  | 
1080  |  |  | 
1081  |  |   // A few convenient script name-to-id mappings without using a hash.  | 
1082  |  |   // These are initialized when unicharset file is loaded.  Anything  | 
1083  |  |   // missing from this list can be looked up using get_script_id_from_name.  | 
1084  |  |   int null_sid_;  | 
1085  |  |   int common_sid_;  | 
1086  |  |   int latin_sid_;  | 
1087  |  |   int cyrillic_sid_;  | 
1088  |  |   int greek_sid_;  | 
1089  |  |   int han_sid_;  | 
1090  |  |   int hiragana_sid_;  | 
1091  |  |   int katakana_sid_;  | 
1092  |  |   int thai_sid_;  | 
1093  |  |   int hangul_sid_;  | 
1094  |  |   // The most frequently occurring script in the charset.  | 
1095  |  |   int default_sid_;  | 
1096  |  | };  | 
1097  |  |  | 
1098  |  | } // namespace tesseract  | 
1099  |  |  | 
1100  |  | #endif // TESSERACT_CCUTIL_UNICHARSET_H_  |