/src/tesseract/include/tesseract/unichar.h
Line | Count | Source (jump to first uncovered line) |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // File: unichar.h |
3 | | // Description: Unicode character/ligature class. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | |
17 | | #ifndef TESSERACT_CCUTIL_UNICHAR_H_ |
18 | | #define TESSERACT_CCUTIL_UNICHAR_H_ |
19 | | |
20 | | #include "export.h" |
21 | | |
22 | | #include <memory.h> |
23 | | #include <cstring> |
24 | | #include <string> |
25 | | #include <vector> |
26 | | |
27 | | namespace tesseract { |
28 | | |
29 | | // Maximum number of chars (UTF-8 bytes) that can be stored in a UNICHAR. Must |
30 | | // be at least 4. Must not exceed 31 without changing the coding of length. |
31 | 1.15G | #define UNICHAR_LEN 30 |
32 | | |
33 | | // A UNICHAR_ID is the unique id of a unichar. |
34 | | using UNICHAR_ID = int; |
35 | | |
36 | | // A constant indicating an invalid or uninitialized unichar id. |
37 | | static const int INVALID_UNICHAR_ID = -1; |
38 | | // A special unichar that corresponds to INVALID_UNICHAR_ID. |
39 | | static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__"; |
40 | | |
41 | | enum StrongScriptDirection { |
42 | | DIR_NEUTRAL = 0, // Text contains only neutral characters. |
43 | | DIR_LEFT_TO_RIGHT = 1, // Text contains no right-to-left characters. |
44 | | DIR_RIGHT_TO_LEFT = 2, // Text contains no left-to-right characters. |
45 | | DIR_MIX = 3, // Text contains a mixture of left-to-right |
46 | | // and right-to-left characters. |
47 | | }; |
48 | | |
49 | | using char32 = signed int; |
50 | | |
51 | | // The UNICHAR class holds a single classification result. This may be |
52 | | // a single Unicode character (stored as between 1 and 4 utf8 bytes) or |
53 | | // multiple Unicode characters representing the NFKC expansion of a ligature |
54 | | // such as fi, ffl etc. These are also stored as utf8. |
55 | | class TESS_API UNICHAR { |
56 | | public: |
57 | 0 | UNICHAR() { |
58 | 0 | memset(chars, 0, UNICHAR_LEN); |
59 | 0 | } |
60 | | |
61 | | // Construct from a utf8 string. If len<0 then the string is null terminated. |
62 | | // If the string is too long to fit in the UNICHAR then it takes only what |
63 | | // will fit. |
64 | | UNICHAR(const char *utf8_str, int len); |
65 | | |
66 | | // Construct from a single UCS4 character. |
67 | | explicit UNICHAR(int unicode); |
68 | | |
69 | | // Default copy constructor and operator= are OK. |
70 | | |
71 | | // Get the first character as UCS-4. |
72 | | int first_uni() const; |
73 | | |
74 | | // Get the length of the UTF8 string in bytes (never more than UNICHAR_LEN). |
75 | 0 | int utf8_len() const { |
76 | 0 | int len = chars[UNICHAR_LEN - 1]; |
77 | 0 | return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; |
78 | 0 | } |
79 | | |
80 | | // Get a UTF8 string, but NOT null-terminated; use utf8_len() for its length. |
81 | 0 | const char *utf8() const { |
82 | 0 | return chars; |
83 | 0 | } |
84 | | |
85 | | // Get a terminated UTF8 string: Must delete[] it after use. |
86 | | char *utf8_str() const; |
87 | | |
88 | | // Get the number of bytes in the first character of the given utf8 string. |
89 | | static int utf8_step(const char *utf8_str); |
90 | | |
91 | | // A class to simplify iterating over and accessing elements of a UTF8 |
92 | | // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or |
93 | | // take ownership of the underlying byte array. It also does not permit |
94 | | // modification of the array (as the name suggests). |
95 | | // |
96 | | // Example: |
97 | | // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len); |
98 | | // it != UNICHAR::end(str, str_len); |
99 | | // ++it) { |
100 | | // printf("UCS-4 symbol code = %d\n", *it); |
101 | | // char buf[5]; |
102 | | // int char_len = it.get_utf8(buf); buf[char_len] = '\0'; |
103 | | // printf("Char = %s\n", buf); |
104 | | // } |
105 | | class TESS_API const_iterator { |
106 | | using CI = const_iterator; |
107 | | |
108 | | public: |
109 | | // Step to the next UTF8 character. |
110 | | // If the current position is at an illegal UTF8 character, then print an |
111 | | // error message and step by one byte. If the current position is at a |
112 | | // null byte, don't step past it. |
113 | | const_iterator &operator++(); |
114 | | |
115 | | // Return the UCS-4 value at the current position. |
116 | | // If the current position is at an illegal UTF8 value, return a single |
117 | | // space character. |
118 | | int operator*() const; |
119 | | |
120 | | // Store the UTF-8 encoding of the current codepoint into buf, which must be |
121 | | // at least 4 bytes long. Return the number of bytes written. |
122 | | // If the current position is at an illegal UTF8 value, writes a single |
123 | | // space character and returns 1. |
124 | | // Note that this method does not null-terminate the buffer. |
125 | | int get_utf8(char *buf) const; |
126 | | // Returns the number of bytes of the current codepoint. Returns 1 if the |
127 | | // current position is at an illegal UTF8 value. |
128 | | int utf8_len() const; |
129 | | // Returns true if the UTF-8 encoding at the current position is legal. |
130 | | bool is_legal() const; |
131 | | |
132 | | // Return the pointer into the string at the current position. |
133 | 0 | const char *utf8_data() const { |
134 | 0 | return it_; |
135 | 0 | } |
136 | | |
137 | | // Iterator equality operators. |
138 | 0 | friend bool operator==(const CI &lhs, const CI &rhs) { |
139 | 0 | return lhs.it_ == rhs.it_; |
140 | 0 | } |
141 | 0 | friend bool operator!=(const CI &lhs, const CI &rhs) { |
142 | 0 | return !(lhs == rhs); |
143 | 0 | } |
144 | | |
145 | | private: |
146 | | friend class UNICHAR; |
147 | 0 | explicit const_iterator(const char *it) : it_(it) {} |
148 | | |
149 | | const char *it_; // Pointer into the string. |
150 | | }; |
151 | | |
152 | | // Create a start/end iterator pointing to a string. Note that these methods |
153 | | // are static and do NOT create a copy or take ownership of the underlying |
154 | | // array. |
155 | | static const_iterator begin(const char *utf8_str, int byte_length); |
156 | | static const_iterator end(const char *utf8_str, int byte_length); |
157 | | |
158 | | // Converts a utf-8 string to a vector of unicodes. |
159 | | // Returns an empty vector if the input contains invalid UTF-8. |
160 | | static std::vector<char32> UTF8ToUTF32(const char *utf8_str); |
161 | | // Converts a vector of unicodes to a utf8 string. |
162 | | // Returns an empty string if the input contains an invalid unicode. |
163 | | static std::string UTF32ToUTF8(const std::vector<char32> &str32); |
164 | | |
165 | | private: |
166 | | // A UTF-8 representation of 1 or more Unicode characters. |
167 | | // The last element (chars[UNICHAR_LEN - 1]) is a length if |
168 | | // its value is in [0, UNICHAR_LEN), otherwise it is a genuine character. |
169 | | char chars[UNICHAR_LEN]{}; |
170 | | }; |
171 | | |
172 | | } // namespace tesseract |
173 | | |
174 | | #endif // TESSERACT_CCUTIL_UNICHAR_H_ |