/src/tesseract/src/ccutil/unicharcompress.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: unicharcompress.cpp |
3 | | // Description: Unicode re-encoding using a sequence of smaller numbers in |
4 | | // place of a single large code for CJK, similarly for Indic, |
5 | | // and dissection of ligatures for other scripts. |
6 | | // Author: Ray Smith |
7 | | // |
8 | | // (C) Copyright 2015, Google Inc. |
9 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
10 | | // you may not use this file except in compliance with the License. |
11 | | // You may obtain a copy of the License at |
12 | | // http://www.apache.org/licenses/LICENSE-2.0 |
13 | | // Unless required by applicable law or agreed to in writing, software |
14 | | // distributed under the License is distributed on an "AS IS" BASIS, |
15 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | // See the License for the specific language governing permissions and |
17 | | // limitations under the License. |
18 | | // |
19 | | /////////////////////////////////////////////////////////////////////// |
20 | | |
21 | | #include "unicharcompress.h" |
22 | | #include <algorithm> |
23 | | #include <memory> |
24 | | #include "tprintf.h" |
25 | | |
26 | | namespace tesseract { |
27 | | |
// String used to represent the null_id in direct_set; it is also what
// GetEncodingAsString emits for entries that map to the null code.
static const char *kNullChar = "<nul>";
// Radix to make unique values from the stored radical codes.
const int kRadicalRadix = 29;

// "Hash" function for const std::vector<int>: folds the elements together as
// base-kRadicalRadix digits, building a (mostly) unique number for each code
// sequence that we can use as the index in a hash map of ints instead of
// trying to hash the vectors. Long sequences may wrap around and collide;
// callers only use the value as a pre-hash key, so collisions are tolerable.
static int RadicalPreHash(const std::vector<int> &rs) {
  // Accumulate in an unsigned type so overflow is well defined.
  size_t result = 0;
  for (int radical : rs) {
    result *= kRadicalRadix;
    result += radical;
  }
  // Explicit cast documents the intentional narrowing back to int.
  return static_cast<int>(result);
}
44 | | |
// A hash map to convert unicodes to radical encoding: unicode code point ->
// sequence of radical codes read from the radical-stroke table.
using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
// A hash map to count occurrences of each radical encoding, keyed by the
// RadicalPreHash of the radical sequence.
using RSCounts = std::unordered_map<int, int>;
49 | | |
50 | 0 | static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) { |
51 | 0 | if (radical_data_line.empty() || (radical_data_line)[0] == '#') { |
52 | 0 | return true; |
53 | 0 | } |
54 | 0 | std::vector<std::string> entries = split(radical_data_line, ' '); |
55 | 0 | if (entries.size() < 2) { |
56 | 0 | return false; |
57 | 0 | } |
58 | 0 | char *end = nullptr; |
59 | 0 | int unicode = strtol(&entries[0][0], &end, 10); |
60 | 0 | if (*end != '\0') { |
61 | 0 | return false; |
62 | 0 | } |
63 | 0 | std::unique_ptr<std::vector<int>> radicals(new std::vector<int>); |
64 | 0 | for (size_t i = 1; i < entries.size(); ++i) { |
65 | 0 | int radical = strtol(&entries[i][0], &end, 10); |
66 | 0 | if (*end != '\0') { |
67 | 0 | return false; |
68 | 0 | } |
69 | 0 | radicals->push_back(radical); |
70 | 0 | } |
71 | 0 | (*radical_map)[unicode] = std::move(radicals); |
72 | 0 | return true; |
73 | 0 | } |
74 | | |
75 | | // Helper function builds the RSMap from the radical-stroke file, which has |
76 | | // already been read into a string. Returns false on error. |
77 | | // The radical_stroke_table is non-const because it gets split and the caller |
78 | | // is unlikely to want to use it again. |
79 | 0 | static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) { |
80 | 0 | std::vector<std::string> lines = split(radical_data, '\n'); |
81 | 0 | for (unsigned i = 0; i < lines.size(); ++i) { |
82 | 0 | if (!DecodeRadicalLine(lines[i], radical_map)) { |
83 | 0 | tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str()); |
84 | 0 | return false; |
85 | 0 | } |
86 | 0 | } |
87 | 0 | return true; |
88 | 0 | } |
89 | | |
// Constructs an empty compressor; code_range_ of 0 means no codes assigned.
UnicharCompress::UnicharCompress() : code_range_(0) {}
// Copy constructor delegates to operator=, which copies the encoder and
// rebuilds the derived decoder tables.
UnicharCompress::UnicharCompress(const UnicharCompress &src) {
  *this = src;
}
// Destructor releases the heap-allocated code lists via Cleanup().
UnicharCompress::~UnicharCompress() {
  Cleanup();
}
97 | 0 | UnicharCompress &UnicharCompress::operator=(const UnicharCompress &src) { |
98 | 0 | Cleanup(); |
99 | 0 | encoder_ = src.encoder_; |
100 | 0 | code_range_ = src.code_range_; |
101 | 0 | SetupDecoder(); |
102 | 0 | return *this; |
103 | 0 | } |
104 | | |
// Computes the encoding for the given unicharset. It is a requirement that
// the file training/langdata/radical-stroke.txt have been read into the
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
//
// The algorithm runs in three phases:
//  1. Assign a provisional multi-code sequence to every unichar: Han gets its
//     radical codes (offset by han_offset), Hangul gets its Jamo indices
//     (offset by hangul_offset), everything else gets a sequence of
//     direct_set unichar-ids.
//  2. Renumber the Han code positions so codes at different positions do not
//     collide (the loop over kMaxCodeLen below).
//  3. DefragmentCodeValues removes unused values and SetupDecoder rebuilds
//     the decoding tables.
bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
                                      std::string *radical_stroke_table) {
  RSMap radical_map;
  if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {
    return false;
  }
  encoder_.clear();
  UNICHARSET direct_set;
  // To avoid unused codes, clear the special codes from the direct_set.
  direct_set.clear();
  // Always keep space as 0;
  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
  }
  RSCounts radical_counts;
  // In the initial map, codes [0, unicharset.size()) are
  // reserved for non-han/hangul sequences of 1 or more unicodes.
  int hangul_offset = unicharset.size();
  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
  const int kTotalJamos = kLCount + kVCount + kTCount;
  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
  // to measure the number of radicals and strokes, initially we use the same
  // code range for all 3 Han code positions, and fix them after.
  int han_offset = hangul_offset + kTotalJamos;
  // Note the inclusive bound: u == unicharset.size() is reached only when
  // null_id equals unicharset.size() (see the break below).
  for (unsigned u = 0; u <= unicharset.size(); ++u) {
    // We special-case allow null_id to be equal to unicharset.size() in case
    // there is no space in unicharset for it.
    if (u == unicharset.size() && static_cast<int>(u) != null_id) {
      break; // Finished
    }
    RecodedCharID code;
    // Convert to unicodes.
    std::vector<char32> unicodes;
    std::string cleaned;
    if (u < unicharset.size()) {
      cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
    }
    if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
      // Check single unicodes for Hangul/Han and encode if so.
      int unicode = unicodes[0];
      int leading, vowel, trailing;
      auto it = radical_map.find(unicode);
      if (it != radical_map.end()) {
        // This is Han. Use the radical codes directly.
        int num_radicals = it->second->size();
        for (int c = 0; c < num_radicals; ++c) {
          code.Set(c, han_offset + (*it->second)[c]);
        }
        // Characters sharing the same radical sequence get an extra
        // disambiguation code based on how many we have seen so far.
        int pre_hash = RadicalPreHash(*it->second);
        int num_samples = radical_counts[pre_hash]++;
        if (num_samples > 0) {
          code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
        }
      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
        // This is Hangul. Since we know the exact size of each part at compile
        // time, it gets the bottom set of codes.
        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                  trailing + kLCount + kVCount + hangul_offset);
      }
    }
    // If the code is still empty, it wasn't Han or Hangul.
    if (code.empty()) {
      // Special cases.
      if (u == UNICHAR_SPACE) {
        code.Set(0, 0); // Space.
      } else if (static_cast<int>(u) == null_id ||
                 (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
        // Special unichars and the null char all share the null code.
        code.Set(0, direct_set.unichar_to_id(kNullChar));
      } else {
        // Add the direct_set unichar-ids of the unicodes in sequence to the
        // code.
        for (int uni : unicodes) {
          int position = code.length();
          if (position >= RecodedCharID::kMaxCodeLen) {
            // NOTE(review): %d receives the unsigned u; benign for realistic
            // unicharset sizes but technically mismatched.
            tprintf("Unichar %d=%s is too long to encode!!\n", u, unicharset.id_to_unichar(u));
            return false;
          }
          UNICHAR unichar(uni);
          char *utf8 = unichar.utf8_str();
          if (!direct_set.contains_unichar(utf8)) {
            direct_set.unichar_insert(utf8);
          }
          code.Set(position, direct_set.unichar_to_id(utf8));
          delete[] utf8;
          // The encoding only wins if the direct set stays no bigger than the
          // original unicharset (allowing one extra slot for <nul>).
          if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) {
            // Code space got bigger!
            tprintf("Code space expanded from original unicharset!!\n");
            return false;
          }
        }
      }
    }
    encoder_.push_back(code);
  }
  // Now renumber Han to make all codes unique. We already added han_offset to
  // all Han. Now separate out the radical, stroke, and count codes for Han.
  // Each code position i gets shifted up by the cumulative width of the
  // positions before it, so values at different positions cannot collide.
  int code_offset = 0;
  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
    int max_offset = 0;
    for (unsigned u = 0; u < unicharset.size(); ++u) {
      RecodedCharID *code = &encoder_[u];
      if (code->length() <= i) {
        continue;
      }
      max_offset = std::max(max_offset, (*code)(i)-han_offset);
      code->Set(i, (*code)(i) + code_offset);
    }
    if (max_offset == 0) {
      break;
    }
    code_offset += max_offset + 1;
  }
  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
  SetupDecoder();
  return true;
}
227 | | |
228 | | // Sets up an encoder that doesn't change the unichars at all, so it just |
229 | | // passes them through unchanged. |
230 | 0 | void UnicharCompress::SetupPassThrough(const UNICHARSET &unicharset) { |
231 | 0 | std::vector<RecodedCharID> codes; |
232 | 0 | for (unsigned u = 0; u < unicharset.size(); ++u) { |
233 | 0 | RecodedCharID code; |
234 | 0 | code.Set(0, u); |
235 | 0 | codes.push_back(code); |
236 | 0 | } |
237 | 0 | if (!unicharset.has_special_codes()) { |
238 | 0 | RecodedCharID code; |
239 | 0 | code.Set(0, unicharset.size()); |
240 | 0 | codes.push_back(code); |
241 | 0 | } |
242 | 0 | SetupDirect(codes); |
243 | 0 | } |
244 | | |
// Sets up an encoder directly using the given encoding vector, which maps
// unichar_ids to the given codes.
void UnicharCompress::SetupDirect(const std::vector<RecodedCharID> &codes) {
  encoder_ = codes;
  // code_range_ must be current before SetupDecoder sizes is_valid_start_.
  ComputeCodeRange();
  SetupDecoder();
}
252 | | |
// Renumbers codes to eliminate unused values, and (if encoded_null >= 0)
// moves the null code to the end of the code space.
void UnicharCompress::DefragmentCodeValues(int encoded_null) {
  // There may not be any Hangul, but even if there is, it is possible that not
  // all codes are used. Likewise with the Han encoding, it is possible that not
  // all numbers of strokes are used.
  ComputeCodeRange();
  // offsets is first used as a 0/1 "is used" flag per code value, then
  // re-purposed to hold the (negative) shift to apply to each used value.
  std::vector<int> offsets(code_range_);
  // Find which codes are used
  for (auto &code : encoder_) {
    for (int i = 0; i < code.length(); ++i) {
      offsets[code(i)] = 1;
    }
  }
  // Compute offsets based on code use.
  int offset = 0;
  for (unsigned i = 0; i < offsets.size(); ++i) {
    // If not used, decrement everything above here.
    // We are moving encoded_null to the end, so it is not "used".
    // Unused slots keep whatever shift value is irrelevant, since no code in
    // encoder_ references them.
    if (offsets[i] == 0 || i == static_cast<unsigned>(encoded_null)) {
      --offset;
    } else {
      offsets[i] = offset;
    }
  }
  if (encoded_null >= 0) {
    // The encoded_null is moving to the end, for the benefit of TensorFlow,
    // which is offsets.size() + offsets.back(). The stored value is the delta
    // that maps encoded_null to that final position.
    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
  }
  // Now apply the offsets.
  for (auto &c : encoder_) {
    RecodedCharID *code = &c;
    for (int i = 0; i < code->length(); ++i) {
      int value = (*code)(i);
      code->Set(i, value + offsets[value]);
    }
  }
  // Re-derive code_range_ from the compacted values.
  ComputeCodeRange();
}
292 | | |
293 | | // Encodes a single unichar_id. Returns the length of the code, or zero if |
294 | | // invalid input, and the encoding itself |
295 | 2 | int UnicharCompress::EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const { |
296 | 2 | if (unichar_id >= encoder_.size()) { |
297 | 0 | return 0; |
298 | 0 | } |
299 | 2 | *code = encoder_[unichar_id]; |
300 | 2 | return code->length(); |
301 | 2 | } |
302 | | |
303 | | // Decodes code, returning the original unichar-id, or |
304 | | // INVALID_UNICHAR_ID if the input is invalid. |
305 | 13.7M | int UnicharCompress::DecodeUnichar(const RecodedCharID &code) const { |
306 | 13.7M | int len = code.length(); |
307 | 13.7M | if (len <= 0 || len > RecodedCharID::kMaxCodeLen) { |
308 | 0 | return INVALID_UNICHAR_ID; |
309 | 0 | } |
310 | 13.7M | auto it = decoder_.find(code); |
311 | 13.7M | if (it == decoder_.end()) { |
312 | 0 | return INVALID_UNICHAR_ID; |
313 | 0 | } |
314 | 13.7M | return it->second; |
315 | 13.7M | } |
316 | | |
// Writes to the given file. Returns false in case of error.
bool UnicharCompress::Serialize(TFile *fp) const {
  // Only encoder_ is persisted; the decoder tables are rebuilt on load.
  return fp->Serialize(encoder_);
}
321 | | |
322 | | // Reads from the given file. Returns false in case of error. |
323 | 2 | bool UnicharCompress::DeSerialize(TFile *fp) { |
324 | 2 | if (!fp->DeSerialize(encoder_)) { |
325 | 0 | return false; |
326 | 0 | } |
327 | 2 | ComputeCodeRange(); |
328 | 2 | SetupDecoder(); |
329 | 2 | return true; |
330 | 2 | } |
331 | | |
332 | | // Returns a string containing a text file that describes the encoding thus: |
333 | | // <index>[,<index>]*<tab><UTF8-str><newline> |
334 | | // In words, a comma-separated list of one or more indices, followed by a tab |
335 | | // and the UTF-8 string that the code represents per line. Most simple scripts |
336 | | // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean |
337 | | // and the Indic scripts will contain a many-to-many mapping. |
338 | | // See the class comment above for details. |
339 | 0 | std::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const { |
340 | 0 | std::string encoding; |
341 | 0 | for (unsigned c = 0; c < encoder_.size(); ++c) { |
342 | 0 | const RecodedCharID &code = encoder_[c]; |
343 | 0 | if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) { |
344 | | // Don't show the duplicate entry. |
345 | 0 | continue; |
346 | 0 | } |
347 | 0 | encoding += std::to_string(code(0)); |
348 | 0 | for (int i = 1; i < code.length(); ++i) { |
349 | 0 | encoding += "," + std::to_string(code(i)); |
350 | 0 | } |
351 | 0 | encoding += "\t"; |
352 | 0 | if (c >= unicharset.size() || |
353 | 0 | (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) { |
354 | 0 | encoding += kNullChar; |
355 | 0 | } else { |
356 | 0 | encoding += unicharset.id_to_unichar(c); |
357 | 0 | } |
358 | 0 | encoding += "\n"; |
359 | 0 | } |
360 | 0 | return encoding; |
361 | 0 | } |
362 | | |
363 | | // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing. |
364 | | // Note that the returned values are 0-based indices, NOT unicode Jamo. |
365 | | // Returns false if the input is not in the Hangul unicode range. |
366 | | /* static */ |
367 | 0 | bool UnicharCompress::DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing) { |
368 | 0 | if (unicode < kFirstHangul) { |
369 | 0 | return false; |
370 | 0 | } |
371 | 0 | int offset = unicode - kFirstHangul; |
372 | 0 | if (offset >= kNumHangul) { |
373 | 0 | return false; |
374 | 0 | } |
375 | 0 | const int kNCount = kVCount * kTCount; |
376 | 0 | *leading = offset / kNCount; |
377 | 0 | *vowel = (offset % kNCount) / kTCount; |
378 | 0 | *trailing = offset % kTCount; |
379 | 0 | return true; |
380 | 0 | } |
381 | | |
382 | | // Computes the value of code_range_ from the encoder_. |
383 | 2 | void UnicharCompress::ComputeCodeRange() { |
384 | 2 | code_range_ = -1; |
385 | 224 | for (auto &code : encoder_) { |
386 | 448 | for (int i = 0; i < code.length(); ++i) { |
387 | 224 | if (code(i) > code_range_) { |
388 | 4 | code_range_ = code(i); |
389 | 4 | } |
390 | 224 | } |
391 | 224 | } |
392 | 2 | ++code_range_; |
393 | 2 | } |
394 | | |
// Initializes the decoding hash_map (and the prefix tables) from the
// encoding array:
//   decoder_         full code sequence -> unichar id.
//   is_valid_start_  flag per code value: may it begin a sequence?
//   final_codes_     prefix of length len-1 -> list of valid final codes.
//   next_codes_      shorter prefix -> list of valid continuation codes.
void UnicharCompress::SetupDecoder() {
  // Discard any previous tables before rebuilding.
  Cleanup();
  is_valid_start_.clear();
  is_valid_start_.resize(code_range_);
  for (unsigned c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID &code = encoder_[c];
    decoder_[code] = c;
    is_valid_start_[code(0)] = true;
    // prefix is code with its last element removed.
    RecodedCharID prefix = code;
    int len = code.length() - 1;
    prefix.Truncate(len);
    auto final_it = final_codes_.find(prefix);
    if (final_it == final_codes_.end()) {
      // First time we see this prefix: start its final-code list, then walk
      // back through the shorter prefixes recording the continuation codes.
      auto *code_list = new std::vector<int>;
      code_list->push_back(code(len));
      final_codes_[prefix] = code_list;
      while (--len >= 0) {
        prefix.Truncate(len);
        auto next_it = next_codes_.find(prefix);
        if (next_it == next_codes_.end()) {
          auto *code_list = new std::vector<int>;
          code_list->push_back(code(len));
          next_codes_[prefix] = code_list;
        } else {
          // We still have to search the list as we may get here via multiple
          // lengths of code.
          if (!contains(*next_it->second, code(len))) {
            next_it->second->push_back(code(len));
          }
          break; // This prefix has been processed.
        }
      }
    } else {
      // Known prefix: just record another possible final code.
      if (!contains(*final_it->second, code(len))) {
        final_it->second->push_back(code(len));
      }
    }
  }
}
435 | | |
436 | | // Frees allocated memory. |
437 | 2 | void UnicharCompress::Cleanup() { |
438 | 2 | decoder_.clear(); |
439 | 2 | is_valid_start_.clear(); |
440 | 2 | for (auto &next_code : next_codes_) { |
441 | 0 | delete next_code.second; |
442 | 0 | } |
443 | 2 | for (auto &final_code : final_codes_) { |
444 | 0 | delete final_code.second; |
445 | 0 | } |
446 | 2 | next_codes_.clear(); |
447 | 2 | final_codes_.clear(); |
448 | 2 | } |
449 | | |
450 | | } // namespace tesseract. |