Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccutil/unicharcompress.cpp
Line  Count   Source
1             ///////////////////////////////////////////////////////////////////////
2             // File:        unicharcompress.cpp
3             // Description: Unicode re-encoding using a sequence of smaller numbers in
4             //              place of a single large code for CJK, similarly for Indic,
5             //              and dissection of ligatures for other scripts.
6             // Author:      Ray Smith
7             //
8             // (C) Copyright 2015, Google Inc.
9             // Licensed under the Apache License, Version 2.0 (the "License");
10            // you may not use this file except in compliance with the License.
11            // You may obtain a copy of the License at
12            // http://www.apache.org/licenses/LICENSE-2.0
13            // Unless required by applicable law or agreed to in writing, software
14            // distributed under the License is distributed on an "AS IS" BASIS,
15            // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16            // See the License for the specific language governing permissions and
17            // limitations under the License.
18            //
19            ///////////////////////////////////////////////////////////////////////
20
21            #include "unicharcompress.h"
22            #include <algorithm>
23            #include <memory>
24            #include "tprintf.h"
25
26            namespace tesseract {
27
28            // String used to represent the null_id in direct_set.
29            static const char *kNullChar = "<nul>";
30            // Radix to make unique values from the stored radical codes.
31            const int kRadicalRadix = 29;
32
33            // "Hash" function for const std::vector<int> computes the sum of elements.
34            // Build a unique number for each code sequence that we can use as the index in
35            // a hash map of ints instead of trying to hash the vectors.
36    0       static int RadicalPreHash(const std::vector<int> &rs) {
37    0         size_t result = 0;
38    0         for (int radical : rs) {
39    0           result *= kRadicalRadix;
40    0           result += radical;
41    0         }
42    0         return result;
43    0       }
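For reference, RadicalPreHash folds the radical codes together positionally in base kRadicalRadix (29); since individual codes can exceed the radix, the result is a prehash rather than a guaranteed-unique key. A short worked example with made-up radical codes:

    // rs = {3, 7, 12}, kRadicalRadix = 29
    //   result = 0*29  + 3  = 3
    //   result = 3*29  + 7  = 94
    //   result = 94*29 + 12 = 2738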
44
45            // A hash map to convert unicodes to radical encoding.
46            using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
47            // A hash map to count occurrences of each radical encoding.
48            using RSCounts = std::unordered_map<int, int>;
49
50    0       static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) {
51    0         if (radical_data_line.empty() || (radical_data_line)[0] == '#') {
52    0           return true;
53    0         }
54    0         std::vector<std::string> entries = split(radical_data_line, ' ');
55    0         if (entries.size() < 2) {
56    0           return false;
57    0         }
58    0         char *end = nullptr;
59    0         int unicode = strtol(&entries[0][0], &end, 10);
60    0         if (*end != '\0') {
61    0           return false;
62    0         }
63    0         std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
64    0         for (size_t i = 1; i < entries.size(); ++i) {
65    0           int radical = strtol(&entries[i][0], &end, 10);
66    0           if (*end != '\0') {
67    0             return false;
68    0           }
69    0           radicals->push_back(radical);
70    0         }
71    0         (*radical_map)[unicode] = std::move(radicals);
72    0         return true;
73    0       }
74
75            // Helper function builds the RSMap from the radical-stroke file, which has
76            // already been read into a string. Returns false on error.
77            // The radical_stroke_table is non-const because it gets split and the caller
78            // is unlikely to want to use it again.
79    0       static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) {
80    0         std::vector<std::string> lines = split(radical_data, '\n');
81    0         for (unsigned i = 0; i < lines.size(); ++i) {
82    0           if (!DecodeRadicalLine(lines[i], radical_map)) {
83    0             tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str());
84    0             return false;
85    0           }
86    0         }
87    0         return true;
88    0       }
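DecodeRadicalLine above accepts '#' comment lines and otherwise expects space-separated decimal integers: a unicode code point followed by one or more radical codes. A minimal sketch of driving DecodeRadicalTable with such data (both helpers are static to this file, so this assumes code in the same translation unit; the radical values are invented for illustration):

    std::string radical_data =
        "# comment lines are skipped\n"
        "20013 2 3\n"    // U+4E2D -> radicals {2, 3} (invented values)
        "22909 38 6\n";  // U+597D -> radicals {38, 6} (invented values)
    RSMap radical_map;
    if (DecodeRadicalTable(radical_data, &radical_map)) {
      // radical_map[20013] now owns a std::vector<int>{2, 3}.
    }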
89
90    2       UnicharCompress::UnicharCompress() : code_range_(0) {}
91    0       UnicharCompress::UnicharCompress(const UnicharCompress &src) {
92    0         *this = src;
93    0       }
94    0       UnicharCompress::~UnicharCompress() {
95    0         Cleanup();
96    0       }
97    0       UnicharCompress &UnicharCompress::operator=(const UnicharCompress &src) {
98    0         Cleanup();
99    0         encoder_ = src.encoder_;
100   0         code_range_ = src.code_range_;
101   0         SetupDecoder();
102   0         return *this;
103   0       }
104
105           // Computes the encoding for the given unicharset. It is a requirement that
106           // the file training/langdata/radical-stroke.txt have been read into the
107           // input string radical_stroke_table.
108           // Returns false if the encoding cannot be constructed.
109           bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
110   0                                             std::string *radical_stroke_table) {
111   0         RSMap radical_map;
112   0         if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {
113   0           return false;
114   0         }
115   0         encoder_.clear();
116   0         UNICHARSET direct_set;
117             // To avoid unused codes, clear the special codes from the direct_set.
118   0         direct_set.clear();
119             // Always keep space as 0;
120   0         direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
121             // Null char is next if we have one.
122   0         if (null_id >= 0) {
123   0           direct_set.unichar_insert(kNullChar);
124   0         }
125   0         RSCounts radical_counts;
126             // In the initial map, codes [0, unicharset.size()) are
127             // reserved for non-han/hangul sequences of 1 or more unicodes.
128   0         int hangul_offset = unicharset.size();
129             // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
130   0         const int kTotalJamos = kLCount + kVCount + kTCount;
131             // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
132             // to measure the number of radicals and strokes, initially we use the same
133             // code range for all 3 Han code positions, and fix them after.
134   0         int han_offset = hangul_offset + kTotalJamos;
135   0         for (unsigned u = 0; u <= unicharset.size(); ++u) {
136               // We special-case allow null_id to be equal to unicharset.size() in case
137               // there is no space in unicharset for it.
138   0           if (u == unicharset.size() && static_cast<int>(u) != null_id) {
139   0             break; // Finished
140   0           }
141   0           RecodedCharID code;
142               // Convert to unicodes.
143   0           std::vector<char32> unicodes;
144   0           std::string cleaned;
145   0           if (u < unicharset.size()) {
146   0             cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
147   0           }
148   0           if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
149                 // Check single unicodes for Hangul/Han and encode if so.
150   0             int unicode = unicodes[0];
151   0             int leading, vowel, trailing;
152   0             auto it = radical_map.find(unicode);
153   0             if (it != radical_map.end()) {
154                   // This is Han. Use the radical codes directly.
155   0               int num_radicals = it->second->size();
156   0               for (int c = 0; c < num_radicals; ++c) {
157   0                 code.Set(c, han_offset + (*it->second)[c]);
158   0               }
159   0               int pre_hash = RadicalPreHash(*it->second);
160   0               int num_samples = radical_counts[pre_hash]++;
161   0               if (num_samples > 0) {
162   0                 code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
163   0               }
164   0             } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
165                   // This is Hangul. Since we know the exact size of each part at compile
166                   // time, it gets the bottom set of codes.
167   0               code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
168   0                         trailing + kLCount + kVCount + hangul_offset);
169   0             }
170   0           }
171               // If the code is still empty, it wasn't Han or Hangul.
172   0           if (code.empty()) {
173                 // Special cases.
174   0             if (u == UNICHAR_SPACE) {
175   0               code.Set(0, 0); // Space.
176   0             } else if (static_cast<int>(u) == null_id ||
177   0                        (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
178   0               code.Set(0, direct_set.unichar_to_id(kNullChar));
179   0             } else {
180                   // Add the direct_set unichar-ids of the unicodes in sequence to the
181                   // code.
182   0               for (int uni : unicodes) {
183   0                 int position = code.length();
184   0                 if (position >= RecodedCharID::kMaxCodeLen) {
185   0                   tprintf("Unichar %d=%s is too long to encode!!\n", u, unicharset.id_to_unichar(u));
186   0                   return false;
187   0                 }
188   0                 UNICHAR unichar(uni);
189   0                 char *utf8 = unichar.utf8_str();
190   0                 if (!direct_set.contains_unichar(utf8)) {
191   0                   direct_set.unichar_insert(utf8);
192   0                 }
193   0                 code.Set(position, direct_set.unichar_to_id(utf8));
194   0                 delete[] utf8;
195   0                 if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) {
196                       // Code space got bigger!
197   0                   tprintf("Code space expanded from original unicharset!!\n");
198   0                   return false;
199   0                 }
200   0               }
201   0             }
202   0           }
203   0           encoder_.push_back(code);
204   0         }
205             // Now renumber Han to make all codes unique. We already added han_offset to
206             // all Han. Now separate out the radical, stroke, and count codes for Han.
207   0         int code_offset = 0;
208   0         for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
209   0           int max_offset = 0;
210   0           for (unsigned u = 0; u < unicharset.size(); ++u) {
211   0             RecodedCharID *code = &encoder_[u];
212   0             if (code->length() <= i) {
213   0               continue;
214   0             }
215   0             max_offset = std::max(max_offset, (*code)(i)-han_offset);
216   0             code->Set(i, (*code)(i) + code_offset);
217   0           }
218   0           if (max_offset == 0) {
219   0             break;
220   0           }
221   0           code_offset += max_offset + 1;
222   0         }
223   0         DefragmentCodeValues(null_id >= 0 ? 1 : -1);
224   0         SetupDecoder();
225   0         return true;
226   0       }
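A minimal usage sketch for ComputeEncoding, assuming a UNICHARSET has already been loaded and the radical-stroke file has been read into a string (the path and the null_id choice here are illustrative only):

    UNICHARSET unicharset;
    unicharset.load_from_file("eng.unicharset");   // illustrative path
    std::string radical_stroke_data;               // contents of radical-stroke.txt
    UnicharCompress compress;
    // null_id may equal unicharset.size() (see the special case in the loop above).
    int null_id = unicharset.size();
    if (compress.ComputeEncoding(unicharset, null_id, &radical_stroke_data)) {
      RecodedCharID code;
      int len = compress.EncodeUnichar(unicharset.unichar_to_id("a"), &code);
      // For a simple script, len is 1 and code(0) is the compressed code for "a".
    }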
227
228           // Sets up an encoder that doesn't change the unichars at all, so it just
229           // passes them through unchanged.
230   0       void UnicharCompress::SetupPassThrough(const UNICHARSET &unicharset) {
231   0         std::vector<RecodedCharID> codes;
232   0         for (unsigned u = 0; u < unicharset.size(); ++u) {
233   0           RecodedCharID code;
234   0           code.Set(0, u);
235   0           codes.push_back(code);
236   0         }
237   0         if (!unicharset.has_special_codes()) {
238   0           RecodedCharID code;
239   0           code.Set(0, unicharset.size());
240   0           codes.push_back(code);
241   0         }
242   0         SetupDirect(codes);
243   0       }
244
245           // Sets up an encoder directly using the given encoding vector, which maps
246           // unichar_ids to the given codes.
247   0       void UnicharCompress::SetupDirect(const std::vector<RecodedCharID> &codes) {
248   0         encoder_ = codes;
249   0         ComputeCodeRange();
250   0         SetupDecoder();
251   0       }
252
253           // Renumbers codes to eliminate unused values.
254   0       void UnicharCompress::DefragmentCodeValues(int encoded_null) {
255             // There may not be any Hangul, but even if there is, it is possible that not
256             // all codes are used. Likewise with the Han encoding, it is possible that not
257             // all numbers of strokes are used.
258   0         ComputeCodeRange();
259   0         std::vector<int> offsets(code_range_);
260             // Find which codes are used
261   0         for (auto &code : encoder_) {
262   0           for (int i = 0; i < code.length(); ++i) {
263   0             offsets[code(i)] = 1;
264   0           }
265   0         }
266             // Compute offsets based on code use.
267   0         int offset = 0;
268   0         for (unsigned i = 0; i < offsets.size(); ++i) {
269               // If not used, decrement everything above here.
270               // We are moving encoded_null to the end, so it is not "used".
271   0           if (offsets[i] == 0 || i == static_cast<unsigned>(encoded_null)) {
272   0             --offset;
273   0           } else {
274   0             offsets[i] = offset;
275   0           }
276   0         }
277   0         if (encoded_null >= 0) {
278               // The encoded_null is moving to the end, for the benefit of TensorFlow,
279               // which is offsets.size() + offsets.back().
280   0           offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
281   0         }
282             // Now apply the offsets.
283   0         for (auto &c : encoder_) {
284   0           RecodedCharID *code = &c;
285   0           for (int i = 0; i < code->length(); ++i) {
286   0             int value = (*code)(i);
287   0             code->Set(i, value + offsets[value]);
288   0           }
289   0         }
290   0         ComputeCodeRange();
291   0       }
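To make the offset bookkeeping in DefragmentCodeValues concrete, a worked example with a small hypothetical code space: code_range_ = 6, encoded_null = 1, and only the values 0, 1, 3 and 5 appearing in encoder_:

    // After the "used" pass:   offsets = {1, 1, 0, 1, 0, 1}
    // After the offset pass:   offsets[0] = 0, offsets[3] = -2, offsets[5] = -3
    //   (unused values 2 and 4, plus the null at 1, each decrement the running offset)
    // Moving the null to the end: offsets[1] = 6 + (-3) - 1 = 2
    // Applying value + offsets[value]: 0 -> 0, 3 -> 1, 5 -> 2, 1 (null) -> 3,
    // so the surviving codes are repacked into [0, 3] with the null last.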
292
293           // Encodes a single unichar_id. Returns the length of the code, or zero if
294           // invalid input, and the encoding itself
295   2       int UnicharCompress::EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const {
296   2         if (unichar_id >= encoder_.size()) {
297   0           return 0;
298   0         }
299   2         *code = encoder_[unichar_id];
300   2         return code->length();
301   2       }
302
303           // Decodes code, returning the original unichar-id, or
304           // INVALID_UNICHAR_ID if the input is invalid.
305   13.7M   int UnicharCompress::DecodeUnichar(const RecodedCharID &code) const {
306   13.7M     int len = code.length();
307   13.7M     if (len <= 0 || len > RecodedCharID::kMaxCodeLen) {
308   0           return INVALID_UNICHAR_ID;
309   0         }
310   13.7M     auto it = decoder_.find(code);
311   13.7M     if (it == decoder_.end()) {
312   0           return INVALID_UNICHAR_ID;
313   0         }
314   13.7M     return it->second;
315   13.7M   }
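EncodeUnichar and DecodeUnichar invert each other for any id that has a unique code (ids that share a code, as the special codes may, decode to whichever was registered last). A short round-trip sketch, reusing the compress object and a unichar_id from the earlier sketch:

    RecodedCharID code;
    if (compress.EncodeUnichar(unichar_id, &code) > 0) {
      int decoded = compress.DecodeUnichar(code);
      // decoded recovers unichar_id; DecodeUnichar returns INVALID_UNICHAR_ID
      // for a code the encoder never produced.
    }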
316
317           // Writes to the given file. Returns false in case of error.
318   0       bool UnicharCompress::Serialize(TFile *fp) const {
319   0         return fp->Serialize(encoder_);
320   0       }
321
322           // Reads from the given file. Returns false in case of error.
323   2       bool UnicharCompress::DeSerialize(TFile *fp) {
324   2         if (!fp->DeSerialize(encoder_)) {
325   0           return false;
326   0         }
327   2         ComputeCodeRange();
328   2         SetupDecoder();
329   2         return true;
330   2       }
331
332           // Returns a string containing a text file that describes the encoding thus:
333           // <index>[,<index>]*<tab><UTF8-str><newline>
334           // In words, a comma-separated list of one or more indices, followed by a tab
335           // and the UTF-8 string that the code represents per line. Most simple scripts
336           // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
337           // and the Indic scripts will contain a many-to-many mapping.
338           // See the class comment above for details.
339   0       std::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const {
340   0         std::string encoding;
341   0         for (unsigned c = 0; c < encoder_.size(); ++c) {
342   0           const RecodedCharID &code = encoder_[c];
343   0           if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
344                 // Don't show the duplicate entry.
345   0             continue;
346   0           }
347   0           encoding += std::to_string(code(0));
348   0           for (int i = 1; i < code.length(); ++i) {
349   0             encoding += "," + std::to_string(code(i));
350   0           }
351   0           encoding += "\t";
352   0           if (c >= unicharset.size() ||
353   0               (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) {
354   0             encoding += kNullChar;
355   0           } else {
356   0             encoding += unicharset.id_to_unichar(c);
357   0           }
358   0           encoding += "\n";
359   0         }
360   0         return encoding;
361   0       }
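Illustrative GetEncodingAsString output, with \t standing for the literal tab and the index values invented: a Latin letter usually gets a single index, while a Han character gets its radical-code sequence:

    17\ta
    18\tb
    310,45,7\t好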
362
363           // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
364           // Note that the returned values are 0-based indices, NOT unicode Jamo.
365           // Returns false if the input is not in the Hangul unicode range.
366           /* static */
367   0       bool UnicharCompress::DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing) {
368   0         if (unicode < kFirstHangul) {
369   0           return false;
370   0         }
371   0         int offset = unicode - kFirstHangul;
372   0         if (offset >= kNumHangul) {
373   0           return false;
374   0         }
375   0         const int kNCount = kVCount * kTCount;
376   0         *leading = offset / kNCount;
377   0         *vowel = (offset % kNCount) / kTCount;
378   0         *trailing = offset % kTCount;
379   0         return true;
380   0       }
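DecomposeHangul is the standard Unicode Hangul-syllable arithmetic; assuming the constants carry the usual values (kFirstHangul = 0xAC00, kVCount = 21, kTCount = 28), a worked example:

    // unicode = 0xAC01 ('각')
    //   offset    = 0xAC01 - 0xAC00 = 1
    //   kNCount   = 21 * 28         = 588
    //   *leading  = 1 / 588         = 0   (ㄱ)
    //   *vowel    = (1 % 588) / 28  = 0   (ㅏ)
    //   *trailing = 1 % 28          = 1   (final ㄱ; index 0 means no final)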
381
382           // Computes the value of code_range_ from the encoder_.
383   2       void UnicharCompress::ComputeCodeRange() {
384   2         code_range_ = -1;
385   224       for (auto &code : encoder_) {
386   448         for (int i = 0; i < code.length(); ++i) {
387   224           if (code(i) > code_range_) {
388   4               code_range_ = code(i);
389   4             }
390   224         }
391   224       }
392   2         ++code_range_;
393   2       }
394
395           // Initializes the decoding hash_map from the encoding array.
396   2       void UnicharCompress::SetupDecoder() {
397   2         Cleanup();
398   2         is_valid_start_.clear();
399   2         is_valid_start_.resize(code_range_);
400   226       for (unsigned c = 0; c < encoder_.size(); ++c) {
401   224         const RecodedCharID &code = encoder_[c];
402   224         decoder_[code] = c;
403   224         is_valid_start_[code(0)] = true;
404   224         RecodedCharID prefix = code;
405   224         int len = code.length() - 1;
406   224         prefix.Truncate(len);
407   224         auto final_it = final_codes_.find(prefix);
408   224         if (final_it == final_codes_.end()) {
409   2             auto *code_list = new std::vector<int>;
410   2             code_list->push_back(code(len));
411   2             final_codes_[prefix] = code_list;
412   2             while (--len >= 0) {
413   0               prefix.Truncate(len);
414   0               auto next_it = next_codes_.find(prefix);
415   0               if (next_it == next_codes_.end()) {
416   0                 auto *code_list = new std::vector<int>;
417   0                 code_list->push_back(code(len));
418   0                 next_codes_[prefix] = code_list;
419   0               } else {
420                     // We still have to search the list as we may get here via multiple
421                     // lengths of code.
422   0                 if (!contains(*next_it->second, code(len))) {
423   0                   next_it->second->push_back(code(len));
424   0                 }
425   0                 break; // This prefix has been processed.
426   0               }
427   0             }
428   222         } else {
429   222           if (!contains(*final_it->second, code(len))) {
430   220             final_it->second->push_back(code(len));
431   220           }
432   222         }
433   224       }
434   2       }
435
436           // Frees allocated memory.
437   2       void UnicharCompress::Cleanup() {
438   2         decoder_.clear();
439   2         is_valid_start_.clear();
440   2         for (auto &next_code : next_codes_) {
441   0           delete next_code.second;
442   0         }
443   2         for (auto &final_code : final_codes_) {
444   0           delete final_code.second;
445   0         }
446   2         next_codes_.clear();
447   2         final_codes_.clear();
448   2       }
449
450            } // namespace tesseract.