/src/tesseract/src/ccutil/unicharcompress.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: unicharcompress.cpp |
3 | | // Description: Unicode re-encoding using a sequence of smaller numbers in |
4 | | // place of a single large code for CJK, similarly for Indic, |
5 | | // and dissection of ligatures for other scripts. |
6 | | // Author: Ray Smith |
7 | | // |
8 | | // (C) Copyright 2015, Google Inc. |
9 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
10 | | // you may not use this file except in compliance with the License. |
11 | | // You may obtain a copy of the License at |
12 | | // http://www.apache.org/licenses/LICENSE-2.0 |
13 | | // Unless required by applicable law or agreed to in writing, software |
14 | | // distributed under the License is distributed on an "AS IS" BASIS, |
15 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | // See the License for the specific language governing permissions and |
17 | | // limitations under the License. |
18 | | // |
19 | | /////////////////////////////////////////////////////////////////////// |
20 | | |
21 | | #include "unicharcompress.h" |
22 | | #include <algorithm> |
23 | | #include <memory> |
24 | | #include "tprintf.h" |
25 | | |
26 | | namespace tesseract { |
27 | | |
// String used to represent the null_id in direct_set; it is also what
// GetEncodingAsString emits for entries that map to the null code.
static const char *kNullChar = "<nul>";
// Radix to make unique values from the stored radical codes.
const int kRadicalRadix = 29;

// "Hash" function for const std::vector<int>: folds the elements together as
// base-kRadicalRadix digits, building a (mostly) unique number for each code
// sequence that we can use as the index in a hash map of ints instead of
// trying to hash the vectors. Long sequences may wrap around and collide;
// callers only use the value as a pre-hash key, so collisions are tolerable.
static int RadicalPreHash(const std::vector<int> &rs) {
  // Accumulate in an unsigned type so overflow is well defined.
  size_t result = 0;
  for (int radical : rs) {
    result *= kRadicalRadix;
    result += radical;
  }
  // Explicit cast documents the intentional narrowing back to int.
  return static_cast<int>(result);
}
44 | | |
// A hash map to convert unicodes to radical encoding: unicode code point ->
// sequence of radical codes read from the radical-stroke table.
using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
// A hash map to count occurrences of each radical encoding, keyed by the
// RadicalPreHash of the radical sequence.
using RSCounts = std::unordered_map<int, int>;
49 | | |
50 | 0 | static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) { |
51 | 0 | if (radical_data_line.empty() || (radical_data_line)[0] == '#') { |
52 | 0 | return true; |
53 | 0 | } |
54 | 0 | std::vector<std::string> entries = split(radical_data_line, ' '); |
55 | 0 | if (entries.size() < 2) { |
56 | 0 | return false; |
57 | 0 | } |
58 | 0 | char *end = nullptr; |
59 | 0 | int unicode = strtol(&entries[0][0], &end, 10); |
60 | 0 | if (*end != '\0') { |
61 | 0 | return false; |
62 | 0 | } |
63 | 0 | std::unique_ptr<std::vector<int>> radicals(new std::vector<int>); |
64 | 0 | for (size_t i = 1; i < entries.size(); ++i) { |
65 | 0 | int radical = strtol(&entries[i][0], &end, 10); |
66 | 0 | if (*end != '\0') { |
67 | 0 | return false; |
68 | 0 | } |
69 | 0 | radicals->push_back(radical); |
70 | 0 | } |
71 | 0 | (*radical_map)[unicode] = std::move(radicals); |
72 | 0 | return true; |
73 | 0 | } |
74 | | |
75 | | // Helper function builds the RSMap from the radical-stroke file, which has |
76 | | // already been read into a string. Returns false on error. |
77 | | // The radical_stroke_table is non-const because it gets split and the caller |
78 | | // is unlikely to want to use it again. |
79 | 0 | static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) { |
80 | 0 | std::vector<std::string> lines = split(radical_data, '\n'); |
81 | 0 | for (unsigned i = 0; i < lines.size(); ++i) { |
82 | 0 | if (!DecodeRadicalLine(lines[i], radical_map)) { |
83 | 0 | tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str()); |
84 | 0 | return false; |
85 | 0 | } |
86 | 0 | } |
87 | 0 | return true; |
88 | 0 | } |
89 | | |
// Constructs an empty compressor; code_range_ of 0 means no codes assigned.
UnicharCompress::UnicharCompress() : code_range_(0) {}
// Copy constructor delegates to operator=, which copies the encoder and
// rebuilds the derived decoder tables.
UnicharCompress::UnicharCompress(const UnicharCompress &src) {
  *this = src;
}
// Destructor releases the heap-allocated code lists via Cleanup().
UnicharCompress::~UnicharCompress() {
  Cleanup();
}
97 | 0 | UnicharCompress &UnicharCompress::operator=(const UnicharCompress &src) { |
98 | 0 | Cleanup(); |
99 | 0 | encoder_ = src.encoder_; |
100 | 0 | code_range_ = src.code_range_; |
101 | 0 | SetupDecoder(); |
102 | 0 | return *this; |
103 | 0 | } |
104 | | |
// Computes the encoding for the given unicharset. It is a requirement that
// the file training/langdata/radical-stroke.txt have been read into the
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
//
// The algorithm runs in three phases:
//  1. Assign a provisional multi-code sequence to every unichar: Han gets its
//     radical codes (offset by han_offset), Hangul gets its Jamo indices
//     (offset by hangul_offset), everything else gets a sequence of
//     direct_set unichar-ids.
//  2. Renumber the Han code positions so codes at different positions do not
//     collide (the loop over kMaxCodeLen below).
//  3. DefragmentCodeValues removes unused values and SetupDecoder rebuilds
//     the decoding tables.
bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
                                      std::string *radical_stroke_table) {
  RSMap radical_map;
  if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {
    return false;
  }
  encoder_.clear();
  UNICHARSET direct_set;
  // To avoid unused codes, clear the special codes from the direct_set.
  direct_set.clear();
  // Always keep space as 0;
  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
  }
  RSCounts radical_counts;
  // In the initial map, codes [0, unicharset.size()) are
  // reserved for non-han/hangul sequences of 1 or more unicodes.
  int hangul_offset = unicharset.size();
  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
  const int kTotalJamos = kLCount + kVCount + kTCount;
  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
  // to measure the number of radicals and strokes, initially we use the same
  // code range for all 3 Han code positions, and fix them after.
  int han_offset = hangul_offset + kTotalJamos;
  // Note the inclusive bound: u == unicharset.size() is reached only when
  // null_id equals unicharset.size() (see the break below).
  for (unsigned u = 0; u <= unicharset.size(); ++u) {
    // We special-case allow null_id to be equal to unicharset.size() in case
    // there is no space in unicharset for it.
    if (u == unicharset.size() && static_cast<int>(u) != null_id) {
      break; // Finished
    }
    RecodedCharID code;
    // Convert to unicodes.
    std::vector<char32> unicodes;
    std::string cleaned;
    if (u < unicharset.size()) {
      cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
    }
    if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
      // Check single unicodes for Hangul/Han and encode if so.
      int unicode = unicodes[0];
      int leading, vowel, trailing;
      auto it = radical_map.find(unicode);
      if (it != radical_map.end()) {
        // This is Han. Use the radical codes directly.
        int num_radicals = it->second->size();
        for (int c = 0; c < num_radicals; ++c) {
          code.Set(c, han_offset + (*it->second)[c]);
        }
        // Characters sharing the same radical sequence get an extra
        // disambiguation code based on how many we have seen so far.
        int pre_hash = RadicalPreHash(*it->second);
        int num_samples = radical_counts[pre_hash]++;
        if (num_samples > 0) {
          code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
        }
      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
        // This is Hangul. Since we know the exact size of each part at compile
        // time, it gets the bottom set of codes.
        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                  trailing + kLCount + kVCount + hangul_offset);
      }
    }
    // If the code is still empty, it wasn't Han or Hangul.
    if (code.empty()) {
      // Special cases.
      if (u == UNICHAR_SPACE) {
        code.Set(0, 0); // Space.
      } else if (static_cast<int>(u) == null_id ||
                 (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
        // Special unichars and the null char all share the null code.
        code.Set(0, direct_set.unichar_to_id(kNullChar));
      } else {
        // Add the direct_set unichar-ids of the unicodes in sequence to the
        // code.
        for (int uni : unicodes) {
          int position = code.length();
          if (position >= RecodedCharID::kMaxCodeLen) {
            // NOTE(review): %d receives the unsigned u; benign for realistic
            // unicharset sizes but technically mismatched.
            tprintf("Unichar %d=%s is too long to encode!!\n", u, unicharset.id_to_unichar(u));
            return false;
          }
          UNICHAR unichar(uni);
          char *utf8 = unichar.utf8_str();
          if (!direct_set.contains_unichar(utf8)) {
            direct_set.unichar_insert(utf8);
          }
          code.Set(position, direct_set.unichar_to_id(utf8));
          delete[] utf8;
          // The encoding only wins if the direct set stays no bigger than the
          // original unicharset (allowing one extra slot for <nul>).
          if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) {
            // Code space got bigger!
            tprintf("Code space expanded from original unicharset!!\n");
            return false;
          }
        }
      }
    }
    encoder_.push_back(code);
  }
  // Now renumber Han to make all codes unique. We already added han_offset to
  // all Han. Now separate out the radical, stroke, and count codes for Han.
  // Each code position i gets shifted up by the cumulative width of the
  // positions before it, so values at different positions cannot collide.
  int code_offset = 0;
  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
    int max_offset = 0;
    for (unsigned u = 0; u < unicharset.size(); ++u) {
      RecodedCharID *code = &encoder_[u];
      if (code->length() <= i) {
        continue;
      }
      max_offset = std::max(max_offset, (*code)(i)-han_offset);
      code->Set(i, (*code)(i) + code_offset);
    }
    if (max_offset == 0) {
      break;
    }
    code_offset += max_offset + 1;
  }
  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
  SetupDecoder();
  return true;
}
227 | | |
228 | | // Sets up an encoder that doesn't change the unichars at all, so it just |
229 | | // passes them through unchanged. |
230 | 0 | void UnicharCompress::SetupPassThrough(const UNICHARSET &unicharset) { |
231 | 0 | std::vector<RecodedCharID> codes; |
232 | 0 | for (unsigned u = 0; u < unicharset.size(); ++u) { |
233 | 0 | RecodedCharID code; |
234 | 0 | code.Set(0, u); |
235 | 0 | codes.push_back(code); |
236 | 0 | } |
237 | 0 | if (!unicharset.has_special_codes()) { |
238 | 0 | RecodedCharID code; |
239 | 0 | code.Set(0, unicharset.size()); |
240 | 0 | codes.push_back(code); |
241 | 0 | } |
242 | 0 | SetupDirect(codes); |
243 | 0 | } |
244 | | |
// Sets up an encoder directly using the given encoding vector, which maps
// unichar_ids to the given codes.
void UnicharCompress::SetupDirect(const std::vector<RecodedCharID> &codes) {
  encoder_ = codes;
  // code_range_ must be current before SetupDecoder sizes is_valid_start_.
  ComputeCodeRange();
  SetupDecoder();
}
252 | | |
// Renumbers codes to eliminate unused values, and (if encoded_null >= 0)
// moves the null code to the end of the code space.
void UnicharCompress::DefragmentCodeValues(int encoded_null) {
  // There may not be any Hangul, but even if there is, it is possible that not
  // all codes are used. Likewise with the Han encoding, it is possible that not
  // all numbers of strokes are used.
  ComputeCodeRange();
  // offsets is first used as a 0/1 "is used" flag per code value, then
  // re-purposed to hold the (negative) shift to apply to each used value.
  std::vector<int> offsets(code_range_);
  // Find which codes are used
  for (auto &code : encoder_) {
    for (int i = 0; i < code.length(); ++i) {
      offsets[code(i)] = 1;
    }
  }
  // Compute offsets based on code use.
  int offset = 0;
  for (unsigned i = 0; i < offsets.size(); ++i) {
    // If not used, decrement everything above here.
    // We are moving encoded_null to the end, so it is not "used".
    // Unused slots keep whatever shift value is irrelevant, since no code in
    // encoder_ references them.
    if (offsets[i] == 0 || i == static_cast<unsigned>(encoded_null)) {
      --offset;
    } else {
      offsets[i] = offset;
    }
  }
  if (encoded_null >= 0) {
    // The encoded_null is moving to the end, for the benefit of TensorFlow,
    // which is offsets.size() + offsets.back(). The stored value is the delta
    // that maps encoded_null to that final position.
    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
  }
  // Now apply the offsets.
  for (auto &c : encoder_) {
    RecodedCharID *code = &c;
    for (int i = 0; i < code->length(); ++i) {
      int value = (*code)(i);
      code->Set(i, value + offsets[value]);
    }
  }
  // Re-derive code_range_ from the compacted values.
  ComputeCodeRange();
}
292 | | |
293 | | // Encodes a single unichar_id. Returns the length of the code, or zero if |
294 | | // invalid input, and the encoding itself |
295 | 2 | int UnicharCompress::EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const { |
296 | 2 | if (unichar_id >= encoder_.size()) { |
297 | 0 | return 0; |
298 | 0 | } |
299 | 2 | *code = encoder_[unichar_id]; |
300 | 2 | return code->length(); |
301 | 2 | } |
302 | | |
303 | | // Decodes code, returning the original unichar-id, or |
304 | | // INVALID_UNICHAR_ID if the input is invalid. |
305 | 13.7M | int UnicharCompress::DecodeUnichar(const RecodedCharID &code) const { |
306 | 13.7M | int len = code.length(); |
307 | 13.7M | if (len <= 0 || len > RecodedCharID::kMaxCodeLen) { |
308 | 0 | return INVALID_UNICHAR_ID; |
309 | 0 | } |
310 | 13.7M | auto it = decoder_.find(code); |
311 | 13.7M | if (it == decoder_.end()) { |
312 | 0 | return INVALID_UNICHAR_ID; |
313 | 0 | } |
314 | 13.7M | return it->second; |
315 | 13.7M | } |
316 | | |
// Writes to the given file. Returns false in case of error.
bool UnicharCompress::Serialize(TFile *fp) const {
  // Only encoder_ is persisted; the decoder tables are rebuilt on load.
  return fp->Serialize(encoder_);
}
321 | | |
322 | | // Reads from the given file. Returns false in case of error. |
323 | 2 | bool UnicharCompress::DeSerialize(TFile *fp) { |
324 | 2 | if (!fp->DeSerialize(encoder_)) { |
325 | 0 | return false; |
326 | 0 | } |
327 | 2 | ComputeCodeRange(); |
328 | 2 | SetupDecoder(); |
329 | 2 | return true; |
330 | 2 | } |
331 | | |
332 | | // Returns a string containing a text file that describes the encoding thus: |
333 | | // <index>[,<index>]*<tab><UTF8-str><newline> |
334 | | // In words, a comma-separated list of one or more indices, followed by a tab |
335 | | // and the UTF-8 string that the code represents per line. Most simple scripts |
336 | | // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean |
337 | | // and the Indic scripts will contain a many-to-many mapping. |
338 | | // See the class comment above for details. |
339 | 0 | std::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const { |
340 | 0 | std::string encoding; |
341 | 0 | for (unsigned c = 0; c < encoder_.size(); ++c) { |
342 | 0 | const RecodedCharID &code = encoder_[c]; |
343 | 0 | if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) { |
344 | | // Don't show the duplicate entry. |
345 | 0 | continue; |
346 | 0 | } |
347 | 0 | encoding += std::to_string(code(0)); |
348 | 0 | for (int i = 1; i < code.length(); ++i) { |
349 | 0 | encoding += "," + std::to_string(code(i)); |
350 | 0 | } |
351 | 0 | encoding += "\t"; |
352 | 0 | if (c >= unicharset.size() || |
353 | 0 | (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) { |
354 | 0 | encoding += kNullChar; |
355 | 0 | } else { |
356 | 0 | encoding += unicharset.id_to_unichar(c); |
357 | 0 | } |
358 | 0 | encoding += "\n"; |
359 | 0 | } |
360 | 0 | return encoding; |
361 | 0 | } |
362 | | |
363 | | // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing. |
364 | | // Note that the returned values are 0-based indices, NOT unicode Jamo. |
365 | | // Returns false if the input is not in the Hangul unicode range. |
366 | | /* static */ |
367 | 0 | bool UnicharCompress::DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing) { |
368 | 0 | if (unicode < kFirstHangul) { |
369 | 0 | return false; |
370 | 0 | } |
371 | 0 | int offset = unicode - kFirstHangul; |
372 | 0 | if (offset >= kNumHangul) { |
373 | 0 | return false; |
374 | 0 | } |
375 | 0 | const int kNCount = kVCount * kTCount; |
376 | 0 | *leading = offset / kNCount; |
377 | 0 | *vowel = (offset % kNCount) / kTCount; |
378 | 0 | *trailing = offset % kTCount; |
379 | 0 | return true; |
380 | 0 | } |
381 | | |
382 | | // Computes the value of code_range_ from the encoder_. |
383 | 2 | void UnicharCompress::ComputeCodeRange() { |
384 | 2 | code_range_ = -1; |
385 | 224 | for (auto &code : encoder_) { |
386 | 448 | for (int i = 0; i < code.length(); ++i) { |
387 | 224 | if (code(i) > code_range_) { |
388 | 4 | code_range_ = code(i); |
389 | 4 | } |
390 | 224 | } |
391 | 224 | } |
392 | 2 | ++code_range_; |
393 | 2 | } |
394 | | |
// Initializes the decoding hash_map (and the prefix tables) from the
// encoding array:
//   decoder_         full code sequence -> unichar id.
//   is_valid_start_  flag per code value: may it begin a sequence?
//   final_codes_     prefix of length len-1 -> list of valid final codes.
//   next_codes_      shorter prefix -> list of valid continuation codes.
void UnicharCompress::SetupDecoder() {
  // Discard any previous tables before rebuilding.
  Cleanup();
  is_valid_start_.clear();
  is_valid_start_.resize(code_range_);
  for (unsigned c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID &code = encoder_[c];
    decoder_[code] = c;
    is_valid_start_[code(0)] = true;
    // prefix is code with its last element removed.
    RecodedCharID prefix = code;
    int len = code.length() - 1;
    prefix.Truncate(len);
    auto final_it = final_codes_.find(prefix);
    if (final_it == final_codes_.end()) {
      // First time we see this prefix: start its final-code list, then walk
      // back through the shorter prefixes recording the continuation codes.
      auto *code_list = new std::vector<int>;
      code_list->push_back(code(len));
      final_codes_[prefix] = code_list;
      while (--len >= 0) {
        prefix.Truncate(len);
        auto next_it = next_codes_.find(prefix);
        if (next_it == next_codes_.end()) {
          auto *code_list = new std::vector<int>;
          code_list->push_back(code(len));
          next_codes_[prefix] = code_list;
        } else {
          // We still have to search the list as we may get here via multiple
          // lengths of code.
          if (!contains(*next_it->second, code(len))) {
            next_it->second->push_back(code(len));
          }
          break; // This prefix has been processed.
        }
      }
    } else {
      // Known prefix: just record another possible final code.
      if (!contains(*final_it->second, code(len))) {
        final_it->second->push_back(code(len));
      }
    }
  }
}
435 | | |
436 | | // Frees allocated memory. |
437 | 2 | void UnicharCompress::Cleanup() { |
438 | 2 | decoder_.clear(); |
439 | 2 | is_valid_start_.clear(); |
440 | 2 | for (auto &next_code : next_codes_) { |
441 | 0 | delete next_code.second; |
442 | 0 | } |
443 | 2 | for (auto &final_code : final_codes_) { |
444 | 0 | delete final_code.second; |
445 | 0 | } |
446 | 2 | next_codes_.clear(); |
447 | 2 | final_codes_.clear(); |
448 | 2 | } |
449 | | |
450 | | } // namespace tesseract. |