Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccutil/unicharset.h
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        unicharset.h
3
// Description: Unicode character/ligature set class.
4
// Author:      Thomas Kielbus
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#ifndef TESSERACT_CCUTIL_UNICHARSET_H_
20
#define TESSERACT_CCUTIL_UNICHARSET_H_
21
22
#include "errcode.h"
23
#include "unicharmap.h"
24
25
#include <tesseract/unichar.h>
26
#include "helpers.h"
27
#include "serialis.h"
28
29
#include <functional> // for std::function
30
31
namespace tesseract {
32
33
// Enum holding special values of unichar_id. Every unicharset has these.
34
// Warning! Keep in sync with kSpecialUnicharCodes.
35
enum SpecialUnicharCodes {
  // Reserved ids, numbered from zero in declaration order.
  UNICHAR_SPACE = 0,
  UNICHAR_JOINED = 1,
  UNICHAR_BROKEN = 2,

  // Count of the reserved ids above; used to size kSpecialUnicharCodes.
  SPECIAL_UNICHAR_CODES_COUNT = 3
};
42
43
// Boolean flag for unichar_insert. It's a bit of a double negative to allow
44
// the default value to be false.
45
enum class OldUncleanUnichars {
  kFalse = 0, // default: passed by the one-argument unichar_insert overload
  kTrue = 1,  // old-style insertion
};
49
50
class TESS_API CHAR_FRAGMENT {
51
public:
52
  // Minimum number of characters used for fragment representation.
53
  static const int kMinLen = 6;
54
  // Maximum number of characters used for fragment representation.
55
  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
56
  // Maximum number of fragments per character.
57
  static const int kMaxChunks = 5;
58
59
  // Setters and Getters.
60
1.14k
  inline void set_all(const char *unichar, int pos, int total, bool natural) {
61
1.14k
    set_unichar(unichar);
62
1.14k
    set_pos(pos);
63
1.14k
    set_total(total);
64
1.14k
    set_natural(natural);
65
1.14k
  }
66
1.14k
  inline void set_unichar(const char *uch) {
67
1.14k
    strncpy(this->unichar, uch, sizeof(this->unichar));
68
1.14k
    this->unichar[UNICHAR_LEN] = '\0';
69
1.14k
  }
70
1.14k
  inline void set_pos(int p) {
71
1.14k
    this->pos = p;
72
1.14k
  }
73
1.14k
  inline void set_total(int t) {
74
1.14k
    this->total = t;
75
1.14k
  }
76
2.80M
  inline const char *get_unichar() const {
77
2.80M
    return this->unichar;
78
2.80M
  }
79
876k
  inline int get_pos() const {
80
876k
    return this->pos;
81
876k
  }
82
876k
  inline int get_total() const {
83
876k
    return this->total;
84
876k
  }
85
86
  // Returns the string that represents a fragment
87
  // with the given unichar, pos and total.
88
  static std::string to_string(const char *unichar, int pos, int total,
89
                               bool natural);
90
  // Returns the string that represents this fragment.
91
0
  std::string to_string() const {
92
0
    return to_string(unichar, pos, total, natural);
93
0
  }
94
95
  // Checks whether a fragment has the same unichar,
96
  // position and total as the given inputs.
97
  inline bool equals(const char *other_unichar, int other_pos,
98
0
                     int other_total) const {
99
0
    return (strcmp(this->unichar, other_unichar) == 0 &&
100
0
            this->pos == other_pos && this->total == other_total);
101
0
  }
102
0
  inline bool equals(const CHAR_FRAGMENT *other) const {
103
0
    return this->equals(other->get_unichar(), other->get_pos(),
104
0
                        other->get_total());
105
0
  }
106
107
  // Checks whether a given fragment is a continuation of this fragment.
108
  // Assumes that the given fragment pointer is not nullptr.
109
2.41M
  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
110
2.41M
    return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
111
2.41M
            this->total == fragment->get_total() &&
112
2.41M
            this->pos == fragment->get_pos() + 1);
113
2.41M
  }
114
115
  // Returns true if this fragment is a beginning fragment.
116
381k
  inline bool is_beginning() const {
117
381k
    return this->pos == 0;
118
381k
  }
119
120
  // Returns true if this fragment is an ending fragment.
121
478k
  inline bool is_ending() const {
122
478k
    return this->pos == this->total - 1;
123
478k
  }
124
125
  // Returns true if the fragment was a separate component to begin with,
126
  // ie did not need chopping to be isolated, but may have been separated
127
  // out from a multi-outline blob.
128
0
  inline bool is_natural() const {
129
0
    return natural;
130
0
  }
131
1.14k
  void set_natural(bool value) {
132
1.14k
    natural = value;
133
1.14k
  }
134
135
  // Parses the string to see whether it represents a character fragment
136
  // (rather than a regular character). If so, allocates memory for a new
137
  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
138
  // information. Fragments are of the form:
139
  // |m|1|2, meaning chunk 1 of 2 of character m, or
140
  // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
141
  // to divide the parts, as they were already separate connected components.
142
  //
143
  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
144
  // instance, otherwise (if the string does not represent a fragment or it
145
  // looks like it does, but parsing it as a fragment fails) returns nullptr.
146
  //
147
  // Note: The caller is responsible for deallocating memory
148
  // associated with the returned pointer.
149
  static CHAR_FRAGMENT *parse_from_string(const char *str);
150
151
private:
152
  char unichar[UNICHAR_LEN + 1];
153
  // True if the fragment was a separate component to begin with,
154
  // ie did not need chopping to be isolated, but may have been separated
155
  // out from a multi-outline blob.
156
  bool natural;
157
  int16_t pos;   // fragment position in the character
158
  int16_t total; // total number of fragments in the character
159
};
160
161
// The UNICHARSET class is a utility class for Tesseract that holds the
162
// set of characters that are used by the engine. Each character is identified
163
// by a unique number, from 0 to (size - 1).
164
class TESS_API UNICHARSET {
165
public:
166
  // Custom list of characters and their ligature forms (UTF8)
167
  // These map to unicode values in the private use area (PUC) and are supported
168
  // by only few font families (eg. Wyld, Adobe Caslon Pro).
169
  static const char *kCustomLigatures[][2];
170
171
  // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
172
  static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
173
174
  // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
175
  enum Direction {
    // Values are spelled out explicitly; per the comment above this enum they
    // follow ICU's UCharDirection numbering, so do not renumber them.
    U_LEFT_TO_RIGHT = 0,
    U_RIGHT_TO_LEFT = 1,
    U_EUROPEAN_NUMBER = 2,
    U_EUROPEAN_NUMBER_SEPARATOR = 3,
    U_EUROPEAN_NUMBER_TERMINATOR = 4,
    U_ARABIC_NUMBER = 5,
    U_COMMON_NUMBER_SEPARATOR = 6,
    U_BLOCK_SEPARATOR = 7,
    U_SEGMENT_SEPARATOR = 8,
    U_WHITE_SPACE_NEUTRAL = 9,
    U_OTHER_NEUTRAL = 10,
    U_LEFT_TO_RIGHT_EMBEDDING = 11,
    U_LEFT_TO_RIGHT_OVERRIDE = 12,
    U_RIGHT_TO_LEFT_ARABIC = 13,
    U_RIGHT_TO_LEFT_EMBEDDING = 14,
    U_RIGHT_TO_LEFT_OVERRIDE = 15,
    U_POP_DIRECTIONAL_FORMAT = 16,
    U_DIR_NON_SPACING_MARK = 17,
    U_BOUNDARY_NEUTRAL = 18,
    U_FIRST_STRONG_ISOLATE = 19,
    U_LEFT_TO_RIGHT_ISOLATE = 20,
    U_RIGHT_TO_LEFT_ISOLATE = 21,
    U_POP_DIRECTIONAL_ISOLATE = 22,
    // The trailing count is omitted when U_HIDE_DEPRECATED_API is defined.
#ifndef U_HIDE_DEPRECATED_API
    U_CHAR_DIRECTION_COUNT
#endif // U_HIDE_DEPRECATED_API
  };
203
204
  // Create an empty UNICHARSET
205
  UNICHARSET();
206
207
  ~UNICHARSET();
208
209
  // Return the UNICHAR_ID of a given unichar representation within the
210
  // UNICHARSET.
211
  UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;
212
213
  // Return the UNICHAR_ID of a given unichar representation within the
214
  // UNICHARSET. Only the first length characters from unichar_repr are used.
215
  UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;
216
217
  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
218
  // while leaving the rest of the string encodable. Returns 0 if the
219
  // beginning of the string is not encodable.
220
  // WARNING: this function now encodes the whole string for precision.
221
  // Use encode_string in preference to repeatedly calling step.
222
  int step(const char *str) const;
223
224
  // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
225
  // If not encodable, write the first byte offset which cannot be converted
226
  // into the second (return) argument.
227
  bool encodable_string(const char *str, unsigned *first_bad_position) const;
228
229
  // Encodes the given UTF-8 string with this UNICHARSET.
230
  // Any part of the string that cannot be encoded (because the utf8 can't
231
  // be broken up into pieces that are in the unicharset) then:
232
  // if give_up_on_failure, stops and returns a partial encoding,
233
  // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
234
  // Returns true if the encoding succeeds completely, false if there is at
235
  // least one failure.
236
  // If lengths is not nullptr, then it is filled with the corresponding
237
  // byte length of each encoded UNICHAR_ID.
238
  // If encoded_length is not nullptr then on return it contains the length of
239
  // str that was encoded. (if give_up_on_failure the location of the first
240
  // failure, otherwise strlen(str).)
241
  // WARNING: Caller must guarantee that str has already been cleaned of codes
242
  // that do not belong in the unicharset, or encoding may fail.
243
  // Use CleanupString to perform the cleaning.
244
  bool encode_string(const char *str, bool give_up_on_failure,
245
                     std::vector<UNICHAR_ID> *encoding,
246
                     std::vector<char> *lengths,
247
                     unsigned *encoded_length) const;
248
249
  // Return the unichar representation corresponding to the given UNICHAR_ID
250
  // within the UNICHARSET.
251
  const char *id_to_unichar(UNICHAR_ID id) const;
252
253
  // Return the UTF8 representation corresponding to the given UNICHAR_ID after
254
  // resolving any private encodings internal to Tesseract. This method is
255
  // preferable to id_to_unichar for outputting text that will be visible to
256
  // external applications.
257
  const char *id_to_unichar_ext(UNICHAR_ID id) const;
258
259
  // Return a string that reformats the utf8 str into the str followed
260
  // by its hex unicodes.
261
  static std::string debug_utf8_str(const char *str);
262
263
  // Removes/replaces content that belongs in rendered text, but not in the
264
  // unicharset.
265
2.62k
  static std::string CleanupString(const char *utf8_str) {
266
2.62k
    return CleanupString(utf8_str, strlen(utf8_str));
267
2.62k
  }
268
  static std::string CleanupString(const char *utf8_str, size_t length);
269
270
  // Return a string containing debug information on the unichar, including
271
  // the id_to_unichar, its hex unicodes and the properties.
272
  std::string debug_str(UNICHAR_ID id) const;
273
0
  std::string debug_str(const char *unichar_repr) const {
274
0
    return debug_str(unichar_to_id(unichar_repr));
275
0
  }
276
277
  // Adds a unichar representation to the set. If old_style is true, then
278
  // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
279
  // characters are ignored/skipped as if they don't exist and n-grams that
280
  // can already be encoded are not added.
281
  void unichar_insert(const char *const unichar_repr,
282
                      OldUncleanUnichars old_style);
283
44
  void unichar_insert(const char *const unichar_repr) {
284
44
    unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
285
44
  }
286
  // Adds a unichar representation to the set. Avoids setting old_style to true,
287
  // unless it is necessary to make the new unichar get added.
288
1.34k
  void unichar_insert_backwards_compatible(const char *const unichar_repr) {
289
1.34k
    std::string cleaned = CleanupString(unichar_repr);
290
1.34k
    if (cleaned != unichar_repr) {
291
16
      unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
292
1.32k
    } else {
293
1.32k
      auto old_size = size();
294
1.32k
      unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
295
1.32k
      if (size() == old_size) {
296
0
        unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
297
0
      }
298
1.32k
    }
299
1.34k
  }
300
301
  // Return true if the given unichar id exists within the set.
302
  // Relies on the fact that unichar ids are contiguous in the unicharset.
303
1.11G
  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
    // A single unsigned comparison: the id is valid when it indexes into
    // unichars. NOTE(review): if UNICHAR_ID is signed, a negative id converts
    // to a very large size_t and is rejected here as well - confirm.
    const auto index = static_cast<size_t>(unichar_id);
    return index < unichars.size();
  }
306
307
  // Return true if the given unichar representation exists within the set.
308
  bool contains_unichar(const char *const unichar_repr) const;
309
  bool contains_unichar(const char *const unichar_repr, int length) const;
310
311
  // Return true if the given unichar representation corresponds to the given
312
  // UNICHAR_ID within the set.
313
  bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const;
314
315
  // Delete CHAR_FRAGMENTs stored in properties of unichars array.
316
28
  void delete_pointers_in_unichars() {
317
488
    for (auto &unichar : unichars) {
318
488
      delete unichar.properties.fragment;
319
488
      unichar.properties.fragment = nullptr;
320
488
    }
321
28
  }
322
323
  // Clear the UNICHARSET (all the previous data is lost).
324
28
  void clear() {
325
28
    if (script_table != nullptr) {
326
40
      for (int i = 0; i < script_table_size_used; ++i) {
327
24
        delete[] script_table[i];
328
24
      }
329
16
      delete[] script_table;
330
16
      script_table = nullptr;
331
16
      script_table_size_used = 0;
332
16
    }
333
28
    script_table_size_reserved = 0;
334
28
    delete_pointers_in_unichars();
335
28
    unichars.clear();
336
28
    ids.clear();
337
28
    top_bottom_set_ = false;
338
28
    script_has_upper_lower_ = false;
339
28
    script_has_xheight_ = false;
340
28
    old_style_included_ = false;
341
28
    null_sid_ = 0;
342
28
    common_sid_ = 0;
343
28
    latin_sid_ = 0;
344
28
    cyrillic_sid_ = 0;
345
28
    greek_sid_ = 0;
346
28
    han_sid_ = 0;
347
28
    hiragana_sid_ = 0;
348
28
    katakana_sid_ = 0;
349
28
    thai_sid_ = 0;
350
28
    hangul_sid_ = 0;
351
28
    default_sid_ = 0;
352
28
  }
353
354
  // Return the size of the set (the number of different UNICHAR it holds).
355
1.54G
  size_t size() const {
356
1.54G
    return unichars.size();
357
1.54G
  }
358
359
  // Opens the file indicated by filename and saves unicharset to that file.
360
  // Returns true if the operation is successful.
361
0
  bool save_to_file(const char *const filename) const {
362
0
    FILE *file = fopen(filename, "w+b");
363
0
    if (file == nullptr) {
364
0
      return false;
365
0
    }
366
0
    bool result = save_to_file(file);
367
0
    fclose(file);
368
0
    return result;
369
0
  }
370
371
  // Saves the content of the UNICHARSET to the given file.
372
  // Returns true if the operation is successful.
373
0
  bool save_to_file(FILE *file) const {
374
0
    std::string str;
375
0
    return save_to_string(str) &&
376
0
           tesseract::Serialize(file, &str[0], str.length());
377
0
  }
378
379
0
  bool save_to_file(tesseract::TFile *file) const {
    // Same as the FILE* overload, but writes through a TFile.
    std::string serialized;
    if (!save_to_string(serialized)) {
      return false;
    }
    return file->Serialize(&serialized[0], serialized.length());
  }
383
384
  // Saves the content of the UNICHARSET to the given string.
385
  // Returns true if the operation is successful.
386
  bool save_to_string(std::string &str) const;
387
388
  // Opens the file indicated by filename and loads the UNICHARSET
389
  // from the given file. The previous data is lost.
390
  // Returns true if the operation is successful.
391
0
  bool load_from_file(const char *const filename, bool skip_fragments) {
392
0
    FILE *file = fopen(filename, "rb");
393
0
    if (file == nullptr) {
394
0
      return false;
395
0
    }
396
0
    bool result = load_from_file(file, skip_fragments);
397
0
    fclose(file);
398
0
    return result;
399
0
  }
400
  // As above with skip_fragments set to false. Returns true if the operation
  // is successful.
401
0
  bool load_from_file(const char *const filename) {
402
0
    return load_from_file(filename, false);
403
0
  }
404
405
  // Loads the UNICHARSET from the given file. The previous data is lost.
406
  // Returns true if the operation is successful.
407
  bool load_from_file(FILE *file, bool skip_fragments);
408
0
  bool load_from_file(FILE *file) {
409
0
    return load_from_file(file, false);
410
0
  }
411
  bool load_from_file(tesseract::TFile *file, bool skip_fragments);
412
413
  // Sets up internal data after loading the file, based on the char
414
  // properties. Called from load_from_file, but also needs to be run
415
  // during set_unicharset_properties.
416
  void post_load_setup();
417
418
  // Returns true if right_to_left scripts are significant in the unicharset,
419
  // but without being so sensitive that "universal" unicharsets containing
420
  // characters from many scripts, like orientation and script detection,
421
  // look like they are right_to_left.
422
  bool major_right_to_left() const;
423
424
  // Set a whitelist and/or blacklist of characters to recognize.
425
  // An empty or nullptr whitelist enables everything (minus any blacklist).
426
  // An empty or nullptr blacklist disables nothing.
427
  // An empty or nullptr unblacklist has no effect.
428
  // The blacklist overrides the whitelist.
429
  // The unblacklist overrides the blacklist.
430
  // Each list is a string of utf8 character strings. Boundaries between
431
  // unicharset units are worked out automatically, and characters not in
432
  // the unicharset are silently ignored.
433
  void set_black_and_whitelist(const char *blacklist, const char *whitelist,
434
                               const char *unblacklist);
435
436
  // Set the isalpha property of the given unichar to the given value.
437
900
  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.isalpha = value;
  }
440
441
  // Set the islower property of the given unichar to the given value.
442
900
  void set_islower(UNICHAR_ID unichar_id, bool value) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.islower = value;
  }
445
446
  // Set the isupper property of the given unichar to the given value.
447
900
  void set_isupper(UNICHAR_ID unichar_id, bool value) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.isupper = value;
  }
450
451
  // Set the isdigit property of the given unichar to the given value.
452
900
  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.isdigit = value;
  }
455
456
  // Set the ispunctuation property of the given unichar to the given value.
457
900
  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.ispunctuation = value;
  }
460
461
  // Set the isngram property of the given unichar to the given value.
462
76.7k
  void set_isngram(UNICHAR_ID unichar_id, bool value) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.isngram = value;
  }
465
466
  // Set the script name of the given unichar to the given value.
467
  // Value is copied and thus can be a temporary.
468
3.75k
  void set_script(UNICHAR_ID unichar_id, const char *value) {
    // add_script maps the name to a script id; that id is what gets stored.
    const auto script_id = add_script(value);
    unichars[unichar_id].properties.script_id = script_id;
  }
471
472
  // Set other_case unichar id in the properties for the given unichar id.
473
900
  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.other_case = other_case;
  }
476
477
  // Set the direction property of the given unichar to the given value.
478
900
  void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.direction = value;
  }
481
482
  // Set mirror unichar id in the properties for the given unichar id.
483
900
  void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
    // unichar_id is used as an index without a range check.
    auto &props = unichars[unichar_id].properties;
    props.mirror = mirror;
  }
486
487
  // Record normalized version of unichar with the given unichar_id.
488
900
  void set_normed(UNICHAR_ID unichar_id, const char *normed) {
    auto &props = unichars[unichar_id].properties;
    props.normed = normed;
    // The ids derived from the previous normed string no longer apply.
    props.normed_ids.clear();
  }
492
  // Sets the normed_ids vector from the normed string. normed_ids is not
493
  // stored in the file, and needs to be set when the UNICHARSET is loaded.
494
  void set_normed_ids(UNICHAR_ID unichar_id);
495
496
  // Return the isalpha property of the given unichar.
497
161M
  bool get_isalpha(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.isalpha;
    }
    // The invalid id has no properties.
    return false;
  }
504
505
  // Return the islower property of the given unichar.
506
92.9M
  bool get_islower(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.islower;
    }
    // The invalid id has no properties.
    return false;
  }
513
514
  // Return the isupper property of the given unichar.
515
50.7M
  bool get_isupper(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.isupper;
    }
    // The invalid id has no properties.
    return false;
  }
522
523
  // Return the isdigit property of the given unichar.
524
135M
  bool get_isdigit(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.isdigit;
    }
    // The invalid id has no properties.
    return false;
  }
531
532
  // Return the ispunctuation property of the given unichar.
533
50.2M
  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.ispunctuation;
    }
    // The invalid id has no properties.
    return false;
  }
540
541
  // Return the isngram property of the given unichar.
542
5.83M
  bool get_isngram(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.isngram;
    }
    // The invalid id has no properties.
    return false;
  }
549
550
  // Returns whether the unichar id represents a unicode value in the private
551
  // use area.
552
  bool get_isprivate(UNICHAR_ID unichar_id) const;
553
554
  // Returns true if the ids have useful min/max top/bottom values.
555
8.40M
  bool top_bottom_useful() const {
556
8.40M
    return top_bottom_set_;
557
8.40M
  }
558
  // Sets all ranges to empty, so they can be expanded to set the values.
559
  void set_ranges_empty();
560
  // Sets all the properties for this unicharset given a src_unicharset with
561
  // everything set. The unicharsets don't have to be the same, and graphemes
562
  // are correctly accounted for.
563
0
  void SetPropertiesFromOther(const UNICHARSET &src) {
564
0
    PartialSetPropertiesFromOther(0, src);
565
0
  }
566
  // Sets properties from Other, starting only at the given index.
567
  void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);
568
  // Expands the tops and bottoms and widths for this unicharset given a
569
  // src_unicharset with ranges in it. The unicharsets don't have to be the
570
  // same, and graphemes are correctly accounted for.
571
  void ExpandRangesFromOther(const UNICHARSET &src);
572
  // Makes this a copy of src. Clears this completely first, so the automatic
573
  // ids will not be present in this if not in src.
574
  void CopyFrom(const UNICHARSET &src);
575
  // For each id in src, if it does not occur in this, add it, as in
576
  // SetPropertiesFromOther, otherwise expand the ranges, as in
577
  // ExpandRangesFromOther.
578
  void AppendOtherUnicharset(const UNICHARSET &src);
579
  // Returns true if the acceptable ranges of the tops of the characters do
580
  // not overlap, making their x-height calculations distinct.
581
  bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
582
  // Returns the min and max bottom and top of the given unichar in
583
  // baseline-normalized coordinates, ie, where the baseline is
584
  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
585
  // (See normalis.h for the definitions).
586
  void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom,
                      int *min_top, int *max_top) const {
    if (unichar_id == INVALID_UNICHAR_ID) {
      // No data for the invalid id: report the widest possible range.
      *min_top = 0;
      *min_bottom = 0;
      *max_top = 256; // kBlnCellHeight
      *max_bottom = 256;
      return;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const auto &props = unichars[unichar_id].properties;
    *min_bottom = props.min_bottom;
    *max_bottom = props.max_bottom;
    *min_top = props.min_top;
    *max_top = props.max_top;
  }
599
  void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom,
                      int min_top, int max_top) {
    // Each value is clipped to [0, UINT8_MAX] before being stored.
    auto &props = unichars[unichar_id].properties;
    props.min_bottom = ClipToRange<int>(min_bottom, 0, UINT8_MAX);
    props.max_bottom = ClipToRange<int>(max_bottom, 0, UINT8_MAX);
    props.min_top = ClipToRange<int>(min_top, 0, UINT8_MAX);
    props.max_top = ClipToRange<int>(max_top, 0, UINT8_MAX);
  }
610
  // Returns the width stats (as mean, sd) of the given unichar relative to the
611
  // median advance of all characters in the character set.
612
  void get_width_stats(UNICHAR_ID unichar_id, float *width,
                       float *width_sd) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      *width = props.width;
      *width_sd = props.width_sd;
      return;
    }
    // The invalid id reports zero for both outputs.
    *width = 0.0f;
    *width_sd = 0.0f;
  }
623
900
  void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
    // Stores both values as given; unichar_id is not range-checked.
    auto &props = unichars[unichar_id].properties;
    props.width = width;
    props.width_sd = width_sd;
  }
627
  // Returns the stats of the x-bearing (as mean, sd) of the given unichar
628
  // relative to the median advance of all characters in the character set.
629
  void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing,
                         float *bearing_sd) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      *bearing = props.bearing;
      *bearing_sd = props.bearing_sd;
      return;
    }
    // The invalid id reports zero for both outputs.
    *bearing_sd = 0.0f;
    *bearing = 0.0f;
  }
639
  void set_bearing_stats(UNICHAR_ID unichar_id, float bearing,
                         float bearing_sd) {
    // Stores both values as given; unichar_id is not range-checked.
    auto &props = unichars[unichar_id].properties;
    props.bearing = bearing;
    props.bearing_sd = bearing_sd;
  }
644
  // Returns the stats of the x-advance of the given unichar (as mean, sd)
645
  // relative to the median advance of all characters in the character set.
646
  void get_advance_stats(UNICHAR_ID unichar_id, float *advance,
                         float *advance_sd) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      *advance = props.advance;
      *advance_sd = props.advance_sd;
      return;
    }
    // The invalid id reports zero for both outputs.
    *advance_sd = 0.0f;
    *advance = 0.0f;
  }
656
  void set_advance_stats(UNICHAR_ID unichar_id, float advance,
                         float advance_sd) {
    // Stores both values as given; unichar_id is not range-checked.
    auto &props = unichars[unichar_id].properties;
    props.advance = advance;
    props.advance_sd = advance_sd;
  }
661
  // Returns true if the font metrics properties are empty.
662
0
  bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
    // Delegates to the property record; unichar_id is not range-checked.
    const auto &props = unichars[unichar_id].properties;
    return props.AnyRangeEmpty();
  }
665
666
  // Returns true if the script of the given id is space delimited.
667
  // Returns false for Han, Thai, Hangul, Hiragana and Katakana scripts.
668
16.9M
  bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
    // The invalid id is treated as space delimited.
    if (unichar_id == INVALID_UNICHAR_ID) {
      return true;
    }
    const int sid = get_script(unichar_id);
    // Scripts that are not space delimited.
    const bool undelimited = sid == han_sid_ || sid == thai_sid_ ||
                             sid == hangul_sid_ || sid == hiragana_sid_ ||
                             sid == katakana_sid_;
    return !undelimited;
  }
677
678
  // Return the script id of the given unichar.
679
  // This is the id stored by set_script (obtained from add_script);
680
  // null_sid_ is returned for INVALID_UNICHAR_ID.
681
39.3M
  int get_script(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.script_id;
    }
    // The invalid id maps to the null script id.
    return null_sid_;
  }
688
689
  // Return the character properties, eg. alpha/upper/lower/digit/punct,
690
  // as a bit field of unsigned int.
691
  unsigned int get_properties(UNICHAR_ID unichar_id) const;
692
693
  // Return the character property as a single char.  If a character has
694
  // multiple attributes, the main property is defined by the following order:
695
  //   upper_case : 'A'
696
  //   lower_case : 'a'
697
  //   alpha      : 'x'
698
  //   digit      : '0'
699
  //   punctuation: 'p'
700
  char get_chartype(UNICHAR_ID unichar_id) const;
701
702
  // Get other_case unichar id in the properties for the given unichar id.
703
34.3M
  UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.other_case;
    }
    // The invalid id has no other case.
    return INVALID_UNICHAR_ID;
  }
710
711
  // Returns the direction property of the given unichar.
712
15.8M
  Direction get_direction(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.direction;
    }
    // The invalid id is reported as U_OTHER_NEUTRAL.
    return UNICHARSET::U_OTHER_NEUTRAL;
  }
719
720
  // Get mirror unichar id in the properties for the given unichar id.
721
718k
  UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to this unicharset.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.mirror;
    }
    // The invalid id is passed straight through.
    return INVALID_UNICHAR_ID;
  }
728
729
  // Returns UNICHAR_ID of the corresponding lower-case unichar.
730
176
  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
    // The invalid id is passed straight through.
    if (unichar_id == INVALID_UNICHAR_ID) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_PROPERTIES &props = unichars[unichar_id].properties;
    // A unichar already flagged lower-case maps to itself; anything else maps
    // to its recorded other_case.
    return props.islower ? unichar_id : props.other_case;
  }
740
741
  // Returns UNICHAR_ID of the corresponding upper-case unichar.
742
0
  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
    // The invalid id is passed straight through.
    if (unichar_id == INVALID_UNICHAR_ID) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_PROPERTIES &props = unichars[unichar_id].properties;
    // A unichar already flagged upper-case maps to itself; anything else maps
    // to its recorded other_case.
    return props.isupper ? unichar_id : props.other_case;
  }
752
753
  // Returns true if this UNICHARSET has the special codes in
754
  // SpecialUnicharCodes available. If false then there are normal unichars
755
  // at these codes and they should not be used.
756
0
  bool has_special_codes() const {
757
0
    return get_fragment(UNICHAR_BROKEN) != nullptr &&
758
0
           strcmp(id_to_unichar(UNICHAR_BROKEN),
759
0
                  kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
760
0
  }
761
762
  // Returns true if there are any repeated unicodes in the normalized
763
  // text of any unichar-id in the unicharset.
764
  bool AnyRepeatedUnicodes() const;
765
766
  // Return a pointer to the CHAR_FRAGMENT class if the given
767
  // unichar id represents a character fragment.
768
75.8M
  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to this unicharset.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.fragment;
    }
    // The invalid id has no fragment.
    return nullptr;
  }
775
776
  // Return the isalpha property of the given unichar representation.
777
0
  bool get_isalpha(const char *const unichar_repr) const {
778
0
    return get_isalpha(unichar_to_id(unichar_repr));
779
0
  }
780
781
  // Return the islower property of the given unichar representation.
782
0
  bool get_islower(const char *const unichar_repr) const {
783
0
    return get_islower(unichar_to_id(unichar_repr));
784
0
  }
785
786
  // Return the isupper property of the given unichar representation.
787
0
  bool get_isupper(const char *const unichar_repr) const {
788
0
    return get_isupper(unichar_to_id(unichar_repr));
789
0
  }
790
791
  // Return the isdigit property of the given unichar representation.
792
0
  bool get_isdigit(const char *const unichar_repr) const {
793
0
    return get_isdigit(unichar_to_id(unichar_repr));
794
0
  }
795
796
  // Return the ispunctuation property of the given unichar representation.
797
0
  bool get_ispunctuation(const char *const unichar_repr) const {
798
0
    return get_ispunctuation(unichar_to_id(unichar_repr));
799
0
  }
800
801
  // Return the character properties, eg. alpha/upper/lower/digit/punct,
802
  // of the given unichar representation
803
0
  unsigned int get_properties(const char *const unichar_repr) const {
804
0
    return get_properties(unichar_to_id(unichar_repr));
805
0
  }
806
807
0
  char get_chartype(const char *const unichar_repr) const {
808
0
    return get_chartype(unichar_to_id(unichar_repr));
809
0
  }
810
811
  // Return the script id of the given unichar representation.
812
  // Pass the id to get_script_from_script_id() to obtain the script
813
  // name.
814
1.11k
  int get_script(const char *const unichar_repr) const {
815
1.11k
    return get_script(unichar_to_id(unichar_repr));
816
1.11k
  }
817
818
  // Return a pointer to the CHAR_FRAGMENT class struct if the given
819
  // unichar representation represents a character fragment.
820
0
  const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const {
    // A null or empty representation cannot name a fragment.
    const bool is_blank = unichar_repr == nullptr || unichar_repr[0] == '\0';
    // Only representations present in the id map are looked up.
    if (is_blank || !ids.contains(unichar_repr, false)) {
      return nullptr;
    }
    const UNICHAR_ID id = unichar_to_id(unichar_repr);
    return get_fragment(id);
  }
827
828
  // Return the isalpha property of the given unichar representation.
829
  // Only the first length characters from unichar_repr are used.
830
58.1k
  bool get_isalpha(const char *const unichar_repr, int length) const {
831
58.1k
    return get_isalpha(unichar_to_id(unichar_repr, length));
832
58.1k
  }
833
834
  // Return the islower property of the given unichar representation.
835
  // Only the first length characters from unichar_repr are used.
836
0
  bool get_islower(const char *const unichar_repr, int length) const {
837
0
    return get_islower(unichar_to_id(unichar_repr, length));
838
0
  }
839
840
  // Return the isupper property of the given unichar representation.
841
  // Only the first length characters from unichar_repr are used.
842
0
  bool get_isupper(const char *const unichar_repr, int length) const {
843
0
    return get_isupper(unichar_to_id(unichar_repr, length));
844
0
  }
845
846
  // Return the isdigit property of the given unichar representation.
847
  // Only the first length characters from unichar_repr are used.
848
9
  bool get_isdigit(const char *const unichar_repr, int length) const {
849
9
    return get_isdigit(unichar_to_id(unichar_repr, length));
850
9
  }
851
852
  // Return the ispunctuation property of the given unichar representation.
853
  // Only the first length characters from unichar_repr are used.
854
0
  bool get_ispunctuation(const char *const unichar_repr, int length) const {
855
0
    return get_ispunctuation(unichar_to_id(unichar_repr, length));
856
0
  }
857
858
  // Returns normalized version of unichar with the given unichar_id.
859
0
  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
    // The space unichar always normalizes to a single space.
    if (unichar_id == UNICHAR_SPACE) {
      return " ";
    }
    // Reject ids that are not in the set rather than indexing unichars out of
    // range; this matches the check performed by the other per-id accessors.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    // The pointer refers to storage owned by the unicharset entry.
    return unichars[unichar_id].properties.normed.c_str();
  }
865
  // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
866
  // version of the given id. There may be more than one UNICHAR_ID in the
867
  // vector if unichar_id represents a ligature.
868
20.7M
  const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const {
    // Reject ids that are not in the set rather than indexing unichars out of
    // range; this matches the check performed by the other per-id accessors.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    // The reference refers to storage owned by the unicharset entry.
    return unichars[unichar_id].properties.normed_ids;
  }
871
872
  // Return the script id of the given unichar representation.
873
  // Only the first length characters from unichar_repr are used.
874
  // Pass the id to get_script_from_script_id() to obtain the script
875
  // name.
876
0
  int get_script(const char *const unichar_repr, int length) const {
877
0
    return get_script(unichar_to_id(unichar_repr, length));
878
0
  }
879
880
  // Return the (current) number of scripts in the script table
881
0
  int get_script_table_size() const {
882
0
    return script_table_size_used;
883
0
  }
884
885
  // Return the script string from its id
886
452
  const char *get_script_from_script_id(int id) const {
887
452
    if (id >= script_table_size_used || id < 0) {
888
0
      return null_script;
889
0
    }
890
452
    return script_table[id];
891
452
  }
892
893
  // Returns the id from the name of the script, or 0 if script is not found.
894
  // Note that this is an expensive operation since it involves iteratively
895
  // comparing strings in the script table.  To avoid dependency on STL, we
896
  // won't use a hash.  Instead, the calling function can use this to lookup
897
  // and save the ID for relevant scripts for fast comparisons later.
898
  int get_script_id_from_name(const char *script_name) const;
899
900
  // Return true if the given script is the null script
901
0
  bool is_null_script(const char *script) const {
902
0
    return script == null_script;
903
0
  }
904
905
  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
906
  // then the returned id will be the same.
907
  // The script parameter is copied and thus can be a temporary.
908
  int add_script(const char *script);
909
910
  // Return the enabled property of the given unichar.
911
407M
  bool get_enabled(UNICHAR_ID unichar_id) const {
    // The id must belong to this unicharset.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_PROPERTIES &props = unichars[unichar_id].properties;
    return props.enabled;
  }
915
916
18.6M
  int null_sid() const {
917
18.6M
    return null_sid_;
918
18.6M
  }
919
31.0M
  int common_sid() const {
920
31.0M
    return common_sid_;
921
31.0M
  }
922
439k
  int latin_sid() const {
923
439k
    return latin_sid_;
924
439k
  }
925
0
  int cyrillic_sid() const {
926
0
    return cyrillic_sid_;
927
0
  }
928
0
  int greek_sid() const {
929
0
    return greek_sid_;
930
0
  }
931
18.6M
  int han_sid() const {
932
18.6M
    return han_sid_;
933
18.6M
  }
934
0
  int hiragana_sid() const {
935
0
    return hiragana_sid_;
936
0
  }
937
2
  int katakana_sid() const {
938
2
    return katakana_sid_;
939
2
  }
940
2
  int thai_sid() const {
941
2
    return thai_sid_;
942
2
  }
943
0
  int hangul_sid() const {
944
0
    return hangul_sid_;
945
0
  }
946
439k
  int default_sid() const {
947
439k
    return default_sid_;
948
439k
  }
949
950
  // Returns true if the unicharset has the concept of upper/lower case.
951
8.33M
  bool script_has_upper_lower() const {
952
8.33M
    return script_has_upper_lower_;
953
8.33M
  }
954
  // Returns true if the unicharset has the concept of x-height.
955
  // script_has_xheight can be true even if script_has_upper_lower is not,
956
  // when the script has a sufficiently predominant top line with ascenders,
957
  // such as Devanagari and Thai.
958
482k
  bool script_has_xheight() const {
959
482k
    return script_has_xheight_;
960
482k
  }
961
962
private:
963
  struct TESS_API UNICHAR_PROPERTIES {
964
    UNICHAR_PROPERTIES();
965
    // Initializes all properties to sensible default values.
966
    void Init();
967
    // Sets all ranges wide open. Initialization default in case there are
968
    // no useful values available.
969
    void SetRangesOpen();
970
    // Sets all ranges to empty. Used before expanding with font-based data.
971
    void SetRangesEmpty();
972
    // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
973
    // is empty.
974
    bool AnyRangeEmpty() const;
975
    // Expands the ranges with the ranges from the src properties.
976
    void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);
977
    // Copies the properties from src into this.
978
    void CopyFrom(const UNICHAR_PROPERTIES &src);
979
980
    bool isalpha;
981
    bool islower;
982
    bool isupper;
983
    bool isdigit;
984
    bool ispunctuation;
985
    bool isngram;
986
    bool enabled;
987
    // Possible limits of the top and bottom of the bounding box in
988
    // baseline-normalized coordinates, ie, where the baseline is
989
    // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
990
    // (See normalis.h for the definitions).
991
    uint8_t min_bottom;
992
    uint8_t max_bottom;
993
    uint8_t min_top;
994
    uint8_t max_top;
995
    // Statistics of the widths of bounding box, relative to the median advance.
996
    float width;
997
    float width_sd;
998
    // Stats of the x-bearing and advance, also relative to the median advance.
999
    float bearing;
1000
    float bearing_sd;
1001
    float advance;
1002
    float advance_sd;
1003
    int script_id;
1004
    UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
1005
    Direction direction;   // direction of this unichar
1006
    // Mirror property is useful for reverse DAWG lookup for words in
1007
    // right-to-left languages (e.g. "(word)" would be in
1008
    // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
1009
    // However, what we want in our DAWG is
1010
    // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
1011
    // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
1012
    UNICHAR_ID mirror;
1013
    // A string of unichar_ids that represent the corresponding normed string.
1014
    // For awkward characters like em-dash, this gives hyphen.
1015
    // For ligatures, this gives the string of normal unichars.
1016
    std::vector<UNICHAR_ID> normed_ids;
1017
    std::string normed; // normalized version of this unichar
1018
    // Contains meta information about the fragment if a unichar represents
1019
    // a fragment of a character, otherwise should be set to nullptr.
1020
    // It is assumed that character fragments are added to the unicharset
1021
    // after the corresponding 'base' characters.
1022
    CHAR_FRAGMENT *fragment;
1023
  };
1024
1025
  struct UNICHAR_SLOT {
1026
    char representation[UNICHAR_LEN + 1];
1027
    UNICHAR_PROPERTIES properties;
1028
  };
1029
1030
  // Internal recursive version of encode_string above.
1031
  // str is the start of the whole string.
1032
  // str_index is the current position in str.
1033
  // str_length is the length of str.
1034
  // encoding is a working encoding of str.
1035
  // lengths is a working set of lengths of each element of encoding.
1036
  // best_total_length is the longest length of str that has been successfully
1037
  // encoded so far.
1038
  // On return:
1039
  // best_encoding contains the encoding that used the longest part of str.
1040
  // best_lengths (may be null) contains the lengths of best_encoding.
1041
  void encode_string(const char *str, int str_index, int str_length,
1042
                     std::vector<UNICHAR_ID> *encoding,
1043
                     std::vector<char> *lengths, unsigned *best_total_length,
1044
                     std::vector<UNICHAR_ID> *best_encoding,
1045
                     std::vector<char> *best_lengths) const;
1046
1047
  // Gets the properties for a grapheme string, combining properties for
1048
  // multiple characters in a meaningful way where possible.
1049
  // Returns false if no valid match was found in the unicharset.
1050
  // NOTE that script_id, mirror, and other_case refer to this unicharset on
1051
  // return and will need redirecting if the target unicharset is different.
1052
  bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;
1053
1054
  // Load ourselves from a "file" where our only interface to the file is
1055
  // an implementation of fgets().  This is the parsing primitive accessed by
1056
  // the public routines load_from_file().
1057
  bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,
1058
                      bool skip_fragments);
1059
1060
  // List of mappings to make when ingesting strings from the outside.
1061
  // The substitutions clean up text that should exist for rendering of
1062
  // synthetic data, but not in the recognition set.
1063
  static const char *kCleanupMaps[][2];
1064
  static const char *null_script;
1065
1066
  std::vector<UNICHAR_SLOT> unichars;
1067
  UNICHARMAP ids;
1068
  char **script_table;
1069
  int script_table_size_used;
1070
  int script_table_size_reserved;
1071
  // True if the unichars have their tops/bottoms set.
1072
  bool top_bottom_set_;
1073
  // True if the unicharset has significant upper/lower case chars.
1074
  bool script_has_upper_lower_;
1075
  // True if the unicharset has a significant mean-line with significant
1076
  // ascenders above that.
1077
  bool script_has_xheight_;
1078
  // True if the set contains chars that would be changed by the cleanup.
1079
  bool old_style_included_;
1080
1081
  // A few convenient script name-to-id mapping without using hash.
1082
  // These are initialized when unicharset file is loaded.  Anything
1083
  // missing from this list can be looked up using get_script_id_from_name.
1084
  int null_sid_;
1085
  int common_sid_;
1086
  int latin_sid_;
1087
  int cyrillic_sid_;
1088
  int greek_sid_;
1089
  int han_sid_;
1090
  int hiragana_sid_;
1091
  int katakana_sid_;
1092
  int thai_sid_;
1093
  int hangul_sid_;
1094
  // The most frequently occurring script in the charset.
1095
  int default_sid_;
1096
};
1097
1098
} // namespace tesseract
1099
1100
#endif // TESSERACT_CCUTIL_UNICHARSET_H_