Coverage Report

Created: 2025-11-16 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/dict/trie.h
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * File:        trie.h
4
 * Description: Functions to build a trie data structure.
5
 * Author:      Mark Seaman, SW Productivity
6
 *
7
 * (c) Copyright 1987, Hewlett-Packard Company.
8
 ** Licensed under the Apache License, Version 2.0 (the "License");
9
 ** you may not use this file except in compliance with the License.
10
 ** You may obtain a copy of the License at
11
 ** http://www.apache.org/licenses/LICENSE-2.0
12
 ** Unless required by applicable law or agreed to in writing, software
13
 ** distributed under the License is distributed on an "AS IS" BASIS,
14
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 ** See the License for the specific language governing permissions and
16
 ** limitations under the License.
17
 *
18
 *****************************************************************************/
19
#ifndef TRIE_H
20
#define TRIE_H
21
22
#include "dawg.h"
23
24
namespace tesseract {
25
26
class UNICHARSET;
27
28
// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
29
// max int32, we will need to change vector to use int64 for size
30
// and address indices. This does not seem to be needed immediately,
31
// since currently the largest number of edges limit used by tesseract
32
// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
33
// There are also int casts below to satisfy the WIN32 compiler that would
34
// need to be changed.
35
// It might be cleanest to change the types of most of the Trie/Dawg related
36
// typedefs to int and restrict the casts to extracting these values from
37
// the 64 bit EDGE_RECORD.
38
using EDGE_INDEX = int64_t; // index of an edge in a given node
39
using EDGE_VECTOR = std::vector<EDGE_RECORD>;
40
41
struct TRIE_NODE_RECORD {
42
  EDGE_VECTOR forward_edges;
43
  EDGE_VECTOR backward_edges;
44
};
45
using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;
46
47
/**
48
 * Concrete class for Trie data structure that allows to store a list of
49
 * words (extends Dawg base class) as well as dynamically add new words.
50
 * This class stores a vector of pointers to TRIE_NODE_RECORDs, each of
51
 * which has a vector of forward and backward edges.
52
 */
53
class TESS_API Trie : public Dawg {
54
public:
55
  enum RTLReversePolicy {
56
    RRP_DO_NO_REVERSE,
57
    RRP_REVERSE_IF_HAS_RTL,
58
    RRP_FORCE_REVERSE,
59
  };
60
61
  // Minimum number of concrete characters at the beginning of user patterns.
62
  static const int kSaneNumConcreteChars = 0;
63
  // Various unicode whitespace characters are used to denote unichar patterns,
64
  // (character classifier would never produce these whitespace characters as a
65
  // valid classification).
66
  static const char kAlphaPatternUnicode[];
67
  static const char kDigitPatternUnicode[];
68
  static const char kAlphanumPatternUnicode[];
69
  static const char kPuncPatternUnicode[];
70
  static const char kLowerPatternUnicode[];
71
  static const char kUpperPatternUnicode[];
72
73
  static const char *get_reverse_policy_name(RTLReversePolicy reverse_policy);
74
75
  // max_num_edges argument allows limiting the amount of memory this
76
  // Trie can consume (if a new word insert would cause the Trie to
77
  // contain more edges than max_num_edges, all the edges are cleared
78
  // so that new inserts can proceed).
79
  Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
80
8
      : Dawg(type, lang, perm, debug_level) {
81
8
    init(unicharset_size);
82
8
    deref_node_index_mask_ = ~letter_mask_;
83
8
    new_dawg_node(); // need to allocate node 0
84
8
  }
85
0
  ~Trie() override {
86
0
    for (auto node : nodes_) {
87
0
      delete node;
88
0
    }
89
0
  }
90
91
  // Reset the Trie to empty.
92
  void clear();
93
94
  /** Returns the edge that corresponds to the letter out of this node. */
95
3.04M
  EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override {
96
3.04M
    EDGE_RECORD *edge_ptr;
97
3.04M
    EDGE_INDEX edge_index;
98
3.04M
    if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
99
3.04M
                      &edge_index)) {
100
3.03M
      return NO_EDGE;
101
3.03M
    }
102
4.61k
    return make_edge_ref(node_ref, edge_index);
103
3.04M
  }
104
105
  /**
106
   * Fills the given NodeChildVector with all the unichar ids (and the
107
   * corresponding EDGE_REFs) for which there is an edge out of this node.
108
   */
109
0
  void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override {
110
0
    const EDGE_VECTOR &forward_edges = nodes_[static_cast<int>(node)]->forward_edges;
111
0
    for (auto &edge : forward_edges) {
112
0
      if (!word_end || end_of_word_from_edge_rec(edge)) {
113
0
        vec->push_back(
114
0
            NodeChild(unichar_id_from_edge_rec(edge), make_edge_ref(node, &edge - &forward_edges[0])));
115
0
      }
116
0
    }
117
0
  }
118
119
  /**
120
   * Returns the next node visited by following the edge
121
   * indicated by the given EDGE_REF.
122
   */
123
6.70k
  NODE_REF next_node(EDGE_REF edge_ref) const override {
124
6.70k
    if (edge_ref == NO_EDGE || num_edges_ == 0) {
125
0
      return NO_EDGE;
126
0
    }
127
6.70k
    return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
128
6.70k
  }
129
130
  /**
131
   * Returns true if the edge indicated by the given EDGE_REF
132
   * marks the end of a word.
133
   */
134
11.3k
  bool end_of_word(EDGE_REF edge_ref) const override {
135
11.3k
    if (edge_ref == NO_EDGE || num_edges_ == 0) {
136
0
      return false;
137
0
    }
138
11.3k
    return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
139
11.3k
  }
140
141
  /** Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. */
142
0
  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
143
0
    if (edge_ref == NO_EDGE || num_edges_ == 0) {
144
0
      return INVALID_UNICHAR_ID;
145
0
    }
146
0
    return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
147
0
  }
148
  // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
149
  // the edge dead.
150
0
  void KillEdge(EDGE_RECORD *edge_rec) const {
151
0
    *edge_rec &= ~letter_mask_;
152
0
    *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
153
0
  }
154
0
  bool DeadEdge(const EDGE_RECORD &edge_rec) const {
155
0
    return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
156
0
  }
157
158
  // Prints the contents of the node indicated by the given NODE_REF.
159
  // At most max_num_edges will be printed.
160
  void print_node(NODE_REF node, int max_num_edges) const override;
161
162
  // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
163
  // Eliminates redundant edges and returns the pointer to the SquishedDawg.
164
  // Note: the caller is responsible for deallocating memory associated
165
  // with the returned SquishedDawg pointer.
166
  SquishedDawg *trie_to_dawg();
167
168
  // Reads a list of words from the given file and adds into the Trie.
169
  // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
170
  // policy and information in the unicharset.
171
  // Returns false on error.
172
  bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
173
                              Trie::RTLReversePolicy reverse);
174
175
  // Reads a list of words from the given file.
176
  // Returns false on error.
177
  bool read_word_list(const char *filename, std::vector<std::string> *words);
178
  // Adds a list of words previously read using read_word_list to the trie
179
  // using the given unicharset and reverse_policy to convert to unichar-ids.
180
  // Returns false on error.
181
  bool add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
182
                     Trie::RTLReversePolicy reverse_policy);
183
184
  // Inserts the list of patterns from the given file into the Trie.
185
  // The pattern list file should contain one pattern per line in UTF-8 format.
186
  //
187
  // Each pattern can contain any non-whitespace characters, however only the
188
  // patterns that contain characters from the unicharset of the corresponding
189
  // language will be useful.
190
  // The only meta character is '\'. To be used in a pattern as an ordinary
191
  // string it should be escaped with '\' (e.g. string "C:\Documents" should
192
  // be written in the patterns file as "C:\\Documents").
193
  // This function supports a very limited regular expression syntax. One can
194
  // express a character, a certain character class and a number of times the
195
  // entity should be repeated in the pattern.
196
  //
197
  // To denote a character class use one of:
198
  // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
199
  // \d - unichar for which UNICHARSET::get_isdigit() is true
200
  // \n - unichar for which UNICHARSET::get_isdigit() or
201
  //      UNICHARSET::isalpha() are true
202
  // \p - unichar for which UNICHARSET::get_ispunct() is true
203
  // \a - unichar for which UNICHARSET::get_islower() is true
204
  // \A - unichar for which UNICHARSET::get_isupper() is true
205
  //
206
  // \* could be specified after each character or pattern to indicate that
207
  // the character/pattern can be repeated any number of times before the next
208
  // character/pattern occurs.
209
  //
210
  // Examples:
211
  // 1-8\d\d-GOOG-411 will be expanded to strings:
212
  // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
213
  //
214
  // http://www.\n\*.com will be expanded to strings like:
215
  // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
216
  //
217
  // Note: In choosing which patterns to include please be aware of the fact
218
  // providing very generic patterns will make tesseract run slower.
219
  // For example \n\* at the beginning of the pattern will make Tesseract
220
  // consider all the combinations of proposed character choices for each
221
  // of the segmentations, which will be unacceptably slow.
222
  // Because of potential problems with speed that could be difficult to
223
  // identify, each user pattern has to have at least kSaneNumConcreteChars
224
  // concrete characters from the unicharset at the beginning.
225
  bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
226
227
  // Initializes the values of *_pattern_ unichar ids.
228
  // This function should be called before calling read_pattern_list().
229
  void initialize_patterns(UNICHARSET *unicharset);
230
231
  // Fills in the given unichar id vector with the unichar ids that represent
232
  // the patterns of the character classes of the given unichar_id.
233
  void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
234
                              std::vector<UNICHAR_ID> *vec) const override;
235
236
  // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
237
  // a self loop and the given unichar_id matches the unichar_id stored in the
238
  // EDGE_RECORD, returns NO_EDGE otherwise.
239
  EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id,
240
0
                             bool word_end) const override {
241
0
    if (edge_ref == NO_EDGE) {
242
0
      return NO_EDGE;
243
0
    }
244
0
    EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
245
0
    return (marker_flag_from_edge_rec(*edge_rec) &&
246
0
            unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
247
0
            word_end == end_of_word_from_edge_rec(*edge_rec))
248
0
               ? edge_ref
249
0
               : NO_EDGE;
250
0
  }
251
252
  // Adds a word to the Trie (creates the necessary nodes and edges).
253
  //
254
  // If repetitions vector is not nullptr, each entry in the vector indicates
255
  // whether the unichar id with the corresponding index in the word is allowed
256
  // to repeat an unlimited number of times. For each entry that is true, MARKER
257
  // flag of the corresponding edge created for this unichar id is set to true).
258
  //
259
  // Return true if add succeeded, false otherwise (e.g. when a word contained
260
  // an invalid unichar id or the trie was getting too large and was cleared).
261
  bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
262
10
  bool add_word_to_dawg(const WERD_CHOICE &word) {
263
10
    return add_word_to_dawg(word, nullptr);
264
10
  }
265
266
protected:
267
  // The structure of an EDGE_REF for Trie edges is as follows:
268
  // [LETTER_START_BIT, flag_start_bit_):
269
  //                             edge index in *_edges in a TRIE_NODE_RECORD
270
  // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
271
  //
272
  // With this arrangement there are enough bits to represent edge indices
273
  // (each node can have at most unicharset_size_ forward edges and
274
  // the position of flag_start_bit is set to be log2(unicharset_size_)).
275
  // It is also possible to accommodate a maximum number of nodes that is at
276
  // least as large as that of the SquishedDawg representation (in SquishedDawg
277
  // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
278
  // the next node index).
279
  //
280
281
  // Returns the pointer to EDGE_RECORD after decoding the location
282
  // of the edge from the information in the given EDGE_REF.
283
  // This function assumes that EDGE_REF holds valid node/edge indices.
284
18.0k
  inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
285
18.0k
    int edge_index = static_cast<int>((edge_ref & letter_mask_) >> LETTER_START_BIT);
286
18.0k
    int node_index = static_cast<int>((edge_ref & deref_node_index_mask_) >> flag_start_bit_);
287
18.0k
    TRIE_NODE_RECORD *node_rec = nodes_[node_index];
288
18.0k
    return &(node_rec->forward_edges[edge_index]);
289
18.0k
  }
290
  /** Constructs EDGE_REF from the given node_index and edge_index. */
291
4.61k
  inline EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const {
292
4.61k
    return ((node_index << flag_start_bit_) | (edge_index << LETTER_START_BIT));
293
4.61k
  }
294
  /** Sets up this edge record to the requested values. */
295
  inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end,
296
118
                        UNICHAR_ID unichar_id) {
297
118
    EDGE_RECORD flags = 0;
298
118
    if (repeats) {
299
0
      flags |= MARKER_FLAG;
300
0
    }
301
118
    if (word_end) {
302
18
      flags |= WERD_END_FLAG;
303
18
    }
304
118
    if (direction == BACKWARD_EDGE) {
305
59
      flags |= DIRECTION_FLAG;
306
59
    }
307
118
    *edge = ((nxt << next_node_start_bit_) | (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
308
118
             (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
309
118
  }
310
  /** Prints the given EDGE_RECORD. */
311
0
  inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
312
0
    tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
313
0
            marker_flag_from_edge_rec(edge_rec) ? "R," : "",
314
0
            (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
315
0
            end_of_word_from_edge_rec(edge_rec) ? ",E" : "", unichar_id_from_edge_rec(edge_rec));
316
0
  }
317
  // Returns true if the next node in recorded the given EDGE_RECORD
318
  // has exactly one forward edge.
319
0
  inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
320
0
    NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
321
0
    return (node_ref != NO_EDGE && nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
322
0
  }
323
324
  // Prints the contents of the Trie.
325
  // At most max_num_edges will be printed for each node.
326
0
  void print_all(const char *msg, int max_num_edges) {
327
0
    tprintf("\n__________________________\n%s\n", msg);
328
0
    for (size_t i = 0; i < nodes_.size(); ++i) {
329
0
      print_node(i, max_num_edges);
330
0
    }
331
0
    tprintf("__________________________\n");
332
0
  }
333
334
  // Finds the edge with the given direction, word_end and unichar_id
335
  // in the node indicated by node_ref. Fills in the pointer to the
336
  // EDGE_RECORD and the index of the edge with the values
337
  // corresponding to the edge found. Returns true if an edge was found.
338
  bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
339
                    UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
340
341
  // Adds an single edge linkage between node1 and node2 in the direction
342
  // indicated by direction argument.
343
  bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end,
344
                        UNICHAR_ID unichar_id);
345
346
  // Adds forward edge linkage from node1 to node2 and the corresponding
347
  // backward edge linkage in the other direction.
348
  bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end,
349
59
                    UNICHAR_ID unichar_id) {
350
59
    return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, word_end, unichar_id) &&
351
59
            add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, word_end, unichar_id));
352
59
  }
353
354
  // Sets the word ending flags in an already existing edge pair.
355
  // Returns true on success.
356
  void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats,
357
                       UNICHAR_ID unichar_id);
358
359
  // Allocates space for a new node in the Trie.
360
  NODE_REF new_dawg_node();
361
362
  // Removes a single edge linkage to between node1 and node2 in the
363
  // direction indicated by direction argument.
364
  void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
365
                           UNICHAR_ID unichar_id);
366
367
  // Removes forward edge linkage from node1 to node2 and the corresponding
368
  // backward edge linkage in the other direction.
369
0
  void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) {
370
0
    remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
371
0
    remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
372
0
  }
373
374
  // Compares edge1 and edge2 in the given node to see if they point to two
375
  // next nodes that could be collapsed. If they do, performs the reduction
376
  // and returns true.
377
  bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2);
378
379
  // Assuming that edge_index indicates the first edge in a group of edges
380
  // in this node with a particular letter value, looks through these edges
381
  // to see if any of them can be collapsed. If so does it. Returns to the
382
  // caller when all edges with this letter have been reduced.
383
  // Returns true if further reduction is possible with this same letter.
384
  bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
385
                             EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes);
386
387
  /**
388
   * Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in
389
   * increasing order of unichar ids. This function is normally called
390
   * for all edges in a single node, and since number of edges in each node
391
   * is usually quite small, selection sort is used.
392
   */
393
  void sort_edges(EDGE_VECTOR *edges);
394
395
  /** Eliminates any redundant edges from this node in the Trie. */
396
  void reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes);
397
398
  // Returns the pattern unichar id for the given character class code.
399
  UNICHAR_ID character_class_to_pattern(char ch);
400
401
  // Member variables
402
  TRIE_NODES nodes_; // vector of nodes in the Trie
403
  // Freelist of edges in the root backwards node that were previously zeroed.
404
  std::vector<EDGE_INDEX> root_back_freelist_;
405
  uint64_t num_edges_ = 0;             // sum of all edges (forward and backward)
406
  uint64_t deref_direction_mask_ = 0;  // mask for EDGE_REF to extract direction
407
  uint64_t deref_node_index_mask_ = 0; // mask for EDGE_REF to extract node index
408
  // Variables for translating character class codes denoted in user patterns
409
  // file to the unichar ids used to represent them in a Trie.
410
  UNICHAR_ID alpha_pattern_ = 0;
411
  UNICHAR_ID digit_pattern_ = 0;
412
  UNICHAR_ID alphanum_pattern_ = 0;
413
  UNICHAR_ID punc_pattern_ = 0;
414
  UNICHAR_ID lower_pattern_ = 0;
415
  UNICHAR_ID upper_pattern_ = 0;
416
  bool initialized_patterns_ = false;
417
};
418
419
} // namespace tesseract
420
421
#endif