Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/src/dict/trie.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * File:         trie.cpp  (Formerly trie.c)
4
 * Description:  Functions to build a trie data structure.
5
 * Author:       Mark Seaman, OCR Technology
6
 *
7
 * (c) Copyright 1987, Hewlett-Packard Company.
8
 ** Licensed under the Apache License, Version 2.0 (the "License");
9
 ** you may not use this file except in compliance with the License.
10
 ** You may obtain a copy of the License at
11
 ** http://www.apache.org/licenses/LICENSE-2.0
12
 ** Unless required by applicable law or agreed to in writing, software
13
 ** distributed under the License is distributed on an "AS IS" BASIS,
14
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 ** See the License for the specific language governing permissions and
16
 ** limitations under the License.
17
 *
18
 *****************************************************************************/
19
/*----------------------------------------------------------------------
20
              I n c l u d e s
21
----------------------------------------------------------------------*/
22
23
#include "trie.h"
24
25
#include "dawg.h"
26
#include "dict.h"
27
#include "helpers.h"
28
#include "kdpair.h"
29
30
namespace tesseract {
31
32
const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
33
const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
34
const char kForceReverse[] = "RRP_FORCE_REVERSE";
35
36
const char *const RTLReversePolicyNames[] = {kDoNotReverse, kReverseIfHasRTL, kForceReverse};
37
38
const char Trie::kAlphaPatternUnicode[] = "\u2000";
39
const char Trie::kDigitPatternUnicode[] = "\u2001";
40
const char Trie::kAlphanumPatternUnicode[] = "\u2002";
41
const char Trie::kPuncPatternUnicode[] = "\u2003";
42
const char Trie::kLowerPatternUnicode[] = "\u2004";
43
const char Trie::kUpperPatternUnicode[] = "\u2005";
44
45
0
const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
46
0
  return RTLReversePolicyNames[reverse_policy];
47
0
}
48
49
// Reset the Trie to empty.
50
0
void Trie::clear() {
51
0
  for (auto node : nodes_) {
52
0
    delete node;
53
0
  }
54
0
  nodes_.clear();
55
0
  root_back_freelist_.clear();
56
0
  num_edges_ = 0;
57
0
  new_dawg_node(); // Need to allocate node 0.
58
0
}
59
60
bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
61
                        UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr,
62
4.13M
                        EDGE_INDEX *edge_index) const {
63
4.13M
  if (debug_level_ == 3) {
64
0
    tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
65
0
            " direction %d word_end %d unichar_id %d, exploring node:\n",
66
0
            node_ref, next_node, direction, word_end, unichar_id);
67
0
    if (node_ref != NO_EDGE) {
68
0
      print_node(node_ref, nodes_[node_ref]->forward_edges.size());
69
0
    }
70
0
  }
71
4.13M
  if (node_ref == NO_EDGE) {
72
0
    return false;
73
0
  }
74
4.13M
  assert(static_cast<size_t>(node_ref) < nodes_.size());
75
4.13M
  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? nodes_[node_ref]->forward_edges
76
4.13M
                                                 : nodes_[node_ref]->backward_edges;
77
4.13M
  int vec_size = vec.size();
78
4.13M
  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
79
3.94M
    EDGE_INDEX start = 0;
80
3.94M
    EDGE_INDEX end = vec_size - 1;
81
3.94M
    EDGE_INDEX k;
82
3.94M
    int compare;
83
11.2M
    while (start <= end) {
84
7.31M
      k = (start + end) >> 1; // (start + end) / 2
85
7.31M
      compare = given_greater_than_edge_rec(next_node, word_end, unichar_id, vec[k]);
86
7.31M
      if (compare == 0) { // given == vec[k]
87
49.6k
        *edge_ptr = &(vec[k]);
88
49.6k
        *edge_index = k;
89
49.6k
        return true;
90
7.26M
      } else if (compare == 1) { // given > vec[k]
91
5.14M
        start = k + 1;
92
5.14M
      } else { // given < vec[k]
93
2.11M
        end = k - 1;
94
2.11M
      }
95
7.31M
    }
96
3.94M
  } else { // linear search
97
398k
    for (int i = 0; i < vec_size; ++i) {
98
233k
      EDGE_RECORD &edge_rec = vec[i];
99
233k
      if (edge_rec_match(next_node, word_end, unichar_id, next_node_from_edge_rec(edge_rec),
100
233k
                         end_of_word_from_edge_rec(edge_rec), unichar_id_from_edge_rec(edge_rec))) {
101
21.2k
        *edge_ptr = &(edge_rec);
102
21.2k
        *edge_index = i;
103
21.2k
        return true;
104
21.2k
      }
105
233k
    }
106
186k
  }
107
4.06M
  return false; // not found
108
4.13M
}
109
110
bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, int direction,
111
146
                            bool word_end, UNICHAR_ID unichar_id) {
112
146
  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges)
113
146
                                                 : &(nodes_[node1]->backward_edges);
114
146
  unsigned search_index;
115
146
  if (node1 == 0 && direction == FORWARD_EDGE) {
116
10
    search_index = 0; // find the index to make the add sorted
117
29
    while (search_index < vec->size() &&
118
29
           given_greater_than_edge_rec(node2, word_end, unichar_id, (*vec)[search_index]) == 1) {
119
19
      search_index++;
120
19
    }
121
136
  } else {
122
136
    search_index = vec->size(); // add is unsorted, so index does not matter
123
136
  }
124
146
  EDGE_RECORD edge_rec;
125
146
  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
126
146
  if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
127
0
    EDGE_INDEX edge_index = root_back_freelist_.back();
128
0
    root_back_freelist_.pop_back();
129
0
    (*vec)[edge_index] = edge_rec;
130
146
  } else if (search_index < vec->size()) {
131
4
    vec->insert(vec->begin() + search_index, edge_rec);
132
142
  } else {
133
142
    vec->push_back(edge_rec);
134
142
  }
135
146
  if (debug_level_ > 1) {
136
0
    tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
137
0
    print_edge_rec(edge_rec);
138
0
    tprintf("\n");
139
0
  }
140
146
  num_edges_++;
141
146
  return true;
142
146
}
143
144
void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool marker_flag,
145
2
                           UNICHAR_ID unichar_id) {
146
2
  EDGE_RECORD *back_edge_ptr;
147
2
  EDGE_INDEX back_edge_index;
148
2
  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, unichar_id, &back_edge_ptr,
149
2
                           &back_edge_index));
150
2
  if (marker_flag) {
151
0
    *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
152
0
    *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
153
0
  }
154
  // Mark both directions as end of word.
155
2
  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
156
2
  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
157
2
}
158
159
17
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
160
17
  if (word.length() <= 0) {
161
0
    return false; // can't add empty words
162
0
  }
163
17
  if (repetitions != nullptr) {
164
0
    ASSERT_HOST(repetitions->size() == word.length());
165
0
  }
166
  // Make sure the word does not contain invalid unchar ids.
167
102
  for (unsigned i = 0; i < word.length(); ++i) {
168
85
    if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) {
169
0
      return false;
170
0
    }
171
85
  }
172
173
17
  EDGE_RECORD *edge_ptr;
174
17
  NODE_REF last_node = 0;
175
17
  NODE_REF the_next_node;
176
17
  bool marker_flag = false;
177
17
  EDGE_INDEX edge_index;
178
17
  int32_t still_finding_chars = true;
179
17
  int32_t word_end = false;
180
17
  bool add_failed = false;
181
17
  bool found;
182
183
17
  if (debug_level_ > 1) {
184
0
    word.print("\nAdding word: ");
185
0
  }
186
187
17
  UNICHAR_ID unichar_id;
188
17
  unsigned i;
189
85
  for (i = 0; i < word.length() - 1; ++i) {
190
68
    unichar_id = word.unichar_id(i);
191
68
    marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
192
68
    if (debug_level_ > 1) {
193
0
      tprintf("Adding letter %d\n", unichar_id);
194
0
    }
195
68
    if (still_finding_chars) {
196
25
      found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
197
25
                           &edge_index);
198
25
      if (found && debug_level_ > 1) {
199
0
        tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", edge_index, last_node);
200
0
      }
201
25
      if (!found) {
202
15
        still_finding_chars = false;
203
15
      } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
204
        // We hit the end of an existing word, but the new word is longer.
205
        // In this case we have to disconnect the existing word from the
206
        // backwards root node, mark the current position as end-of-word
207
        // and add new nodes for the increased length. Disconnecting the
208
        // existing word from the backwards root node requires a linear
209
        // search, so it is much faster to add the longest words first,
210
        // to avoid having to come here.
211
0
        word_end = true;
212
0
        still_finding_chars = false;
213
0
        remove_edge(last_node, 0, word_end, unichar_id);
214
10
      } else {
215
        // We have to add a new branch here for the new word.
216
10
        if (marker_flag) {
217
0
          set_marker_flag_in_edge_rec(edge_ptr);
218
0
        }
219
10
        last_node = next_node_from_edge_rec(*edge_ptr);
220
10
      }
221
25
    }
222
68
    if (!still_finding_chars) {
223
58
      the_next_node = new_dawg_node();
224
58
      if (debug_level_ > 1) {
225
0
        tprintf("adding node " REFFORMAT "\n", the_next_node);
226
0
      }
227
58
      if (the_next_node == 0) {
228
0
        add_failed = true;
229
0
        break;
230
0
      }
231
58
      if (!add_new_edge(last_node, the_next_node, marker_flag, word_end, unichar_id)) {
232
0
        add_failed = true;
233
0
        break;
234
0
      }
235
58
      word_end = false;
236
58
      last_node = the_next_node;
237
58
    }
238
68
  }
239
17
  the_next_node = 0;
240
17
  unichar_id = word.unichar_id(i);
241
17
  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
242
17
  if (debug_level_ > 1) {
243
0
    tprintf("Adding letter %d\n", unichar_id);
244
0
  }
245
17
  if (still_finding_chars &&
246
17
      edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, unichar_id, &edge_ptr, &edge_index)) {
247
    // An extension of this word already exists in the trie, so we
248
    // only have to add the ending flags in both directions.
249
2
    add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), marker_flag, unichar_id);
250
15
  } else {
251
    // Add a link to node 0. All leaves connect to node 0 so the back links can
252
    // be used in reduction to a dawg. This root backward node has one edge
253
    // entry for every word, (except prefixes of longer words) so it is huge.
254
15
    if (!add_failed && !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) {
255
0
      add_failed = true;
256
0
    }
257
15
  }
258
17
  if (add_failed) {
259
0
    tprintf("Re-initializing document dictionary...\n");
260
0
    clear();
261
0
    return false;
262
17
  } else {
263
17
    return true;
264
17
  }
265
17
}
266
267
66
NODE_REF Trie::new_dawg_node() {
268
66
  auto *node = new TRIE_NODE_RECORD();
269
66
  nodes_.push_back(node);
270
66
  return nodes_.size() - 1;
271
66
}
272
273
bool Trie::read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
274
0
                                  Trie::RTLReversePolicy reverse_policy) {
275
0
  std::vector<std::string> word_list;
276
0
  if (!read_word_list(filename, &word_list)) {
277
0
    return false;
278
0
  }
279
0
  std::sort(word_list.begin(), word_list.end(),
280
0
            [](auto &s1, auto &s2) { return s1.size() > s2.size(); });
281
0
  return add_word_list(word_list, unicharset, reverse_policy);
282
0
}
283
284
0
bool Trie::read_word_list(const char *filename, std::vector<std::string> *words) {
285
0
  FILE *word_file;
286
0
  char line_str[CHARS_PER_LINE];
287
0
  int word_count = 0;
288
289
0
  word_file = fopen(filename, "rb");
290
0
  if (word_file == nullptr) {
291
0
    return false;
292
0
  }
293
294
0
  while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
295
0
    chomp_string(line_str); // remove newline
296
0
    std::string word_str(line_str);
297
0
    ++word_count;
298
0
    if (debug_level_ && word_count % 10000 == 0) {
299
0
      tprintf("Read %d words so far\n", word_count);
300
0
    }
301
0
    words->push_back(word_str);
302
0
  }
303
0
  if (debug_level_) {
304
0
    tprintf("Read %d words total.\n", word_count);
305
0
  }
306
0
  fclose(word_file);
307
0
  return true;
308
0
}
309
310
bool Trie::add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
311
0
                         Trie::RTLReversePolicy reverse_policy) {
312
0
  for (const auto &i : words) {
313
0
    WERD_CHOICE word(i.c_str(), unicharset);
314
0
    if (word.empty() || word.contains_unichar_id(INVALID_UNICHAR_ID)) {
315
0
      continue;
316
0
    }
317
0
    if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && word.has_rtl_unichar_id()) ||
318
0
        reverse_policy == RRP_FORCE_REVERSE) {
319
0
      word.reverse_and_mirror_unichar_ids();
320
0
    }
321
0
    if (!word_in_dawg(word)) {
322
0
      add_word_to_dawg(word);
323
0
      if (!word_in_dawg(word)) {
324
0
        tprintf("Error: word '%s' not in DAWG after adding it\n", i.c_str());
325
0
        return false;
326
0
      }
327
0
    }
328
0
  }
329
0
  return true;
330
0
}
331
332
0
void Trie::initialize_patterns(UNICHARSET *unicharset) {
333
0
  unicharset->unichar_insert(kAlphaPatternUnicode);
334
0
  alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);
335
0
  unicharset->unichar_insert(kDigitPatternUnicode);
336
0
  digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);
337
0
  unicharset->unichar_insert(kAlphanumPatternUnicode);
338
0
  alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);
339
0
  unicharset->unichar_insert(kPuncPatternUnicode);
340
0
  punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);
341
0
  unicharset->unichar_insert(kLowerPatternUnicode);
342
0
  lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);
343
0
  unicharset->unichar_insert(kUpperPatternUnicode);
344
0
  upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
345
0
  initialized_patterns_ = true;
346
0
  unicharset_size_ = unicharset->size();
347
0
}
348
349
void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
350
0
                                  std::vector<UNICHAR_ID> *vec) const {
351
0
  bool is_alpha = unicharset.get_isalpha(unichar_id);
352
0
  if (is_alpha) {
353
0
    vec->push_back(alpha_pattern_);
354
0
    vec->push_back(alphanum_pattern_);
355
0
    if (unicharset.get_islower(unichar_id)) {
356
0
      vec->push_back(lower_pattern_);
357
0
    } else if (unicharset.get_isupper(unichar_id)) {
358
0
      vec->push_back(upper_pattern_);
359
0
    }
360
0
  }
361
0
  if (unicharset.get_isdigit(unichar_id)) {
362
0
    vec->push_back(digit_pattern_);
363
0
    if (!is_alpha) {
364
0
      vec->push_back(alphanum_pattern_);
365
0
    }
366
0
  }
367
0
  if (unicharset.get_ispunctuation(unichar_id)) {
368
0
    vec->push_back(punc_pattern_);
369
0
  }
370
0
}
371
372
0
UNICHAR_ID Trie::character_class_to_pattern(char ch) {
373
0
  if (ch == 'c') {
374
0
    return alpha_pattern_;
375
0
  } else if (ch == 'd') {
376
0
    return digit_pattern_;
377
0
  } else if (ch == 'n') {
378
0
    return alphanum_pattern_;
379
0
  } else if (ch == 'p') {
380
0
    return punc_pattern_;
381
0
  } else if (ch == 'a') {
382
0
    return lower_pattern_;
383
0
  } else if (ch == 'A') {
384
0
    return upper_pattern_;
385
0
  } else {
386
0
    return INVALID_UNICHAR_ID;
387
0
  }
388
0
}
389
390
0
bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset) {
391
0
  if (!initialized_patterns_) {
392
0
    tprintf("please call initialize_patterns() before read_pattern_list()\n");
393
0
    return false;
394
0
  }
395
396
0
  FILE *pattern_file = fopen(filename, "rb");
397
0
  if (pattern_file == nullptr) {
398
0
    tprintf("Error opening pattern file %s\n", filename);
399
0
    return false;
400
0
  }
401
402
0
  int pattern_count = 0;
403
0
  char string[CHARS_PER_LINE];
404
0
  while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
405
0
    chomp_string(string); // remove newline
406
    // Parse the pattern and construct a unichar id vector.
407
    // Record the number of repetitions of each unichar in the parallel vector.
408
0
    WERD_CHOICE word(&unicharset);
409
0
    std::vector<bool> repetitions_vec;
410
0
    const char *str_ptr = string;
411
0
    int step = unicharset.step(str_ptr);
412
0
    bool failed = false;
413
0
    while (step > 0) {
414
0
      UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
415
0
      if (step == 1 && *str_ptr == '\\') {
416
0
        ++str_ptr;
417
0
        if (*str_ptr == '\\') { // regular '\' unichar that was escaped
418
0
          curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
419
0
        } else {
420
#if 0 // TODO: This code should be enabled if kSaneNumConcreteChars != 0.
421
          if (word.length() < kSaneNumConcreteChars) {
422
            tprintf(
423
                "Please provide at least %d concrete characters at the"
424
                " beginning of the pattern\n",
425
                kSaneNumConcreteChars);
426
            failed = true;
427
            break;
428
          }
429
#endif
430
          // Parse character class from expression.
431
0
          curr_unichar_id = character_class_to_pattern(*str_ptr);
432
0
        }
433
0
      } else {
434
0
        curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
435
0
      }
436
0
      if (curr_unichar_id == INVALID_UNICHAR_ID) {
437
0
        failed = true;
438
0
        break; // failed to parse this pattern
439
0
      }
440
0
      word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
441
0
      repetitions_vec.push_back(false);
442
0
      str_ptr += step;
443
0
      step = unicharset.step(str_ptr);
444
      // Check if there is a repetition pattern specified after this unichar.
445
0
      if (step == 1 && *str_ptr == '\\' && *(str_ptr + 1) == '*') {
446
0
        repetitions_vec[repetitions_vec.size() - 1] = true;
447
0
        str_ptr += 2;
448
0
        step = unicharset.step(str_ptr);
449
0
      }
450
0
    }
451
0
    if (failed) {
452
0
      tprintf("Invalid user pattern %s\n", string);
453
0
      continue;
454
0
    }
455
    // Insert the pattern into the trie.
456
0
    if (debug_level_ > 2) {
457
0
      tprintf("Inserting expanded user pattern %s\n", word.debug_string().c_str());
458
0
    }
459
0
    if (!this->word_in_dawg(word)) {
460
0
      this->add_word_to_dawg(word, &repetitions_vec);
461
0
      if (!this->word_in_dawg(word)) {
462
0
        tprintf("Error: failed to insert pattern '%s'\n", string);
463
0
      }
464
0
    }
465
0
    ++pattern_count;
466
0
  }
467
0
  if (debug_level_) {
468
0
    tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
469
0
  }
470
0
  fclose(pattern_file);
471
0
  return true;
472
0
}
473
474
void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
475
0
                               UNICHAR_ID unichar_id) {
476
0
  EDGE_RECORD *edge_ptr = nullptr;
477
0
  EDGE_INDEX edge_index = 0;
478
0
  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, unichar_id, &edge_ptr, &edge_index));
479
0
  if (debug_level_ > 1) {
480
0
    tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
481
0
    print_edge_rec(*edge_ptr);
482
0
    tprintf("\n");
483
0
  }
484
0
  if (direction == FORWARD_EDGE) {
485
0
    nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
486
0
  } else if (node1 == 0) {
487
0
    KillEdge(&nodes_[node1]->backward_edges[edge_index]);
488
0
    root_back_freelist_.push_back(edge_index);
489
0
  } else {
490
0
    nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
491
0
  }
492
0
  --num_edges_;
493
0
}
494
495
// Some optimizations employed in add_word_to_dawg and trie_to_dawg:
496
// 1 Avoid insertion sorting or bubble sorting the tail root node
497
//   (back links on node 0, a list of all the leaves.). The node is
498
//   huge, and sorting it with n^2 time is terrible.
499
// 2 Avoid using vector::erase on the tail root node.
500
//   (a) During add of words to the trie, zero-out the unichars and
501
//       keep a freelist of spaces to re-use.
502
//   (b) During reduction, just zero-out the unichars of deleted back
503
//       links, skipping zero entries while searching.
504
// 3 Avoid linear search of the tail root node. This has to be done when
505
//   a suffix is added to an existing word. Adding words by decreasing
506
//   length avoids this problem entirely. Words can still be added in
507
//   any order, but it is faster to add the longest first.
508
0
SquishedDawg *Trie::trie_to_dawg() {
509
0
  root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
510
0
  if (debug_level_ > 2) {
511
0
    print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
512
0
  }
513
0
  std::vector<bool> reduced_nodes(nodes_.size());
514
0
  this->reduce_node_input(0, reduced_nodes);
515
516
0
  if (debug_level_ > 2) {
517
0
    print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
518
0
  }
519
  // Build a translation map from node indices in nodes_ vector to
520
  // their target indices in EDGE_ARRAY.
521
0
  std::vector<NODE_REF> node_ref_map(nodes_.size() + 1);
522
0
  unsigned i;
523
0
  for (i = 0; i < nodes_.size(); ++i) {
524
0
    node_ref_map[i + 1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
525
0
  }
526
0
  int num_forward_edges = node_ref_map[i];
527
528
  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
529
  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
530
0
  auto edge_array = new EDGE_RECORD[num_forward_edges];
531
0
  EDGE_ARRAY edge_array_ptr = edge_array;
532
0
  for (i = 0; i < nodes_.size(); ++i) {
533
0
    TRIE_NODE_RECORD *node_ptr = nodes_[i];
534
0
    int end = node_ptr->forward_edges.size();
535
0
    for (int j = 0; j < end; ++j) {
536
0
      EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
537
0
      NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
538
0
      ASSERT_HOST(static_cast<size_t>(node_ref) < nodes_.size());
539
0
      UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
540
0
      link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
541
0
                end_of_word_from_edge_rec(edge_rec), unichar_id);
542
0
      if (j == end - 1) {
543
0
        set_marker_flag_in_edge_rec(edge_array_ptr);
544
0
      }
545
0
      ++edge_array_ptr;
546
0
    }
547
0
  }
548
549
0
  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_,
550
0
                          debug_level_);
551
0
}
552
553
bool Trie::eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
554
0
                                     const EDGE_RECORD &edge2) {
555
0
  if (debug_level_ > 1) {
556
0
    tprintf("\nCollapsing node %" PRIi64 ":\n", node);
557
0
    print_node(node, MAX_NODE_EDGES_DISPLAY);
558
0
    tprintf("Candidate edges: ");
559
0
    print_edge_rec(edge1);
560
0
    tprintf(", ");
561
0
    print_edge_rec(edge2);
562
0
    tprintf("\n\n");
563
0
  }
564
0
  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
565
0
  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
566
0
  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
567
  // Translate all edges going to/from next_node2 to go to/from next_node1.
568
0
  EDGE_RECORD *edge_ptr = nullptr;
569
0
  EDGE_INDEX edge_index;
570
  // The backward link in node to next_node2 will be zeroed out by the caller.
571
  // Copy all the backward links in next_node2 to node next_node1
572
0
  for (unsigned i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
573
0
    const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
574
0
    NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
575
0
    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
576
0
    int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
577
0
    bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
578
0
    add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, curr_word_end,
579
0
                     curr_unichar_id);
580
    // Relocate the corresponding forward edge in curr_next_node
581
0
    ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, curr_word_end,
582
0
                             curr_unichar_id, &edge_ptr, &edge_index));
583
0
    set_next_node_in_edge_rec(edge_ptr, next_node1);
584
0
  }
585
0
  int next_node2_num_edges =
586
0
      (next_node2_ptr->forward_edges.size() + next_node2_ptr->backward_edges.size());
587
0
  if (debug_level_ > 1) {
588
0
    tprintf("removed %d edges from node " REFFORMAT "\n", next_node2_num_edges, next_node2);
589
0
  }
590
0
  next_node2_ptr->forward_edges.clear();
591
0
  next_node2_ptr->backward_edges.clear();
592
0
  num_edges_ -= next_node2_num_edges;
593
0
  return true;
594
0
}
595
596
bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
597
0
                                 EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes) {
598
0
  if (debug_level_ > 1) {
599
0
    tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
600
0
  }
601
  // Compare each of the edge pairs with the given unichar_id.
602
0
  bool did_something = false;
603
0
  for (unsigned i = edge_index; i < backward_edges->size() - 1; ++i) {
604
    // Find the first edge that can be eliminated.
605
0
    UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
606
0
    while (i < backward_edges->size()) {
607
0
      if (!DeadEdge((*backward_edges)[i])) {
608
0
        curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
609
0
        if (curr_unichar_id != unichar_id) {
610
0
          return did_something;
611
0
        }
612
0
        if (can_be_eliminated((*backward_edges)[i])) {
613
0
          break;
614
0
        }
615
0
      }
616
0
      ++i;
617
0
    }
618
0
    if (i == backward_edges->size()) {
619
0
      break;
620
0
    }
621
0
    const EDGE_RECORD &edge_rec = (*backward_edges)[i];
622
    // Compare it to the rest of the edges with the given unichar_id.
623
0
    for (auto j = i + 1; j < backward_edges->size(); ++j) {
624
0
      const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
625
0
      if (DeadEdge(next_edge_rec)) {
626
0
        continue;
627
0
      }
628
0
      UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
629
0
      if (next_id != unichar_id) {
630
0
        break;
631
0
      }
632
0
      if (end_of_word_from_edge_rec(next_edge_rec) == end_of_word_from_edge_rec(edge_rec) &&
633
0
          can_be_eliminated(next_edge_rec) &&
634
0
          eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
635
0
        reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
636
0
        did_something = true;
637
0
        KillEdge(&(*backward_edges)[j]);
638
0
      }
639
0
    }
640
0
  }
641
0
  return did_something;
642
0
}
643
644
0
void Trie::sort_edges(EDGE_VECTOR *edges) {
645
0
  int num_edges = edges->size();
646
0
  if (num_edges <= 1) {
647
0
    return;
648
0
  }
649
0
  std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
650
0
  sort_vec.reserve(num_edges);
651
0
  for (int i = 0; i < num_edges; ++i) {
652
0
    sort_vec.emplace_back(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]);
653
0
  }
654
0
  std::sort(sort_vec.begin(), sort_vec.end());
655
0
  for (int i = 0; i < num_edges; ++i) {
656
0
    (*edges)[i] = sort_vec[i].data();
657
0
  }
658
0
}
659
660
0
void Trie::reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes) {
661
0
  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
662
0
  sort_edges(&backward_edges);
663
0
  if (debug_level_ > 1) {
664
0
    tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
665
0
    print_node(node, MAX_NODE_EDGES_DISPLAY);
666
0
  }
667
668
0
  EDGE_INDEX edge_index = 0;
669
0
  while (static_cast<size_t>(edge_index) < backward_edges.size()) {
670
0
    if (DeadEdge(backward_edges[edge_index])) {
671
0
      continue;
672
0
    }
673
0
    UNICHAR_ID unichar_id = unichar_id_from_edge_rec(backward_edges[edge_index]);
674
0
    while (reduce_lettered_edges(edge_index, unichar_id, node, &backward_edges, reduced_nodes)) {
675
0
      ;
676
0
    }
677
0
    while (static_cast<size_t>(++edge_index) < backward_edges.size()) {
678
0
      UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
679
0
      if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) {
680
0
        break;
681
0
      }
682
0
    }
683
0
  }
684
0
  reduced_nodes[node] = true; // mark as reduced
685
686
0
  if (debug_level_ > 1) {
687
0
    tprintf("Node " REFFORMAT " after reduction:\n", node);
688
0
    print_node(node, MAX_NODE_EDGES_DISPLAY);
689
0
  }
690
691
0
  for (auto &backward_edge : backward_edges) {
692
0
    if (DeadEdge(backward_edge)) {
693
0
      continue;
694
0
    }
695
0
    NODE_REF next_node = next_node_from_edge_rec(backward_edge);
696
0
    if (next_node != 0 && !reduced_nodes[next_node]) {
697
0
      reduce_node_input(next_node, reduced_nodes);
698
0
    }
699
0
  }
700
0
}
701
702
0
void Trie::print_node(NODE_REF node, int max_num_edges) const {
703
0
  if (node == NO_EDGE) {
704
0
    return; // nothing to print
705
0
  }
706
0
  TRIE_NODE_RECORD *node_ptr = nodes_[node];
707
0
  int num_fwd = node_ptr->forward_edges.size();
708
0
  int num_bkw = node_ptr->backward_edges.size();
709
0
  EDGE_VECTOR *vec;
710
0
  for (int dir = 0; dir < 2; ++dir) {
711
0
    if (dir == 0) {
712
0
      vec = &(node_ptr->forward_edges);
713
0
      tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
714
0
    } else {
715
0
      vec = &(node_ptr->backward_edges);
716
0
      tprintf("\t");
717
0
    }
718
0
    int i;
719
0
    for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && i < max_num_edges; ++i) {
720
0
      if (DeadEdge((*vec)[i])) {
721
0
        continue;
722
0
      }
723
0
      print_edge_rec((*vec)[i]);
724
0
      tprintf(" ");
725
0
    }
726
0
    if (dir == 0 ? i < num_fwd : i < num_bkw) {
727
0
      tprintf("...");
728
0
    }
729
0
    tprintf("\n");
730
0
  }
731
0
}
732
733
} // namespace tesseract