Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/dict/dawg.cpp
Line
Count
Source (jump to first uncovered line)
1
/********************************************************************************
2
 *
3
 * File:         dawg.cpp  (Formerly dawg.c)
4
 * Description:  Use a Directed Acyclic Word Graph
5
 * Author:       Mark Seaman, OCR Technology
6
 *
7
 * (c) Copyright 1987, Hewlett-Packard Company.
8
 ** Licensed under the Apache License, Version 2.0 (the "License");
9
 ** you may not use this file except in compliance with the License.
10
 ** You may obtain a copy of the License at
11
 ** http://www.apache.org/licenses/LICENSE-2.0
12
 ** Unless required by applicable law or agreed to in writing, software
13
 ** distributed under the License is distributed on an "AS IS" BASIS,
14
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 ** See the License for the specific language governing permissions and
16
 ** limitations under the License.
17
 *
18
 *********************************************************************************/
19
/*----------------------------------------------------------------------
20
              I n c l u d e s
21
----------------------------------------------------------------------*/
22
23
#include "dawg.h"
24
25
#include "dict.h"
26
#include "helpers.h"
27
#include "tprintf.h"
28
29
#include <memory>
30
31
/*----------------------------------------------------------------------
32
              F u n c t i o n s   f o r   D a w g
33
----------------------------------------------------------------------*/
34
namespace tesseract {
35
36
// Destructor.
37
// It is defined here, so the compiler can create a single vtable
38
// instead of weak vtables in every compilation unit.
39
6
Dawg::~Dawg() = default;
40
41
bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
42
0
                          bool requires_complete) const {
43
0
  if (word.empty()) {
44
0
    return !requires_complete;
45
0
  }
46
0
  NODE_REF node = 0;
47
0
  int end_index = word.length() - 1;
48
0
  for (int i = 0; i < end_index; i++) {
49
0
    EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
50
0
    if (edge == NO_EDGE) {
51
0
      return false;
52
0
    }
53
0
    if ((node = next_node(edge)) == 0) {
54
      // This only happens if all words following this edge terminate --
55
      // there are no larger words.  See Trie::add_word_to_dawg()
56
0
      return false;
57
0
    }
58
0
  }
59
  // Now check the last character.
60
0
  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
61
0
         NO_EDGE;
62
0
}
63
64
0
bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
65
0
  return prefix_in_dawg(word, true);
66
0
}
67
68
int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset,
69
0
                          bool enable_wildcard) const {
70
0
  if (filename == nullptr) {
71
0
    return 0;
72
0
  }
73
74
0
  FILE *word_file;
75
0
  char string[CHARS_PER_LINE];
76
0
  int misses = 0;
77
0
  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
78
79
0
  word_file = fopen(filename, "r");
80
0
  if (word_file == nullptr) {
81
0
    tprintf("Error: Could not open file %s\n", filename);
82
0
    ASSERT_HOST(word_file);
83
0
  }
84
85
0
  while (fgets(string, CHARS_PER_LINE, word_file) != nullptr) {
86
0
    chomp_string(string); // remove newline
87
0
    WERD_CHOICE word(string, unicharset);
88
0
    if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
89
0
      if (!match_words(&word, 0, 0,
90
0
                       enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
91
0
        tprintf("Missing word: %s\n", string);
92
0
        ++misses;
93
0
      }
94
0
    } else {
95
0
      tprintf("Failed to create a valid word from %s\n", string);
96
0
    }
97
0
  }
98
0
  fclose(word_file);
99
  // Make sure the user sees this with fprintf instead of tprintf.
100
0
  if (debug_level_) {
101
0
    tprintf("Number of lost words=%d\n", misses);
102
0
  }
103
0
  return misses;
104
0
}
105
106
void Dawg::iterate_words(const UNICHARSET &unicharset,
107
0
                         std::function<void(const WERD_CHOICE *)> cb) const {
108
0
  WERD_CHOICE word(&unicharset);
109
0
  iterate_words_rec(word, 0, cb);
110
0
}
111
112
static void CallWithUTF8(const std::function<void(const char *)> &cb,
113
0
                         const WERD_CHOICE *wc) {
114
0
  std::string s;
115
0
  wc->string_and_lengths(&s, nullptr);
116
0
  cb(s.c_str());
117
0
}
118
119
void Dawg::iterate_words(const UNICHARSET &unicharset,
120
0
                         const std::function<void(const char *)> &cb) const {
121
0
  using namespace std::placeholders; // for _1
122
0
  std::function<void(const WERD_CHOICE *)> shim(
123
0
      std::bind(CallWithUTF8, cb, _1));
124
0
  WERD_CHOICE word(&unicharset);
125
0
  iterate_words_rec(word, 0, shim);
126
0
}
127
128
void Dawg::iterate_words_rec(
129
    const WERD_CHOICE &word_so_far, NODE_REF to_explore,
130
0
    const std::function<void(const WERD_CHOICE *)> &cb) const {
131
0
  NodeChildVector children;
132
0
  this->unichar_ids_of(to_explore, &children, false);
133
0
  for (auto &i : children) {
134
0
    WERD_CHOICE next_word(word_so_far);
135
0
    next_word.append_unichar_id(i.unichar_id, 1, 0.0, 0.0);
136
0
    if (this->end_of_word(i.edge_ref)) {
137
0
      cb(&next_word);
138
0
    }
139
0
    NODE_REF next = next_node(i.edge_ref);
140
0
    if (next != 0) {
141
0
      iterate_words_rec(next_word, next, cb);
142
0
    }
143
0
  }
144
0
}
145
146
bool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node,
147
0
                       UNICHAR_ID wildcard) const {
148
0
  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
149
0
    bool any_matched = false;
150
0
    NodeChildVector vec;
151
0
    this->unichar_ids_of(node, &vec, false);
152
0
    for (auto &i : vec) {
153
0
      word->set_unichar_id(i.unichar_id, index);
154
0
      if (match_words(word, index, node, wildcard)) {
155
0
        any_matched = true;
156
0
      }
157
0
    }
158
0
    word->set_unichar_id(wildcard, index);
159
0
    return any_matched;
160
0
  } else {
161
0
    auto word_end = index == word->length() - 1;
162
0
    auto edge = edge_char_of(node, word->unichar_id(index), word_end);
163
0
    if (edge != NO_EDGE) { // normal edge in DAWG
164
0
      node = next_node(edge);
165
0
      if (word_end) {
166
0
        if (debug_level_ > 1) {
167
0
          word->print("match_words() found: ");
168
0
        }
169
0
        return true;
170
0
      } else if (node != 0) {
171
0
        return match_words(word, index + 1, node, wildcard);
172
0
      }
173
0
    }
174
0
  }
175
0
  return false;
176
0
}
177
178
20
void Dawg::init(int unicharset_size) {
179
20
  ASSERT_HOST(unicharset_size > 0);
180
20
  unicharset_size_ = unicharset_size;
181
  // Set bit masks. We will use the value unicharset_size_ as a null char, so
182
  // the actual number of unichars is unicharset_size_ + 1.
183
20
  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
184
20
  next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;
185
20
  letter_mask_ = ~(~0ull << flag_start_bit_);
186
20
  next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);
187
20
  flags_mask_ = ~(letter_mask_ | next_node_mask_);
188
20
}
189
190
/*----------------------------------------------------------------------
191
         F u n c t i o n s   f o r   S q u i s h e d    D a w g
192
----------------------------------------------------------------------*/
193
194
6
SquishedDawg::~SquishedDawg() {
195
6
  delete[] edges_;
196
6
}
197
198
EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
199
22.6M
                                    bool word_end) const {
200
22.6M
  EDGE_REF edge = node;
201
22.6M
  if (node == 0) { // binary search
202
13.8M
    EDGE_REF start = 0;
203
13.8M
    EDGE_REF end = num_forward_edges_in_node0 - 1;
204
13.8M
    int compare;
205
65.0M
    while (start <= end) {
206
58.2M
      edge = (start + end) >> 1; // (start + end) / 2
207
58.2M
      compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id,
208
58.2M
                                            edges_[edge]);
209
58.2M
      if (compare == 0) { // given == vec[k]
210
7.02M
        return edge;
211
51.2M
      } else if (compare == 1) { // given > vec[k]
212
27.0M
        start = edge + 1;
213
27.0M
      } else { // given < vec[k]
214
24.1M
        end = edge - 1;
215
24.1M
      }
216
58.2M
    }
217
13.8M
  } else { // linear search
218
8.81M
    if (edge != NO_EDGE && edge_occupied(edge)) {
219
102M
      do {
220
102M
        if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
221
102M
            (!word_end || end_of_word_from_edge_rec(edges_[edge]))) {
222
856k
          return (edge);
223
856k
        }
224
102M
      } while (!last_edge(edge++));
225
8.81M
    }
226
8.81M
  }
227
14.8M
  return (NO_EDGE); // not found
228
22.6M
}
229
230
16
int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
231
16
  EDGE_REF edge = node;
232
16
  int32_t num = 0;
233
234
16
  if (forward_edge(edge)) {
235
636
    do {
236
636
      num++;
237
636
    } while (!last_edge(edge++));
238
16
  }
239
240
16
  return (num);
241
16
}
242
243
0
void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
244
0
  if (node == NO_EDGE) {
245
0
    return; // nothing to print
246
0
  }
247
248
0
  EDGE_REF edge = node;
249
0
  const char *forward_string = "FORWARD";
250
0
  const char *backward_string = "       ";
251
252
0
  const char *last_string = "LAST";
253
0
  const char *not_last_string = "    ";
254
255
0
  const char *eow_string = "EOW";
256
0
  const char *not_eow_string = "   ";
257
258
0
  const char *direction;
259
0
  const char *is_last;
260
0
  const char *eow;
261
262
0
  UNICHAR_ID unichar_id;
263
264
0
  if (edge_occupied(edge)) {
265
0
    do {
266
0
      direction = forward_edge(edge) ? forward_string : backward_string;
267
0
      is_last = last_edge(edge) ? last_string : not_last_string;
268
0
      eow = end_of_word(edge) ? eow_string : not_eow_string;
269
270
0
      unichar_id = edge_letter(edge);
271
0
      tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
272
0
              edge, next_node(edge), unichar_id, direction, is_last, eow);
273
274
0
      if (edge - node > max_num_edges) {
275
0
        return;
276
0
      }
277
0
    } while (!last_edge(edge++));
278
279
0
    if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) {
280
0
      do {
281
0
        direction = forward_edge(edge) ? forward_string : backward_string;
282
0
        is_last = last_edge(edge) ? last_string : not_last_string;
283
0
        eow = end_of_word(edge) ? eow_string : not_eow_string;
284
285
0
        unichar_id = edge_letter(edge);
286
0
        tprintf(REFFORMAT " : next = " REFFORMAT
287
0
                          ", unichar_id = %d, %s %s %s\n",
288
0
                edge, next_node(edge), unichar_id, direction, is_last, eow);
289
290
0
        if (edge - node > MAX_NODE_EDGES_DISPLAY) {
291
0
          return;
292
0
        }
293
0
      } while (!last_edge(edge++));
294
0
    }
295
0
  } else {
296
0
    tprintf(REFFORMAT " : no edges in this node\n", node);
297
0
  }
298
0
  tprintf("\n");
299
0
}
300
301
0
void SquishedDawg::print_edge(EDGE_REF edge) const {
302
0
  if (edge == NO_EDGE) {
303
0
    tprintf("NO_EDGE\n");
304
0
  } else {
305
0
    tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = '%d', %s %s %s\n",
306
0
            edge, next_node(edge), edge_letter(edge),
307
0
            (forward_edge(edge) ? "FORWARD" : "       "),
308
0
            (last_edge(edge) ? "LAST" : "    "),
309
0
            (end_of_word(edge) ? "EOW" : ""));
310
0
  }
311
0
}
312
313
16
bool SquishedDawg::read_squished_dawg(TFile *file) {
314
16
  if (debug_level_) {
315
0
    tprintf("Reading squished dawg\n");
316
0
  }
317
318
  // Read the magic number and check that it matches kDawgMagicNumber, as
319
  // auto-endian fixing should make sure it is always correct.
320
16
  int16_t magic;
321
16
  if (!file->DeSerialize(&magic)) {
322
0
    return false;
323
0
  }
324
16
  if (magic != kDawgMagicNumber) {
325
0
    tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
326
0
    return false;
327
0
  }
328
329
16
  int32_t unicharset_size;
330
16
  if (!file->DeSerialize(&unicharset_size)) {
331
0
    return false;
332
0
  }
333
16
  if (!file->DeSerialize(&num_edges_)) {
334
0
    return false;
335
0
  }
336
16
  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
337
16
  Dawg::init(unicharset_size);
338
339
16
  edges_ = new EDGE_RECORD[num_edges_];
340
16
  if (!file->DeSerialize(&edges_[0], num_edges_)) {
341
0
    return false;
342
0
  }
343
16
  if (debug_level_ > 2) {
344
0
    tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
345
0
            type_, lang_.c_str(), perm_, unicharset_size_, num_edges_);
346
0
    for (EDGE_REF edge = 0; edge < num_edges_; ++edge) {
347
0
      print_edge(edge);
348
0
    }
349
0
  }
350
16
  return true;
351
16
}
352
353
std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
354
0
    int32_t *num_nodes) const {
355
0
  EDGE_REF edge;
356
0
  std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
357
0
  int32_t node_counter;
358
0
  int32_t num_edges;
359
360
0
  for (edge = 0; edge < num_edges_; edge++) { // init all slots
361
0
    node_map[edge] = -1;
362
0
  }
363
364
0
  node_counter = num_forward_edges(0);
365
366
0
  *num_nodes = 0;
367
0
  for (edge = 0; edge < num_edges_; edge++) { // search all slots
368
369
0
    if (forward_edge(edge)) {
370
0
      (*num_nodes)++; // count nodes links
371
0
      node_map[edge] = (edge ? node_counter : 0);
372
0
      num_edges = num_forward_edges(edge);
373
0
      if (edge != 0) {
374
0
        node_counter += num_edges;
375
0
      }
376
0
      edge += num_edges;
377
0
      if (edge >= num_edges_) {
378
0
        break;
379
0
      }
380
0
      if (backward_edge(edge)) {
381
0
        while (!last_edge(edge++)) {
382
0
          ;
383
0
        }
384
0
      }
385
0
      edge--;
386
0
    }
387
0
  }
388
0
  return node_map;
389
0
}
390
391
0
bool SquishedDawg::write_squished_dawg(TFile *file) {
392
0
  EDGE_REF edge;
393
0
  int32_t num_edges;
394
0
  int32_t node_count = 0;
395
0
  EDGE_REF old_index;
396
0
  EDGE_RECORD temp_record;
397
398
0
  if (debug_level_) {
399
0
    tprintf("write_squished_dawg\n");
400
0
  }
401
402
0
  std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
403
404
  // Write the magic number to help detecting a change in endianness.
405
0
  int16_t magic = kDawgMagicNumber;
406
0
  if (!file->Serialize(&magic)) {
407
0
    return false;
408
0
  }
409
0
  if (!file->Serialize(&unicharset_size_)) {
410
0
    return false;
411
0
  }
412
413
  // Count the number of edges in this Dawg.
414
0
  num_edges = 0;
415
0
  for (edge = 0; edge < num_edges_; edge++) {
416
0
    if (forward_edge(edge)) {
417
0
      num_edges++;
418
0
    }
419
0
  }
420
421
  // Write edge count to file.
422
0
  if (!file->Serialize(&num_edges)) {
423
0
    return false;
424
0
  }
425
426
0
  if (debug_level_) {
427
0
    tprintf("%d nodes in DAWG\n", node_count);
428
0
    tprintf("%d edges in DAWG\n", num_edges);
429
0
  }
430
431
0
  for (edge = 0; edge < num_edges_; edge++) {
432
0
    if (forward_edge(edge)) { // write forward edges
433
0
      do {
434
0
        old_index = next_node_from_edge_rec(edges_[edge]);
435
0
        set_next_node(edge, node_map[old_index]);
436
0
        temp_record = edges_[edge];
437
0
        if (!file->Serialize(&temp_record)) {
438
0
          return false;
439
0
        }
440
0
        set_next_node(edge, old_index);
441
0
      } while (!last_edge(edge++));
442
443
0
      if (edge >= num_edges_) {
444
0
        break;
445
0
      }
446
0
      if (backward_edge(edge)) { // skip back links
447
0
        while (!last_edge(edge++)) {
448
0
          ;
449
0
        }
450
0
      }
451
452
0
      edge--;
453
0
    }
454
0
  }
455
0
  return true;
456
0
}
457
458
} // namespace tesseract