Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/dict/dawg.h
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * File:         dawg.h
4
 * Description:  Definition of a class that represents Directed Acyclic Word
5
 *               Graph (DAWG), functions to build and manipulate the DAWG.
6
 * Author:       Mark Seaman, SW Productivity
7
 *
8
 * (c) Copyright 1987, Hewlett-Packard Company.
9
 ** Licensed under the Apache License, Version 2.0 (the "License");
10
 ** you may not use this file except in compliance with the License.
11
 ** You may obtain a copy of the License at
12
 ** http://www.apache.org/licenses/LICENSE-2.0
13
 ** Unless required by applicable law or agreed to in writing, software
14
 ** distributed under the License is distributed on an "AS IS" BASIS,
15
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 ** See the License for the specific language governing permissions and
17
 ** limitations under the License.
18
 *
19
 *****************************************************************************/
20
21
#ifndef DICT_DAWG_H_
22
#define DICT_DAWG_H_
23
24
/*----------------------------------------------------------------------
25
              I n c l u d e s
26
----------------------------------------------------------------------*/
27
28
#include <cinttypes>  // for PRId64
29
#include <functional> // for std::function
30
#include <memory>
31
#include "elst.h"
32
#include "params.h"
33
#include "ratngs.h"
34
35
#ifndef __GNUC__
36
#  ifdef _WIN32
37
#    define NO_EDGE static_cast<int64_t>(0xffffffffffffffffi64)
38
#  endif /*_WIN32*/
39
#else
40
164M
#  define NO_EDGE static_cast<int64_t>(0xffffffffffffffffll)
41
#endif /*__GNUC__*/
42
43
namespace tesseract {
44
45
class UNICHARSET;
46
47
using EDGE_RECORD = uint64_t;
48
using EDGE_ARRAY = EDGE_RECORD *;
49
using EDGE_REF = int64_t;
50
using NODE_REF = int64_t;
51
using NODE_MAP = EDGE_REF *;
52
53
struct NodeChild {
54
  UNICHAR_ID unichar_id;
55
  EDGE_REF edge_ref;
56
0
  NodeChild(UNICHAR_ID id, EDGE_REF ref) : unichar_id(id), edge_ref(ref) {}
57
0
  NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
58
};
59
60
using NodeChildVector = std::vector<NodeChild>;
61
using SuccessorList = std::vector<int>;
62
using SuccessorListsVector = std::vector<SuccessorList *>;
63
64
enum DawgType {
65
  DAWG_TYPE_PUNCTUATION,
66
  DAWG_TYPE_WORD,
67
  DAWG_TYPE_NUMBER,
68
  DAWG_TYPE_PATTERN,
69
70
  DAWG_TYPE_COUNT // number of enum entries
71
};
72
73
/*----------------------------------------------------------------------
74
              C o n s t a n t s
75
----------------------------------------------------------------------*/
76
77
7.03M
#define FORWARD_EDGE static_cast<int32_t>(0)
78
250
#define BACKWARD_EDGE static_cast<int32_t>(1)
79
0
#define MAX_NODE_EDGES_DISPLAY static_cast<int64_t>(100)
80
101M
#define MARKER_FLAG static_cast<int64_t>(1)
81
63
#define DIRECTION_FLAG static_cast<int64_t>(2)
82
72.2M
#define WERD_END_FLAG static_cast<int64_t>(4)
83
163M
#define LETTER_START_BIT 0
84
40
#define NUM_FLAG_BITS 3
85
0
#define REFFORMAT "%" PRId64
86
87
static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
88
    {false, true, true, false},   // for DAWG_TYPE_PUNCTUATION
89
    {true, false, false, false},  // for DAWG_TYPE_WORD
90
    {true, false, false, false},  // for DAWG_TYPE_NUMBER
91
    {false, false, false, false}, // for DAWG_TYPE_PATTERN
92
};
93
94
static const char kWildcard[] = "*";
95
96
/*----------------------------------------------------------------------
97
              C l a s s e s   a n d   S t r u c t s
98
----------------------------------------------------------------------*/
99
//
100
/// Abstract class (an interface) that declares methods needed by the
101
/// various tesseract classes to operate on SquishedDawg and Trie objects.
102
///
103
/// This class initializes all the edge masks (since their usage by
104
/// SquishedDawg and Trie is identical) and implements simple accessors
105
/// for each of the fields encoded in an EDGE_RECORD.
106
/// This class also implements word_in_dawg() and check_for_words()
107
/// (since they use only the public methods of SquishedDawg and Trie
108
/// classes that are inherited from the Dawg base class).
109
//
110
class TESS_API Dawg {
111
public:
112
  /// Magic number to determine endianness when reading the Dawg from file.
113
  static constexpr int16_t kDawgMagicNumber = 42;
114
  /// A special unichar id that indicates that any appropriate pattern
115
  /// (e.g.dictionary word, 0-9 digit, etc) can be inserted instead
116
  /// Used for expressing patterns in punctuation and number Dawgs.
117
  static const UNICHAR_ID kPatternUnicharID = 0;
118
119
32.1M
  inline DawgType type() const {
120
32.1M
    return type_;
121
32.1M
  }
122
136
  inline const std::string &lang() const {
123
136
    return lang_;
124
136
  }
125
5.82M
  inline PermuterType permuter() const {
126
5.82M
    return perm_;
127
5.82M
  }
128
129
  virtual ~Dawg();
130
131
  /// Returns true if the given word is in the Dawg.
132
  bool word_in_dawg(const WERD_CHOICE &word) const;
133
134
  // Returns true if the given word prefix is not contraindicated by the dawg.
135
  // If requires_complete is true, then the exact complete word must be present.
136
  bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
137
138
  /// Checks the Dawg for the words that are listed in the requested file.
139
  /// Returns the number of words in the given file missing from the Dawg.
140
  int check_for_words(const char *filename, const UNICHARSET &unicharset,
141
                      bool enable_wildcard) const;
142
143
  // For each word in the Dawg, call the given (permanent) callback with the
144
  // text (UTF-8) version of the word.
145
  void iterate_words(const UNICHARSET &unicharset,
146
                     std::function<void(const WERD_CHOICE *)> cb) const;
147
148
  // For each word in the Dawg, call the given (permanent) callback with the
149
  // text (UTF-8) version of the word.
150
  void iterate_words(const UNICHARSET &unicharset,
151
                     const std::function<void(const char *)> &cb) const;
152
153
  // Pure virtual function that should be implemented by the derived classes.
154
155
  /// Returns the edge that corresponds to the letter out of this node.
156
  virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
157
                                bool word_end) const = 0;
158
159
  /// Fills the given NodeChildVector with all the unichar ids (and the
160
  /// corresponding EDGE_REFs) for which there is an edge out of this node.
161
  virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
162
                              bool word_end) const = 0;
163
164
  /// Returns the next node visited by following the edge
165
  /// indicated by the given EDGE_REF.
166
  virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
167
168
  /// Returns true if the edge indicated by the given EDGE_REF
169
  /// marks the end of a word.
170
  virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
171
172
  /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
173
  virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
174
175
  /// Prints the contents of the node indicated by the given NODE_REF.
176
  /// At most max_num_edges will be printed.
177
  virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
178
179
  /// Fills vec with unichar ids that represent the character classes
180
  /// of the given unichar_id.
181
  virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
182
                                      const UNICHARSET &unicharset,
183
0
                                      std::vector<UNICHAR_ID> *vec) const {
184
0
    (void)unichar_id;
185
0
    (void)unicharset;
186
0
    (void)vec;
187
0
  }
188
189
  /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
190
  /// a self loop and the given unichar_id matches the unichar_id stored in the
191
  /// EDGE_RECORD, returns NO_EDGE otherwise.
192
  virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id,
193
0
                                     bool word_end) const {
194
0
    (void)edge_ref;
195
0
    (void)unichar_id;
196
0
    (void)word_end;
197
0
    return false;
198
0
  }
199
200
protected:
201
  Dawg(DawgType type, const std::string &lang, PermuterType perm,
202
       int debug_level)
203
20
      : lang_(lang),
204
20
        type_(type),
205
20
        perm_(perm),
206
20
        unicharset_size_(0),
207
20
        debug_level_(debug_level) {}
208
209
  /// Returns the next node visited by following this edge.
210
70.4M
  inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
211
70.4M
    return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
212
70.4M
  }
213
  /// Returns the marker flag of this edge.
214
0
  inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
215
0
    return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
216
0
  }
217
  /// Returns the direction flag of this edge.
218
16
  inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
219
16
    return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? BACKWARD_EDGE
220
16
                                                              : FORWARD_EDGE;
221
16
  }
222
  /// Returns true if this edge marks the end of a word.
223
72.2M
  inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
224
72.2M
    return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
225
72.2M
  }
226
  /// Returns UNICHAR_ID recorded in this edge.
227
  inline UNICHAR_ID unichar_id_from_edge_rec(
228
163M
      const EDGE_RECORD &edge_rec) const {
229
163M
    return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
230
163M
  }
231
  /// Sets the next node link for this edge in the Dawg.
232
0
  inline void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value) {
233
0
    *edge_rec &= (~next_node_mask_);
234
0
    *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
235
0
  }
236
  /// Sets this edge record to be the last one in a sequence of edges.
237
0
  inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
238
0
    *edge_rec |= (MARKER_FLAG << flag_start_bit_);
239
0
  }
240
  /// Sequentially compares the given values of unichar ID, next node
241
  /// and word end marker with the values in the given EDGE_RECORD.
242
  /// Returns: 1 if at any step the given input value exceeds
243
  ///            that of edge_rec (and all the values already
244
  ///            checked are the same)
245
  ///          0 if edge_rec_match() returns true
246
  ///         -1 otherwise
247
  inline int given_greater_than_edge_rec(NODE_REF next_node, bool word_end,
248
                                         UNICHAR_ID unichar_id,
249
61.2M
                                         const EDGE_RECORD &edge_rec) const {
250
61.2M
    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
251
61.2M
    NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
252
61.2M
    bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
253
61.2M
    if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
254
61.2M
                       curr_word_end, curr_unichar_id)) {
255
7.03M
      return 0;
256
7.03M
    }
257
54.2M
    if (unichar_id > curr_unichar_id) {
258
28.7M
      return 1;
259
28.7M
    }
260
25.4M
    if (unichar_id == curr_unichar_id) {
261
162k
      if (next_node > curr_next_node) {
262
0
        return 1;
263
0
      }
264
162k
      if (next_node == curr_next_node) {
265
0
        if (word_end > curr_word_end) {
266
0
          return 1;
267
0
        }
268
0
      }
269
162k
    }
270
25.4M
    return -1;
271
25.4M
  }
272
  /// Returns true if all the values are equal (any value matches
273
  /// next_node if next_node == NO_EDGE, any value matches word_end
274
  /// if word_end is false).
275
  inline bool edge_rec_match(NODE_REF next_node, bool word_end,
276
                             UNICHAR_ID unichar_id, NODE_REF other_next_node,
277
                             bool other_word_end,
278
61.3M
                             UNICHAR_ID other_unichar_id) const {
279
61.3M
    return ((unichar_id == other_unichar_id) &&
280
61.3M
            (next_node == NO_EDGE || next_node == other_next_node) &&
281
61.3M
            (!word_end || (word_end == other_word_end)));
282
61.3M
  }
283
284
  /// Sets unicharset_size_.
285
  /// Initializes the values of various masks from unicharset_size_.
286
  void init(int unicharset_size);
287
288
  /// Matches all of the words that are represented by this string.
289
  /// If wildcard is set to something other than INVALID_UNICHAR_ID,
290
  /// the *'s in this string are interpreted as wildcards.
291
  /// WERD_CHOICE param is not passed by const so that wildcard searches
292
  /// can modify it and work without having to copy WERD_CHOICEs.
293
  bool match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node,
294
                   UNICHAR_ID wildcard) const;
295
296
  // Recursively iterate over all words in a dawg (see public iterate_words).
297
  void iterate_words_rec(
298
      const WERD_CHOICE &word_so_far, NODE_REF to_explore,
299
      const std::function<void(const WERD_CHOICE *)> &cb) const;
300
301
  // Member Variables.
302
  std::string lang_;
303
  DawgType type_;
304
  /// Permuter code that should be used if the word is found in this Dawg.
305
  PermuterType perm_;
306
  // Variables to construct various edge masks. Formerly:
307
  // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64
308
  // #define FLAGS_MASK     (int64_t) 0x0000000700000000i64
309
  // #define LETTER_MASK    (int64_t) 0x00000000ffffffffi64
310
  uint64_t next_node_mask_ = 0;
311
  uint64_t flags_mask_ = 0;
312
  uint64_t letter_mask_ = 0;
313
  int unicharset_size_;
314
  int flag_start_bit_ = 0;
315
  int next_node_start_bit_ = 0;
316
  // Level of debug statements to print to stdout.
317
  int debug_level_;
318
};
319
320
//
321
// DawgPosition keeps track of where we are in the primary dawg we're searching
322
// as well as where we may be in the "punctuation dawg" which may provide
323
// surrounding context.
324
//
325
// Example:
326
//   punctuation dawg  -- space is the "pattern character"
327
//     " "     // no punctuation
328
//     "' '"   // leading and trailing apostrophes
329
//     " '"    // trailing apostrophe
330
//   word dawg:
331
//     "cat"
332
//     "cab"
333
//     "cat's"
334
//
335
//  DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
336
//
337
//  DawgPosition(-1, NO_EDGE, p, pe, false)
338
//    We're in the punctuation dawg, no other dawg has been started.
339
//    (1) If there's a pattern edge as a punc dawg child of us,
340
//        for each punc-following dawg starting with ch, produce:
341
//        Result: DawgPosition(k, w, p', false)
342
//    (2) If there's a valid continuation in the punc dawg, produce:
343
//        Result: DawgPosition(-k, NO_EDGE, p', false)
344
//
345
//  DawgPosition(k, w, -1, NO_EDGE, false)
346
//    We're in dawg k.  Going back to punctuation dawg is not an option.
347
//    Follow ch in dawg k.
348
//
349
//  DawgPosition(k, w, p, pe, false)
350
//    We're in dawg k.  Continue in dawg k and/or go back to the punc dawg.
351
//    If ending, check that the punctuation dawg is also ok to end here.
352
//
353
//  DawgPosition(k, w, p, pe true)
354
//    We're back in the punctuation dawg.  Continuing there is the only option.
355
struct DawgPosition {
356
  DawgPosition() = default;
357
  DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref,
358
               bool backtopunc)
359
5.58M
      : dawg_ref(dawgref),
360
5.58M
        punc_ref(puncref),
361
5.58M
        dawg_index(dawg_idx),
362
5.58M
        punc_index(punc_idx),
363
5.58M
        back_to_punc(backtopunc) {}
364
1.30M
  bool operator==(const DawgPosition &other) const {
365
1.30M
    return dawg_index == other.dawg_index && dawg_ref == other.dawg_ref &&
366
1.30M
           punc_index == other.punc_index && punc_ref == other.punc_ref &&
367
1.30M
           back_to_punc == other.back_to_punc;
368
1.30M
  }
369
370
  EDGE_REF dawg_ref = NO_EDGE;
371
  EDGE_REF punc_ref = NO_EDGE;
372
  int8_t dawg_index = -1;
373
  int8_t punc_index = -1;
374
  // Have we returned to the punc dawg at the end of the word?
375
  bool back_to_punc = false;
376
};
377
378
class DawgPositionVector : public std::vector<DawgPosition> {
379
public:
380
  /// Adds an entry for the given dawg_index with the given node to the vec.
381
  /// Returns false if the same entry already exists in the vector,
382
  /// true otherwise.
383
  inline bool add_unique(const DawgPosition &new_pos, bool debug,
384
3.22M
                         const char *debug_msg) {
385
3.22M
    for (auto &&position : *this) {
386
1.30M
      if (position == new_pos) {
387
0
        return false;
388
0
      }
389
1.30M
    }
390
3.22M
    push_back(new_pos);
391
3.22M
    if (debug) {
392
0
      tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n", debug_msg,
393
0
              new_pos.dawg_index, new_pos.dawg_ref, new_pos.punc_ref,
394
0
              new_pos.back_to_punc ? " returned" : "");
395
0
    }
396
3.22M
    return true;
397
3.22M
  }
398
};
399
400
//
401
/// Concrete class that can operate on a compacted (squished) Dawg (read,
402
/// search and write to file). This class is read-only in the sense that
403
/// new words cannot be added to an instance of SquishedDawg.
404
/// The underlying representation of the nodes and edges in SquishedDawg
405
/// is stored as a contiguous EDGE_ARRAY (read from file or given as an
406
/// argument to the constructor).
407
//
408
class TESS_API SquishedDawg : public Dawg {
409
public:
410
  SquishedDawg(DawgType type, const std::string &lang, PermuterType perm,
411
               int debug_level)
412
16
      : Dawg(type, lang, perm, debug_level) {}
413
  SquishedDawg(const char *filename, DawgType type, const std::string &lang,
414
               PermuterType perm, int debug_level)
415
0
      : Dawg(type, lang, perm, debug_level) {
416
0
    TFile file;
417
0
    ASSERT_HOST(file.Open(filename, nullptr));
418
0
    ASSERT_HOST(read_squished_dawg(&file));
419
0
    num_forward_edges_in_node0 = num_forward_edges(0);
420
0
  }
421
  SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
422
               const std::string &lang, PermuterType perm, int unicharset_size,
423
               int debug_level)
424
0
      : Dawg(type, lang, perm, debug_level),
425
0
        edges_(edges),
426
0
        num_edges_(num_edges) {
427
0
    init(unicharset_size);
428
0
    num_forward_edges_in_node0 = num_forward_edges(0);
429
0
    if (debug_level > 3) {
430
0
      print_all("SquishedDawg:");
431
0
    }
432
0
  }
433
  ~SquishedDawg() override;
434
435
  // Loads using the given TFile. Returns false on failure.
436
16
  bool Load(TFile *fp) {
437
16
    if (!read_squished_dawg(fp)) {
438
0
      return false;
439
0
    }
440
16
    num_forward_edges_in_node0 = num_forward_edges(0);
441
16
    return true;
442
16
  }
443
444
0
  int NumEdges() {
445
0
    return num_edges_;
446
0
  }
447
448
  /// Returns the edge that corresponds to the letter out of this node.
449
  EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
450
                        bool word_end) const override;
451
452
  /// Fills the given NodeChildVector with all the unichar ids (and the
453
  /// corresponding EDGE_REFs) for which there is an edge out of this node.
454
  void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
455
0
                      bool word_end) const override {
456
0
    EDGE_REF edge = node;
457
0
    if (!edge_occupied(edge) || edge == NO_EDGE) {
458
0
      return;
459
0
    }
460
0
    assert(forward_edge(edge)); // we don't expect any backward edges to
461
0
    do {                        // be present when this function is called
462
0
      if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
463
0
        vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
464
0
      }
465
0
    } while (!last_edge(edge++));
466
0
  }
467
468
  /// Returns the next node visited by following the edge
469
  /// indicated by the given EDGE_REF.
470
9.02M
  NODE_REF next_node(EDGE_REF edge) const override {
471
9.02M
    return next_node_from_edge_rec((edges_[edge]));
472
9.02M
  }
473
474
  /// Returns true if the edge indicated by the given EDGE_REF
475
  /// marks the end of a word.
476
10.8M
  bool end_of_word(EDGE_REF edge_ref) const override {
477
10.8M
    return end_of_word_from_edge_rec((edges_[edge_ref]));
478
10.8M
  }
479
480
  /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
481
0
  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
482
0
    return unichar_id_from_edge_rec((edges_[edge_ref]));
483
0
  }
484
485
  /// Prints the contents of the node indicated by the given NODE_REF.
486
  /// At most max_num_edges will be printed.
487
  void print_node(NODE_REF node, int max_num_edges) const override;
488
489
  /// Writes the squished/reduced Dawg to a file.
490
  bool write_squished_dawg(TFile *file);
491
492
  /// Opens the file with the given filename and writes the
493
  /// squished/reduced Dawg to the file.
494
0
  bool write_squished_dawg(const char *filename) {
495
0
    TFile file;
496
0
    file.OpenWrite(nullptr);
497
0
    if (!this->write_squished_dawg(&file)) {
498
0
      tprintf("Error serializing %s\n", filename);
499
0
      return false;
500
0
    }
501
0
    if (!file.CloseWrite(filename, nullptr)) {
502
0
      tprintf("Error writing file %s\n", filename);
503
0
      return false;
504
0
    }
505
0
    return true;
506
0
  }
507
508
private:
509
  /// Sets the next node link for this edge.
510
0
  inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
511
0
    set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
512
0
  }
513
  /// Sets the edge to be empty.
514
0
  inline void set_empty_edge(EDGE_REF edge_ref) {
515
0
    (edges_[edge_ref] = next_node_mask_);
516
0
  }
517
  /// Goes through all the edges and clears each one out.
518
0
  inline void clear_all_edges() {
519
0
    for (int edge = 0; edge < num_edges_; edge++) {
520
0
      set_empty_edge(edge);
521
0
    }
522
0
  }
523
  /// Clears the last flag of this edge.
524
0
  inline void clear_marker_flag(EDGE_REF edge_ref) {
525
0
    (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
526
0
  }
527
  /// Returns true if this edge is in the forward direction.
528
16
  inline bool forward_edge(EDGE_REF edge_ref) const {
529
16
    return (edge_occupied(edge_ref) &&
530
16
            (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
531
16
  }
532
  /// Returns true if this edge is in the backward direction.
533
0
  inline bool backward_edge(EDGE_REF edge_ref) const {
534
0
    return (edge_occupied(edge_ref) &&
535
0
            (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
536
0
  }
537
  /// Returns true if the edge spot in this location is occupied.
538
8.81M
  inline bool edge_occupied(EDGE_REF edge_ref) const {
539
8.81M
    return (edges_[edge_ref] != next_node_mask_);
540
8.81M
  }
541
  /// Returns true if this edge is the last edge in a sequence.
542
101M
  inline bool last_edge(EDGE_REF edge_ref) const {
543
101M
    return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
544
101M
  }
545
546
  /// Counts and returns the number of forward edges in this node.
547
  int32_t num_forward_edges(NODE_REF node) const;
548
549
  /// Reads SquishedDawg from a file.
550
  bool read_squished_dawg(TFile *file);
551
552
  /// Prints the contents of an edge indicated by the given EDGE_REF.
553
  void print_edge(EDGE_REF edge) const;
554
555
  /// Prints the contents of the SquishedDawg.
556
0
  void print_all(const char *msg) {
557
0
    tprintf("\n__________________________\n%s\n", msg);
558
0
    for (int i = 0; i < num_edges_; ++i) {
559
0
      print_edge(i);
560
0
    }
561
0
    tprintf("__________________________\n");
562
0
  }
563
  /// Constructs a mapping from the memory node indices to disk node indices.
564
  std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes) const;
565
566
  // Member variables.
567
  EDGE_ARRAY edges_ = nullptr;
568
  int32_t num_edges_ = 0;
569
  int num_forward_edges_in_node0 = 0;
570
};
571
572
} // namespace tesseract
573
574
#endif // DICT_DAWG_H_