/src/tesseract/src/dict/dawg.h
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * File: dawg.h |
4 | | * Description: Definition of a class that represents Directed Acyclic Word |
5 | | * Graph (DAWG), functions to build and manipulate the DAWG. |
6 | | * Author: Mark Seaman, SW Productivity |
7 | | * |
8 | | * (c) Copyright 1987, Hewlett-Packard Company. |
9 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
10 | | ** you may not use this file except in compliance with the License. |
11 | | ** You may obtain a copy of the License at |
12 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
13 | | ** Unless required by applicable law or agreed to in writing, software |
14 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
15 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | ** See the License for the specific language governing permissions and |
17 | | ** limitations under the License. |
18 | | * |
19 | | *****************************************************************************/ |
20 | | |
21 | | #ifndef DICT_DAWG_H_ |
22 | | #define DICT_DAWG_H_ |
23 | | |
24 | | /*---------------------------------------------------------------------- |
25 | | I n c l u d e s |
26 | | ----------------------------------------------------------------------*/ |
27 | | |
28 | | #include <cinttypes> // for PRId64 |
29 | | #include <functional> // for std::function |
30 | | #include <memory> |
31 | | #include "elst.h" |
32 | | #include "params.h" |
33 | | #include "ratngs.h" |
34 | | |
35 | | #ifndef __GNUC__ |
36 | | # ifdef _WIN32 |
37 | | # define NO_EDGE static_cast<int64_t>(0xffffffffffffffffi64) |
38 | | # endif /*_WIN32*/ |
39 | | #else |
40 | 164M | # define NO_EDGE static_cast<int64_t>(0xffffffffffffffffll) |
41 | | #endif /*__GNUC__*/ |
42 | | |
43 | | namespace tesseract { |
44 | | |
45 | | class UNICHARSET; |
46 | | |
47 | | using EDGE_RECORD = uint64_t; |
48 | | using EDGE_ARRAY = EDGE_RECORD *; |
49 | | using EDGE_REF = int64_t; |
50 | | using NODE_REF = int64_t; |
51 | | using NODE_MAP = EDGE_REF *; |
52 | | |
53 | | struct NodeChild { |
54 | | UNICHAR_ID unichar_id; |
55 | | EDGE_REF edge_ref; |
56 | 0 | NodeChild(UNICHAR_ID id, EDGE_REF ref) : unichar_id(id), edge_ref(ref) {} |
57 | 0 | NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} |
58 | | }; |
59 | | |
60 | | using NodeChildVector = std::vector<NodeChild>; |
61 | | using SuccessorList = std::vector<int>; |
62 | | using SuccessorListsVector = std::vector<SuccessorList *>; |
63 | | |
64 | | enum DawgType { |
65 | | DAWG_TYPE_PUNCTUATION, |
66 | | DAWG_TYPE_WORD, |
67 | | DAWG_TYPE_NUMBER, |
68 | | DAWG_TYPE_PATTERN, |
69 | | |
70 | | DAWG_TYPE_COUNT // number of enum entries |
71 | | }; |
72 | | |
73 | | /*---------------------------------------------------------------------- |
74 | | C o n s t a n t s |
75 | | ----------------------------------------------------------------------*/ |
76 | | |
77 | 7.03M | #define FORWARD_EDGE static_cast<int32_t>(0) |
78 | 250 | #define BACKWARD_EDGE static_cast<int32_t>(1) |
79 | 0 | #define MAX_NODE_EDGES_DISPLAY static_cast<int64_t>(100) |
80 | 101M | #define MARKER_FLAG static_cast<int64_t>(1) |
81 | 63 | #define DIRECTION_FLAG static_cast<int64_t>(2) |
82 | 72.2M | #define WERD_END_FLAG static_cast<int64_t>(4) |
83 | 163M | #define LETTER_START_BIT 0 |
84 | 40 | #define NUM_FLAG_BITS 3 |
85 | 0 | #define REFFORMAT "%" PRId64 |
86 | | |
87 | | static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = { |
88 | | {false, true, true, false}, // for DAWG_TYPE_PUNCTUATION |
89 | | {true, false, false, false}, // for DAWG_TYPE_WORD |
90 | | {true, false, false, false}, // for DAWG_TYPE_NUMBER |
91 | | {false, false, false, false}, // for DAWG_TYPE_PATTERN |
92 | | }; |
93 | | |
94 | | static const char kWildcard[] = "*"; |
95 | | |
96 | | /*---------------------------------------------------------------------- |
97 | | C l a s s e s a n d S t r u c t s |
98 | | ----------------------------------------------------------------------*/ |
99 | | // |
100 | | /// Abstract class (an interface) that declares methods needed by the |
101 | | /// various tesseract classes to operate on SquishedDawg and Trie objects. |
102 | | /// |
103 | | /// This class initializes all the edge masks (since their usage by |
104 | | /// SquishedDawg and Trie is identical) and implements simple accessors |
105 | | /// for each of the fields encoded in an EDGE_RECORD. |
106 | | /// This class also implements word_in_dawg() and check_for_words() |
107 | | /// (since they use only the public methods of SquishedDawg and Trie |
108 | | /// classes that are inherited from the Dawg base class). |
109 | | // |
110 | | class TESS_API Dawg { |
111 | | public: |
112 | | /// Magic number to determine endianness when reading the Dawg from file. |
113 | | static constexpr int16_t kDawgMagicNumber = 42; |
114 | | /// A special unichar id that indicates that any appropriate pattern |
115 | | /// (e.g.dictionary word, 0-9 digit, etc) can be inserted instead |
116 | | /// Used for expressing patterns in punctuation and number Dawgs. |
117 | | static const UNICHAR_ID kPatternUnicharID = 0; |
118 | | |
119 | 32.1M | inline DawgType type() const { |
120 | 32.1M | return type_; |
121 | 32.1M | } |
122 | 136 | inline const std::string &lang() const { |
123 | 136 | return lang_; |
124 | 136 | } |
125 | 5.82M | inline PermuterType permuter() const { |
126 | 5.82M | return perm_; |
127 | 5.82M | } |
128 | | |
129 | | virtual ~Dawg(); |
130 | | |
131 | | /// Returns true if the given word is in the Dawg. |
132 | | bool word_in_dawg(const WERD_CHOICE &word) const; |
133 | | |
134 | | // Returns true if the given word prefix is not contraindicated by the dawg. |
135 | | // If requires_complete is true, then the exact complete word must be present. |
136 | | bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const; |
137 | | |
138 | | /// Checks the Dawg for the words that are listed in the requested file. |
139 | | /// Returns the number of words in the given file missing from the Dawg. |
140 | | int check_for_words(const char *filename, const UNICHARSET &unicharset, |
141 | | bool enable_wildcard) const; |
142 | | |
143 | | // For each word in the Dawg, call the given (permanent) callback with the |
144 | | // text (UTF-8) version of the word. |
145 | | void iterate_words(const UNICHARSET &unicharset, |
146 | | std::function<void(const WERD_CHOICE *)> cb) const; |
147 | | |
148 | | // For each word in the Dawg, call the given (permanent) callback with the |
149 | | // text (UTF-8) version of the word. |
150 | | void iterate_words(const UNICHARSET &unicharset, |
151 | | const std::function<void(const char *)> &cb) const; |
152 | | |
153 | | // Pure virtual function that should be implemented by the derived classes. |
154 | | |
155 | | /// Returns the edge that corresponds to the letter out of this node. |
156 | | virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, |
157 | | bool word_end) const = 0; |
158 | | |
159 | | /// Fills the given NodeChildVector with all the unichar ids (and the |
160 | | /// corresponding EDGE_REFs) for which there is an edge out of this node. |
161 | | virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, |
162 | | bool word_end) const = 0; |
163 | | |
164 | | /// Returns the next node visited by following the edge |
165 | | /// indicated by the given EDGE_REF. |
166 | | virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0; |
167 | | |
168 | | /// Returns true if the edge indicated by the given EDGE_REF |
169 | | /// marks the end of a word. |
170 | | virtual bool end_of_word(EDGE_REF edge_ref) const = 0; |
171 | | |
172 | | /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. |
173 | | virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0; |
174 | | |
175 | | /// Prints the contents of the node indicated by the given NODE_REF. |
176 | | /// At most max_num_edges will be printed. |
177 | | virtual void print_node(NODE_REF node, int max_num_edges) const = 0; |
178 | | |
179 | | /// Fills vec with unichar ids that represent the character classes |
180 | | /// of the given unichar_id. |
181 | | virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, |
182 | | const UNICHARSET &unicharset, |
183 | 0 | std::vector<UNICHAR_ID> *vec) const { |
184 | 0 | (void)unichar_id; |
185 | 0 | (void)unicharset; |
186 | 0 | (void)vec; |
187 | 0 | } |
188 | | |
189 | | /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has |
190 | | /// a self loop and the given unichar_id matches the unichar_id stored in the |
191 | | /// EDGE_RECORD, returns NO_EDGE otherwise. |
192 | | virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, |
193 | 0 | bool word_end) const { |
194 | 0 | (void)edge_ref; |
195 | 0 | (void)unichar_id; |
196 | 0 | (void)word_end; |
197 | 0 | return false; |
198 | 0 | } |
199 | | |
200 | | protected: |
201 | | Dawg(DawgType type, const std::string &lang, PermuterType perm, |
202 | | int debug_level) |
203 | 20 | : lang_(lang), |
204 | 20 | type_(type), |
205 | 20 | perm_(perm), |
206 | 20 | unicharset_size_(0), |
207 | 20 | debug_level_(debug_level) {} |
208 | | |
209 | | /// Returns the next node visited by following this edge. |
210 | 70.4M | inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { |
211 | 70.4M | return ((edge_rec & next_node_mask_) >> next_node_start_bit_); |
212 | 70.4M | } |
213 | | /// Returns the marker flag of this edge. |
214 | 0 | inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const { |
215 | 0 | return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0; |
216 | 0 | } |
217 | | /// Returns the direction flag of this edge. |
218 | 16 | inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const { |
219 | 16 | return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? BACKWARD_EDGE |
220 | 16 | : FORWARD_EDGE; |
221 | 16 | } |
222 | | /// Returns true if this edge marks the end of a word. |
223 | 72.2M | inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const { |
224 | 72.2M | return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0; |
225 | 72.2M | } |
226 | | /// Returns UNICHAR_ID recorded in this edge. |
227 | | inline UNICHAR_ID unichar_id_from_edge_rec( |
228 | 163M | const EDGE_RECORD &edge_rec) const { |
229 | 163M | return ((edge_rec & letter_mask_) >> LETTER_START_BIT); |
230 | 163M | } |
231 | | /// Sets the next node link for this edge in the Dawg. |
232 | 0 | inline void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value) { |
233 | 0 | *edge_rec &= (~next_node_mask_); |
234 | 0 | *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_); |
235 | 0 | } |
236 | | /// Sets this edge record to be the last one in a sequence of edges. |
237 | 0 | inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) { |
238 | 0 | *edge_rec |= (MARKER_FLAG << flag_start_bit_); |
239 | 0 | } |
240 | | /// Sequentially compares the given values of unichar ID, next node |
241 | | /// and word end marker with the values in the given EDGE_RECORD. |
242 | | /// Returns: 1 if at any step the given input value exceeds |
243 | | /// that of edge_rec (and all the values already |
244 | | /// checked are the same) |
245 | | /// 0 if edge_rec_match() returns true |
246 | | /// -1 otherwise |
247 | | inline int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, |
248 | | UNICHAR_ID unichar_id, |
249 | 61.2M | const EDGE_RECORD &edge_rec) const { |
250 | 61.2M | UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec); |
251 | 61.2M | NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec); |
252 | 61.2M | bool curr_word_end = end_of_word_from_edge_rec(edge_rec); |
253 | 61.2M | if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node, |
254 | 61.2M | curr_word_end, curr_unichar_id)) { |
255 | 7.03M | return 0; |
256 | 7.03M | } |
257 | 54.2M | if (unichar_id > curr_unichar_id) { |
258 | 28.7M | return 1; |
259 | 28.7M | } |
260 | 25.4M | if (unichar_id == curr_unichar_id) { |
261 | 162k | if (next_node > curr_next_node) { |
262 | 0 | return 1; |
263 | 0 | } |
264 | 162k | if (next_node == curr_next_node) { |
265 | 0 | if (word_end > curr_word_end) { |
266 | 0 | return 1; |
267 | 0 | } |
268 | 0 | } |
269 | 162k | } |
270 | 25.4M | return -1; |
271 | 25.4M | } |
272 | | /// Returns true if all the values are equal (any value matches |
273 | | /// next_node if next_node == NO_EDGE, any value matches word_end |
274 | | /// if word_end is false). |
275 | | inline bool edge_rec_match(NODE_REF next_node, bool word_end, |
276 | | UNICHAR_ID unichar_id, NODE_REF other_next_node, |
277 | | bool other_word_end, |
278 | 61.3M | UNICHAR_ID other_unichar_id) const { |
279 | 61.3M | return ((unichar_id == other_unichar_id) && |
280 | 61.3M | (next_node == NO_EDGE || next_node == other_next_node) && |
281 | 61.3M | (!word_end || (word_end == other_word_end))); |
282 | 61.3M | } |
283 | | |
284 | | /// Sets unicharset_size_. |
285 | | /// Initializes the values of various masks from unicharset_size_. |
286 | | void init(int unicharset_size); |
287 | | |
288 | | /// Matches all of the words that are represented by this string. |
289 | | /// If wildcard is set to something other than INVALID_UNICHAR_ID, |
290 | | /// the *'s in this string are interpreted as wildcards. |
291 | | /// WERD_CHOICE param is not passed by const so that wildcard searches |
292 | | /// can modify it and work without having to copy WERD_CHOICEs. |
293 | | bool match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, |
294 | | UNICHAR_ID wildcard) const; |
295 | | |
296 | | // Recursively iterate over all words in a dawg (see public iterate_words). |
297 | | void iterate_words_rec( |
298 | | const WERD_CHOICE &word_so_far, NODE_REF to_explore, |
299 | | const std::function<void(const WERD_CHOICE *)> &cb) const; |
300 | | |
301 | | // Member Variables. |
302 | | std::string lang_; |
303 | | DawgType type_; |
304 | | /// Permuter code that should be used if the word is found in this Dawg. |
305 | | PermuterType perm_; |
306 | | // Variables to construct various edge masks. Formerly: |
307 | | // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64 |
308 | | // #define FLAGS_MASK (int64_t) 0x0000000700000000i64 |
309 | | // #define LETTER_MASK (int64_t) 0x00000000ffffffffi64 |
310 | | uint64_t next_node_mask_ = 0; |
311 | | uint64_t flags_mask_ = 0; |
312 | | uint64_t letter_mask_ = 0; |
313 | | int unicharset_size_; |
314 | | int flag_start_bit_ = 0; |
315 | | int next_node_start_bit_ = 0; |
316 | | // Level of debug statements to print to stdout. |
317 | | int debug_level_; |
318 | | }; |
319 | | |
320 | | // |
321 | | // DawgPosition keeps track of where we are in the primary dawg we're searching |
322 | | // as well as where we may be in the "punctuation dawg" which may provide |
323 | | // surrounding context. |
324 | | // |
325 | | // Example: |
326 | | // punctuation dawg -- space is the "pattern character" |
327 | | // " " // no punctuation |
328 | | // "' '" // leading and trailing apostrophes |
329 | | // " '" // trailing apostrophe |
330 | | // word dawg: |
331 | | // "cat" |
332 | | // "cab" |
333 | | // "cat's" |
334 | | // |
335 | | // DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp) |
336 | | // |
337 | | // DawgPosition(-1, NO_EDGE, p, pe, false) |
338 | | // We're in the punctuation dawg, no other dawg has been started. |
339 | | // (1) If there's a pattern edge as a punc dawg child of us, |
340 | | // for each punc-following dawg starting with ch, produce: |
341 | | // Result: DawgPosition(k, w, p', false) |
342 | | // (2) If there's a valid continuation in the punc dawg, produce: |
343 | | // Result: DawgPosition(-k, NO_EDGE, p', false) |
344 | | // |
345 | | // DawgPosition(k, w, -1, NO_EDGE, false) |
346 | | // We're in dawg k. Going back to punctuation dawg is not an option. |
347 | | // Follow ch in dawg k. |
348 | | // |
349 | | // DawgPosition(k, w, p, pe, false) |
350 | | // We're in dawg k. Continue in dawg k and/or go back to the punc dawg. |
351 | | // If ending, check that the punctuation dawg is also ok to end here. |
352 | | // |
353 | | // DawgPosition(k, w, p, pe true) |
354 | | // We're back in the punctuation dawg. Continuing there is the only option. |
355 | | struct DawgPosition { |
356 | | DawgPosition() = default; |
357 | | DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref, |
358 | | bool backtopunc) |
359 | 5.58M | : dawg_ref(dawgref), |
360 | 5.58M | punc_ref(puncref), |
361 | 5.58M | dawg_index(dawg_idx), |
362 | 5.58M | punc_index(punc_idx), |
363 | 5.58M | back_to_punc(backtopunc) {} |
364 | 1.30M | bool operator==(const DawgPosition &other) const { |
365 | 1.30M | return dawg_index == other.dawg_index && dawg_ref == other.dawg_ref && |
366 | 1.30M | punc_index == other.punc_index && punc_ref == other.punc_ref && |
367 | 1.30M | back_to_punc == other.back_to_punc; |
368 | 1.30M | } |
369 | | |
370 | | EDGE_REF dawg_ref = NO_EDGE; |
371 | | EDGE_REF punc_ref = NO_EDGE; |
372 | | int8_t dawg_index = -1; |
373 | | int8_t punc_index = -1; |
374 | | // Have we returned to the punc dawg at the end of the word? |
375 | | bool back_to_punc = false; |
376 | | }; |
377 | | |
378 | | class DawgPositionVector : public std::vector<DawgPosition> { |
379 | | public: |
380 | | /// Adds an entry for the given dawg_index with the given node to the vec. |
381 | | /// Returns false if the same entry already exists in the vector, |
382 | | /// true otherwise. |
383 | | inline bool add_unique(const DawgPosition &new_pos, bool debug, |
384 | 3.22M | const char *debug_msg) { |
385 | 3.22M | for (auto &&position : *this) { |
386 | 1.30M | if (position == new_pos) { |
387 | 0 | return false; |
388 | 0 | } |
389 | 1.30M | } |
390 | 3.22M | push_back(new_pos); |
391 | 3.22M | if (debug) { |
392 | 0 | tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n", debug_msg, |
393 | 0 | new_pos.dawg_index, new_pos.dawg_ref, new_pos.punc_ref, |
394 | 0 | new_pos.back_to_punc ? " returned" : ""); |
395 | 0 | } |
396 | 3.22M | return true; |
397 | 3.22M | } |
398 | | }; |
399 | | |
400 | | // |
401 | | /// Concrete class that can operate on a compacted (squished) Dawg (read, |
402 | | /// search and write to file). This class is read-only in the sense that |
403 | | /// new words cannot be added to an instance of SquishedDawg. |
404 | | /// The underlying representation of the nodes and edges in SquishedDawg |
405 | | /// is stored as a contiguous EDGE_ARRAY (read from file or given as an |
406 | | /// argument to the constructor). |
407 | | // |
408 | | class TESS_API SquishedDawg : public Dawg { |
409 | | public: |
410 | | SquishedDawg(DawgType type, const std::string &lang, PermuterType perm, |
411 | | int debug_level) |
412 | 16 | : Dawg(type, lang, perm, debug_level) {} |
413 | | SquishedDawg(const char *filename, DawgType type, const std::string &lang, |
414 | | PermuterType perm, int debug_level) |
415 | 0 | : Dawg(type, lang, perm, debug_level) { |
416 | 0 | TFile file; |
417 | 0 | ASSERT_HOST(file.Open(filename, nullptr)); |
418 | 0 | ASSERT_HOST(read_squished_dawg(&file)); |
419 | 0 | num_forward_edges_in_node0 = num_forward_edges(0); |
420 | 0 | } |
421 | | SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, |
422 | | const std::string &lang, PermuterType perm, int unicharset_size, |
423 | | int debug_level) |
424 | 0 | : Dawg(type, lang, perm, debug_level), |
425 | 0 | edges_(edges), |
426 | 0 | num_edges_(num_edges) { |
427 | 0 | init(unicharset_size); |
428 | 0 | num_forward_edges_in_node0 = num_forward_edges(0); |
429 | 0 | if (debug_level > 3) { |
430 | 0 | print_all("SquishedDawg:"); |
431 | 0 | } |
432 | 0 | } |
433 | | ~SquishedDawg() override; |
434 | | |
435 | | // Loads using the given TFile. Returns false on failure. |
436 | 16 | bool Load(TFile *fp) { |
437 | 16 | if (!read_squished_dawg(fp)) { |
438 | 0 | return false; |
439 | 0 | } |
440 | 16 | num_forward_edges_in_node0 = num_forward_edges(0); |
441 | 16 | return true; |
442 | 16 | } |
443 | | |
444 | 0 | int NumEdges() { |
445 | 0 | return num_edges_; |
446 | 0 | } |
447 | | |
448 | | /// Returns the edge that corresponds to the letter out of this node. |
449 | | EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, |
450 | | bool word_end) const override; |
451 | | |
452 | | /// Fills the given NodeChildVector with all the unichar ids (and the |
453 | | /// corresponding EDGE_REFs) for which there is an edge out of this node. |
454 | | void unichar_ids_of(NODE_REF node, NodeChildVector *vec, |
455 | 0 | bool word_end) const override { |
456 | 0 | EDGE_REF edge = node; |
457 | 0 | if (!edge_occupied(edge) || edge == NO_EDGE) { |
458 | 0 | return; |
459 | 0 | } |
460 | 0 | assert(forward_edge(edge)); // we don't expect any backward edges to |
461 | 0 | do { // be present when this function is called |
462 | 0 | if (!word_end || end_of_word_from_edge_rec(edges_[edge])) { |
463 | 0 | vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); |
464 | 0 | } |
465 | 0 | } while (!last_edge(edge++)); |
466 | 0 | } |
467 | | |
468 | | /// Returns the next node visited by following the edge |
469 | | /// indicated by the given EDGE_REF. |
470 | 9.02M | NODE_REF next_node(EDGE_REF edge) const override { |
471 | 9.02M | return next_node_from_edge_rec((edges_[edge])); |
472 | 9.02M | } |
473 | | |
474 | | /// Returns true if the edge indicated by the given EDGE_REF |
475 | | /// marks the end of a word. |
476 | 10.8M | bool end_of_word(EDGE_REF edge_ref) const override { |
477 | 10.8M | return end_of_word_from_edge_rec((edges_[edge_ref])); |
478 | 10.8M | } |
479 | | |
480 | | /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. |
481 | 0 | UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override { |
482 | 0 | return unichar_id_from_edge_rec((edges_[edge_ref])); |
483 | 0 | } |
484 | | |
485 | | /// Prints the contents of the node indicated by the given NODE_REF. |
486 | | /// At most max_num_edges will be printed. |
487 | | void print_node(NODE_REF node, int max_num_edges) const override; |
488 | | |
489 | | /// Writes the squished/reduced Dawg to a file. |
490 | | bool write_squished_dawg(TFile *file); |
491 | | |
492 | | /// Opens the file with the given filename and writes the |
493 | | /// squished/reduced Dawg to the file. |
494 | 0 | bool write_squished_dawg(const char *filename) { |
495 | 0 | TFile file; |
496 | 0 | file.OpenWrite(nullptr); |
497 | 0 | if (!this->write_squished_dawg(&file)) { |
498 | 0 | tprintf("Error serializing %s\n", filename); |
499 | 0 | return false; |
500 | 0 | } |
501 | 0 | if (!file.CloseWrite(filename, nullptr)) { |
502 | 0 | tprintf("Error writing file %s\n", filename); |
503 | 0 | return false; |
504 | 0 | } |
505 | 0 | return true; |
506 | 0 | } |
507 | | |
508 | | private: |
509 | | /// Sets the next node link for this edge. |
510 | 0 | inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) { |
511 | 0 | set_next_node_in_edge_rec(&(edges_[edge_ref]), value); |
512 | 0 | } |
513 | | /// Sets the edge to be empty. |
514 | 0 | inline void set_empty_edge(EDGE_REF edge_ref) { |
515 | 0 | (edges_[edge_ref] = next_node_mask_); |
516 | 0 | } |
517 | | /// Goes through all the edges and clears each one out. |
518 | 0 | inline void clear_all_edges() { |
519 | 0 | for (int edge = 0; edge < num_edges_; edge++) { |
520 | 0 | set_empty_edge(edge); |
521 | 0 | } |
522 | 0 | } |
523 | | /// Clears the last flag of this edge. |
524 | 0 | inline void clear_marker_flag(EDGE_REF edge_ref) { |
525 | 0 | (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_)); |
526 | 0 | } |
527 | | /// Returns true if this edge is in the forward direction. |
528 | 16 | inline bool forward_edge(EDGE_REF edge_ref) const { |
529 | 16 | return (edge_occupied(edge_ref) && |
530 | 16 | (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); |
531 | 16 | } |
532 | | /// Returns true if this edge is in the backward direction. |
533 | 0 | inline bool backward_edge(EDGE_REF edge_ref) const { |
534 | 0 | return (edge_occupied(edge_ref) && |
535 | 0 | (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); |
536 | 0 | } |
537 | | /// Returns true if the edge spot in this location is occupied. |
538 | 8.81M | inline bool edge_occupied(EDGE_REF edge_ref) const { |
539 | 8.81M | return (edges_[edge_ref] != next_node_mask_); |
540 | 8.81M | } |
541 | | /// Returns true if this edge is the last edge in a sequence. |
542 | 101M | inline bool last_edge(EDGE_REF edge_ref) const { |
543 | 101M | return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0; |
544 | 101M | } |
545 | | |
546 | | /// Counts and returns the number of forward edges in this node. |
547 | | int32_t num_forward_edges(NODE_REF node) const; |
548 | | |
549 | | /// Reads SquishedDawg from a file. |
550 | | bool read_squished_dawg(TFile *file); |
551 | | |
552 | | /// Prints the contents of an edge indicated by the given EDGE_REF. |
553 | | void print_edge(EDGE_REF edge) const; |
554 | | |
555 | | /// Prints the contents of the SquishedDawg. |
556 | 0 | void print_all(const char *msg) { |
557 | 0 | tprintf("\n__________________________\n%s\n", msg); |
558 | 0 | for (int i = 0; i < num_edges_; ++i) { |
559 | 0 | print_edge(i); |
560 | 0 | } |
561 | 0 | tprintf("__________________________\n"); |
562 | 0 | } |
563 | | /// Constructs a mapping from the memory node indices to disk node indices. |
564 | | std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes) const; |
565 | | |
566 | | // Member variables. |
567 | | EDGE_ARRAY edges_ = nullptr; |
568 | | int32_t num_edges_ = 0; |
569 | | int num_forward_edges_in_node0 = 0; |
570 | | }; |
571 | | |
572 | | } // namespace tesseract |
573 | | |
574 | | #endif // DICT_DAWG_H_ |