/src/tesseract/src/dict/trie.h

Source
/******************************************************************************
 *
 * File:        trie.h
 * Description: Functions to build a trie data structure.
 * Author:      Mark Seaman, SW Productivity
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *****************************************************************************/
#ifndef TRIE_H
#define TRIE_H

#include "dawg.h"

namespace tesseract {

class UNICHARSET;

// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
// max int32, we will need to change vector to use int64 for size
// and address indices. This does not seem to be needed immediately,
// since currently the largest number of edges limit used by tesseract
// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
// There are also int casts below to satisfy the WIN32 compiler that would
// need to be changed.
// It might be cleanest to change the types of most of the Trie/Dawg related
// typedefs to int and restrict the casts to extracting these values from
// the 64 bit EDGE_RECORD.
using EDGE_INDEX = int64_t; // index of an edge in a given node
using EDGE_VECTOR = std::vector<EDGE_RECORD>;

struct TRIE_NODE_RECORD {
  EDGE_VECTOR forward_edges;
  EDGE_VECTOR backward_edges;
};
using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;

/**
 * Concrete class for Trie data structure that allows to store a list of
 * words (extends Dawg base class) as well as dynamically add new words.
 * This class stores a vector of pointers to TRIE_NODE_RECORDs, each of
 * which has a vector of forward and backward edges.
 */
class TESS_API Trie : public Dawg {
public:
  enum RTLReversePolicy {
    RRP_DO_NO_REVERSE,
    RRP_REVERSE_IF_HAS_RTL,
    RRP_FORCE_REVERSE,
  };

  // Minimum number of concrete characters at the beginning of user patterns.
  static const int kSaneNumConcreteChars = 0;
  // Various unicode whitespace characters are used to denote unichar patterns,
  // (character classifier would never produce these whitespace characters as a
  // valid classification).
  static const char kAlphaPatternUnicode[];
  static const char kDigitPatternUnicode[];
  static const char kAlphanumPatternUnicode[];
  static const char kPuncPatternUnicode[];
  static const char kLowerPatternUnicode[];
  static const char kUpperPatternUnicode[];

  static const char *get_reverse_policy_name(RTLReversePolicy reverse_policy);

  // max_num_edges argument allows limiting the amount of memory this
  // Trie can consume (if a new word insert would cause the Trie to
  // contain more edges than max_num_edges, all the edges are cleared
  // so that new inserts can proceed).
  Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
      : Dawg(type, lang, perm, debug_level) {
    init(unicharset_size);
    deref_node_index_mask_ = ~letter_mask_;
    new_dawg_node(); // need to allocate node 0
  }
  ~Trie() override {
    for (auto node : nodes_) {
      delete node;
    }
  }

  // Reset the Trie to empty.
  void clear();

  /** Returns the edge that corresponds to the letter out of this node. */
  EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override {
    EDGE_RECORD *edge_ptr;
    EDGE_INDEX edge_index;
    if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
                      &edge_index)) {
      return NO_EDGE;
    }
    return make_edge_ref(node_ref, edge_index);
  }

  /**
   * Fills the given NodeChildVector with all the unichar ids (and the
   * corresponding EDGE_REFs) for which there is an edge out of this node.
   */
  void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override {
    const EDGE_VECTOR &forward_edges = nodes_[static_cast<int>(node)]->forward_edges;
    for (auto &edge : forward_edges) {
      if (!word_end || end_of_word_from_edge_rec(edge)) {
        vec->push_back(
            NodeChild(unichar_id_from_edge_rec(edge), make_edge_ref(node, &edge - &forward_edges[0])));
      }
    }
  }

  /**
   * Returns the next node visited by following the edge
   * indicated by the given EDGE_REF.
   */
  NODE_REF next_node(EDGE_REF edge_ref) const override {
    if (edge_ref == NO_EDGE || num_edges_ == 0) {
      return NO_EDGE;
    }
    return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
  }

  /**
   * Returns true if the edge indicated by the given EDGE_REF
   * marks the end of a word.
   */
  bool end_of_word(EDGE_REF edge_ref) const override {
    if (edge_ref == NO_EDGE || num_edges_ == 0) {
      return false;
    }
    return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
  }

  /** Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. */
  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
    if (edge_ref == NO_EDGE || num_edges_ == 0) {
      return INVALID_UNICHAR_ID;
    }
    return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
  }
  // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
  // the edge dead.
  void KillEdge(EDGE_RECORD *edge_rec) const {
    *edge_rec &= ~letter_mask_;
    *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
  }
  bool DeadEdge(const EDGE_RECORD &edge_rec) const {
    return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
  }

  // Prints the contents of the node indicated by the given NODE_REF.
  // At most max_num_edges will be printed.
  void print_node(NODE_REF node, int max_num_edges) const override;

  // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
  // Eliminates redundant edges and returns the pointer to the SquishedDawg.
  // Note: the caller is responsible for deallocating memory associated
  // with the returned SquishedDawg pointer.
  SquishedDawg *trie_to_dawg();

  // Reads a list of words from the given file and adds into the Trie.
  // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
  // policy and information in the unicharset.
  // Returns false on error.
  bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
                              Trie::RTLReversePolicy reverse);

  // Reads a list of words from the given file.
  // Returns false on error.
  bool read_word_list(const char *filename, std::vector<std::string> *words);
  // Adds a list of words previously read using read_word_list to the trie
  // using the given unicharset and reverse_policy to convert to unichar-ids.
  // Returns false on error.
  bool add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
                     Trie::RTLReversePolicy reverse_policy);

  // Inserts the list of patterns from the given file into the Trie.
  // The pattern list file should contain one pattern per line in UTF-8 format.
  //
  // Each pattern can contain any non-whitespace characters, however only the
  // patterns that contain characters from the unicharset of the corresponding
  // language will be useful.
  // The only meta character is '\'. To be used in a pattern as an ordinary
  // string it should be escaped with '\' (e.g. string "C:\Documents" should
  // be written in the patterns file as "C:\\Documents").
  // This function supports a very limited regular expression syntax. One can
  // express a character, a certain character class and a number of times the
  // entity should be repeated in the pattern.
  //
  // To denote a character class use one of:
  // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
  // \d - unichar for which UNICHARSET::get_isdigit() is true
  // \n - unichar for which UNICHARSET::get_isdigit() or
  //      UNICHARSET::isalpha() are true
  // \p - unichar for which UNICHARSET::get_ispunct() is true
  // \a - unichar for which UNICHARSET::get_islower() is true
  // \A - unichar for which UNICHARSET::get_isupper() is true
  //
  // \* could be specified after each character or pattern to indicate that
  // the character/pattern can be repeated any number of times before the next
  // character/pattern occurs.
  //
  // Examples:
  // 1-8\d\d-GOOG-411 will be expanded to strings:
  // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
  //
  // http://www.\n\*.com will be expanded to strings like:
  // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
  //
  // Note: In choosing which patterns to include please be aware of the fact
  // providing very generic patterns will make tesseract run slower.
  // For example \n\* at the beginning of the pattern will make Tesseract
  // consider all the combinations of proposed character choices for each
  // of the segmentations, which will be unacceptably slow.
  // Because of potential problems with speed that could be difficult to
  // identify, each user pattern has to have at least kSaneNumConcreteChars
  // concrete characters from the unicharset at the beginning.
  bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);

  // Initializes the values of *_pattern_ unichar ids.
  // This function should be called before calling read_pattern_list().
  void initialize_patterns(UNICHARSET *unicharset);

  // Fills in the given unichar id vector with the unichar ids that represent
  // the patterns of the character classes of the given unichar_id.
  void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
                              std::vector<UNICHAR_ID> *vec) const override;

  // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
  // a self loop and the given unichar_id matches the unichar_id stored in the
  // EDGE_RECORD, returns NO_EDGE otherwise.
  EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id,
                             bool word_end) const override {
    if (edge_ref == NO_EDGE) {
      return NO_EDGE;
    }
    EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
    return (marker_flag_from_edge_rec(*edge_rec) &&
            unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
            word_end == end_of_word_from_edge_rec(*edge_rec))
               ? edge_ref
               : NO_EDGE;
  }

  // Adds a word to the Trie (creates the necessary nodes and edges).
  //
  // If repetitions vector is not nullptr, each entry in the vector indicates
  // whether the unichar id with the corresponding index in the word is allowed
  // to repeat an unlimited number of times. For each entry that is true, MARKER
  // flag of the corresponding edge created for this unichar id is set to true).
  //
  // Return true if add succeeded, false otherwise (e.g. when a word contained
  // an invalid unichar id or the trie was getting too large and was cleared).
  bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
  bool add_word_to_dawg(const WERD_CHOICE &word) {
    return add_word_to_dawg(word, nullptr);
  }

protected:
  // The structure of an EDGE_REF for Trie edges is as follows:
  // [LETTER_START_BIT, flag_start_bit_):
  //                             edge index in *_edges in a TRIE_NODE_RECORD
  // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
  //
  // With this arrangement there are enough bits to represent edge indices
  // (each node can have at most unicharset_size_ forward edges and
  // the position of flag_start_bit is set to be log2(unicharset_size_)).
  // It is also possible to accommodate a maximum number of nodes that is at
  // least as large as that of the SquishedDawg representation (in SquishedDawg
  // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
  // the next node index).
  //

  // Returns the pointer to EDGE_RECORD after decoding the location
  // of the edge from the information in the given EDGE_REF.
  // This function assumes that EDGE_REF holds valid node/edge indices.
  inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
    int edge_index = static_cast<int>((edge_ref & letter_mask_) >> LETTER_START_BIT);
    int node_index = static_cast<int>((edge_ref & deref_node_index_mask_) >> flag_start_bit_);
    TRIE_NODE_RECORD *node_rec = nodes_[node_index];
    return &(node_rec->forward_edges[edge_index]);
  }
  /** Constructs EDGE_REF from the given node_index and edge_index. */
  inline EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const {
    return ((node_index << flag_start_bit_) | (edge_index << LETTER_START_BIT));
  }
  /** Sets up this edge record to the requested values. */
  inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end,
                        UNICHAR_ID unichar_id) {
    EDGE_RECORD flags = 0;
    if (repeats) {
      flags |= MARKER_FLAG;
    }
    if (word_end) {
      flags |= WERD_END_FLAG;
    }
    if (direction == BACKWARD_EDGE) {
      flags |= DIRECTION_FLAG;
    }
    *edge = ((nxt << next_node_start_bit_) | (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
             (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
  }
  /** Prints the given EDGE_RECORD. */
  inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
    tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
            marker_flag_from_edge_rec(edge_rec) ? "R," : "",
            (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
            end_of_word_from_edge_rec(edge_rec) ? ",E" : "", unichar_id_from_edge_rec(edge_rec));
  }
  // Returns true if the next node in recorded the given EDGE_RECORD
  // has exactly one forward edge.
  inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
    NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
    return (node_ref != NO_EDGE && nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
  }

  // Prints the contents of the Trie.
  // At most max_num_edges will be printed for each node.
  void print_all(const char *msg, int max_num_edges) {
    tprintf("\n__________________________\n%s\n", msg);
    for (size_t i = 0; i < nodes_.size(); ++i) {
      print_node(i, max_num_edges);
    }
    tprintf("__________________________\n");
  }

  // Finds the edge with the given direction, word_end and unichar_id
  // in the node indicated by node_ref. Fills in the pointer to the
  // EDGE_RECORD and the index of the edge with the values
  // corresponding to the edge found. Returns true if an edge was found.
  bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
                    UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;

  // Adds an single edge linkage between node1 and node2 in the direction
  // indicated by direction argument.
  bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end,
                        UNICHAR_ID unichar_id);

  // Adds forward edge linkage from node1 to node2 and the corresponding
  // backward edge linkage in the other direction.
  bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end,
                    UNICHAR_ID unichar_id) {
    return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, word_end, unichar_id) &&
            add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, word_end, unichar_id));
  }

  // Sets the word ending flags in an already existing edge pair.
  // Returns true on success.
  void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats,
                       UNICHAR_ID unichar_id);

  // Allocates space for a new node in the Trie.
  NODE_REF new_dawg_node();

  // Removes a single edge linkage to between node1 and node2 in the
  // direction indicated by direction argument.
  void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
                           UNICHAR_ID unichar_id);

  // Removes forward edge linkage from node1 to node2 and the corresponding
  // backward edge linkage in the other direction.
  void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) {
    remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
    remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
  }

  // Compares edge1 and edge2 in the given node to see if they point to two
  // next nodes that could be collapsed. If they do, performs the reduction
  // and returns true.
  bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2);

  // Assuming that edge_index indicates the first edge in a group of edges
  // in this node with a particular letter value, looks through these edges
  // to see if any of them can be collapsed. If so does it. Returns to the
  // caller when all edges with this letter have been reduced.
  // Returns true if further reduction is possible with this same letter.
  bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
                             EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes);

  /**
   * Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in
   * increasing order of unichar ids. This function is normally called
   * for all edges in a single node, and since number of edges in each node
   * is usually quite small, selection sort is used.
   */
  void sort_edges(EDGE_VECTOR *edges);

  /** Eliminates any redundant edges from this node in the Trie. */
  void reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes);

  // Returns the pattern unichar id for the given character class code.
  UNICHAR_ID character_class_to_pattern(char ch);

  // Member variables
  TRIE_NODES nodes_; // vector of nodes in the Trie
  // Freelist of edges in the root backwards node that were previously zeroed.
  std::vector<EDGE_INDEX> root_back_freelist_;
  uint64_t num_edges_ = 0;             // sum of all edges (forward and backward)
  uint64_t deref_direction_mask_ = 0;  // mask for EDGE_REF to extract direction
  uint64_t deref_node_index_mask_ = 0; // mask for EDGE_REF to extract node index
  // Variables for translating character class codes denoted in user patterns
  // file to the unichar ids used to represent them in a Trie.
  UNICHAR_ID alpha_pattern_ = 0;
  UNICHAR_ID digit_pattern_ = 0;
  UNICHAR_ID alphanum_pattern_ = 0;
  UNICHAR_ID punc_pattern_ = 0;
  UNICHAR_ID lower_pattern_ = 0;
  UNICHAR_ID upper_pattern_ = 0;
  bool initialized_patterns_ = false;
};

} // namespace tesseract

#endif

Coverage Report

Created: 2025-11-16 06:50

Line	Count	Source
1		/******************************************************************************
2		*
3		* File: trie.h
4		* Description: Functions to build a trie data structure.
5		* Author: Mark Seaman, SW Productivity
6		*
7		* (c) Copyright 1987, Hewlett-Packard Company.
8		** Licensed under the Apache License, Version 2.0 (the "License");
9		** you may not use this file except in compliance with the License.
10		** You may obtain a copy of the License at
11		** http://www.apache.org/licenses/LICENSE-2.0
12		** Unless required by applicable law or agreed to in writing, software
13		** distributed under the License is distributed on an "AS IS" BASIS,
14		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15		** See the License for the specific language governing permissions and
16		** limitations under the License.
17		*
18		*****************************************************************************/
19		#ifndef TRIE_H
20		#define TRIE_H
21
22		#include "dawg.h"
23
24		namespace tesseract {
25
26		class UNICHARSET;
27
28		// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
29		// max int32, we will need to change vector to use int64 for size
30		// and address indices. This does not seem to be needed immediately,
31		// since currently the largest number of edges limit used by tesseract
32		// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
33		// There are also int casts below to satisfy the WIN32 compiler that would
34		// need to be changed.
35		// It might be cleanest to change the types of most of the Trie/Dawg related
36		// typedefs to int and restrict the casts to extracting these values from
37		// the 64 bit EDGE_RECORD.
38		using EDGE_INDEX = int64_t; // index of an edge in a given node
39		using EDGE_VECTOR = std::vector<EDGE_RECORD>;
40
41		struct TRIE_NODE_RECORD {
42		EDGE_VECTOR forward_edges;
43		EDGE_VECTOR backward_edges;
44		};
45		using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;
46
47		/**
48		* Concrete class for Trie data structure that allows to store a list of
49		* words (extends Dawg base class) as well as dynamically add new words.
50		* This class stores a vector of pointers to TRIE_NODE_RECORDs, each of
51		* which has a vector of forward and backward edges.
52		*/
53		class TESS_API Trie : public Dawg {
54		public:
55		enum RTLReversePolicy {
56		RRP_DO_NO_REVERSE,
57		RRP_REVERSE_IF_HAS_RTL,
58		RRP_FORCE_REVERSE,
59		};
60
61		// Minimum number of concrete characters at the beginning of user patterns.
62		static const int kSaneNumConcreteChars = 0;
63		// Various unicode whitespace characters are used to denote unichar patterns,
64		// (character classifier would never produce these whitespace characters as a
65		// valid classification).
66		static const char kAlphaPatternUnicode[];
67		static const char kDigitPatternUnicode[];
68		static const char kAlphanumPatternUnicode[];
69		static const char kPuncPatternUnicode[];
70		static const char kLowerPatternUnicode[];
71		static const char kUpperPatternUnicode[];
72
73		static const char *get_reverse_policy_name(RTLReversePolicy reverse_policy);
74
75		// max_num_edges argument allows limiting the amount of memory this
76		// Trie can consume (if a new word insert would cause the Trie to
77		// contain more edges than max_num_edges, all the edges are cleared
78		// so that new inserts can proceed).
79		Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
80	8	: Dawg(type, lang, perm, debug_level) {
81	8	init(unicharset_size);
82	8	deref_node_index_mask_ = ~letter_mask_;
83	8	new_dawg_node(); // need to allocate node 0
84	8	}
85	0	~Trie() override {
86	0	for (auto node : nodes_) {
87	0	delete node;
88	0	}
89	0	}
90
91		// Reset the Trie to empty.
92		void clear();
93
94		/** Returns the edge that corresponds to the letter out of this node. */
95	3.04M	EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override {
96	3.04M	EDGE_RECORD *edge_ptr;
97	3.04M	EDGE_INDEX edge_index;
98	3.04M	if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
99	3.04M	&edge_index)) {
100	3.03M	return NO_EDGE;
101	3.03M	}
102	4.61k	return make_edge_ref(node_ref, edge_index);
103	3.04M	}
104
105		/**
106		* Fills the given NodeChildVector with all the unichar ids (and the
107		* corresponding EDGE_REFs) for which there is an edge out of this node.
108		*/
109	0	void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override {
110	0	const EDGE_VECTOR &forward_edges = nodes_[static_cast<int>(node)]->forward_edges;
111	0	for (auto &edge : forward_edges) {
112	0	if (!word_end \|\| end_of_word_from_edge_rec(edge)) {
113	0	vec->push_back(
114	0	NodeChild(unichar_id_from_edge_rec(edge), make_edge_ref(node, &edge - &forward_edges[0])));
115	0	}
116	0	}
117	0	}
118
119		/**
120		* Returns the next node visited by following the edge
121		* indicated by the given EDGE_REF.
122		*/
123	6.70k	NODE_REF next_node(EDGE_REF edge_ref) const override {
124	6.70k	if (edge_ref == NO_EDGE \|\| num_edges_ == 0) {
125	0	return NO_EDGE;
126	0	}
127	6.70k	return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
128	6.70k	}
129
130		/**
131		* Returns true if the edge indicated by the given EDGE_REF
132		* marks the end of a word.
133		*/
134	11.3k	bool end_of_word(EDGE_REF edge_ref) const override {
135	11.3k	if (edge_ref == NO_EDGE \|\| num_edges_ == 0) {
136	0	return false;
137	0	}
138	11.3k	return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
139	11.3k	}
140
141		/** Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. */
142	0	UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
143	0	if (edge_ref == NO_EDGE \|\| num_edges_ == 0) {
144	0	return INVALID_UNICHAR_ID;
145	0	}
146	0	return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
147	0	}
148		// Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
149		// the edge dead.
150	0	void KillEdge(EDGE_RECORD *edge_rec) const {
151	0	*edge_rec &= ~letter_mask_;
152	0	*edge_rec \|= (unicharset_size_ << LETTER_START_BIT);
153	0	}
154	0	bool DeadEdge(const EDGE_RECORD &edge_rec) const {
155	0	return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
156	0	}
157
158		// Prints the contents of the node indicated by the given NODE_REF.
159		// At most max_num_edges will be printed.
160		void print_node(NODE_REF node, int max_num_edges) const override;
161
162		// Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
163		// Eliminates redundant edges and returns the pointer to the SquishedDawg.
164		// Note: the caller is responsible for deallocating memory associated
165		// with the returned SquishedDawg pointer.
166		SquishedDawg *trie_to_dawg();
167
168		// Reads a list of words from the given file and adds into the Trie.
169		// Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
170		// policy and information in the unicharset.
171		// Returns false on error.
172		bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
173		Trie::RTLReversePolicy reverse);
174
175		// Reads a list of words from the given file.
176		// Returns false on error.
177		bool read_word_list(const char filename, std::vector<std::string> words);
178		// Adds a list of words previously read using read_word_list to the trie
179		// using the given unicharset and reverse_policy to convert to unichar-ids.
180		// Returns false on error.
181		bool add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
182		Trie::RTLReversePolicy reverse_policy);
183
184		// Inserts the list of patterns from the given file into the Trie.
185		// The pattern list file should contain one pattern per line in UTF-8 format.
186		//
187		// Each pattern can contain any non-whitespace characters, however only the
188		// patterns that contain characters from the unicharset of the corresponding
189		// language will be useful.
190		// The only meta character is '\'. To be used in a pattern as an ordinary
191		// string it should be escaped with '\' (e.g. string "C:\Documents" should
192		// be written in the patterns file as "C:\\Documents").
193		// This function supports a very limited regular expression syntax. One can
194		// express a character, a certain character class and a number of times the
195		// entity should be repeated in the pattern.
196		//
197		// To denote a character class use one of:
198		// \c - unichar for which UNICHARSET::get_isalpha() is true (character)
199		// \d - unichar for which UNICHARSET::get_isdigit() is true
200		// \n - unichar for which UNICHARSET::get_isdigit() or
201		// UNICHARSET::isalpha() are true
202		// \p - unichar for which UNICHARSET::get_ispunct() is true
203		// \a - unichar for which UNICHARSET::get_islower() is true
204		// \A - unichar for which UNICHARSET::get_isupper() is true
205		//
206		// \* could be specified after each character or pattern to indicate that
207		// the character/pattern can be repeated any number of times before the next
208		// character/pattern occurs.
209		//
210		// Examples:
211		// 1-8\d\d-GOOG-411 will be expanded to strings:
212		// 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
213		//
214		// http://www.\n\*.com will be expanded to strings like:
215		// http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
216		//
217		// Note: In choosing which patterns to include please be aware of the fact
218		// providing very generic patterns will make tesseract run slower.
219		// For example \n\* at the beginning of the pattern will make Tesseract
220		// consider all the combinations of proposed character choices for each
221		// of the segmentations, which will be unacceptably slow.
222		// Because of potential problems with speed that could be difficult to
223		// identify, each user pattern has to have at least kSaneNumConcreteChars
224		// concrete characters from the unicharset at the beginning.
225		bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
226
227		// Initializes the values of *_pattern_ unichar ids.
228		// This function should be called before calling read_pattern_list().
229		void initialize_patterns(UNICHARSET *unicharset);
230
231		// Fills in the given unichar id vector with the unichar ids that represent
232		// the patterns of the character classes of the given unichar_id.
233		void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
234		std::vector<UNICHAR_ID> *vec) const override;
235
236		// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
237		// a self loop and the given unichar_id matches the unichar_id stored in the
238		// EDGE_RECORD, returns NO_EDGE otherwise.
239		EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id,
240	0	bool word_end) const override {
241	0	if (edge_ref == NO_EDGE) {
242	0	return NO_EDGE;
243	0	}
244	0	EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
245	0	return (marker_flag_from_edge_rec(*edge_rec) &&
246	0	unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
247	0	word_end == end_of_word_from_edge_rec(*edge_rec))
248	0	? edge_ref
249	0	: NO_EDGE;
250	0	}
251
252		// Adds a word to the Trie (creates the necessary nodes and edges).
253		//
254		// If repetitions vector is not nullptr, each entry in the vector indicates
255		// whether the unichar id with the corresponding index in the word is allowed
256		// to repeat an unlimited number of times. For each entry that is true, MARKER
257		// flag of the corresponding edge created for this unichar id is set to true).
258		//
259		// Return true if add succeeded, false otherwise (e.g. when a word contained
260		// an invalid unichar id or the trie was getting too large and was cleared).
261		bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
262	10	bool add_word_to_dawg(const WERD_CHOICE &word) {
263	10	return add_word_to_dawg(word, nullptr);
264	10	}
265
266		protected:
267		// The structure of an EDGE_REF for Trie edges is as follows:
268		// [LETTER_START_BIT, flag_start_bit_):
269		// edge index in *_edges in a TRIE_NODE_RECORD
270		// [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
271		//
272		// With this arrangement there are enough bits to represent edge indices
273		// (each node can have at most unicharset_size_ forward edges and
274		// the position of flag_start_bit is set to be log2(unicharset_size_)).
275		// It is also possible to accommodate a maximum number of nodes that is at
276		// least as large as that of the SquishedDawg representation (in SquishedDawg
277		// each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
278		// the next node index).
279		//
280
281		// Returns the pointer to EDGE_RECORD after decoding the location
282		// of the edge from the information in the given EDGE_REF.
283		// This function assumes that EDGE_REF holds valid node/edge indices.
284	18.0k	inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
285	18.0k	int edge_index = static_cast<int>((edge_ref & letter_mask_) >> LETTER_START_BIT);
286	18.0k	int node_index = static_cast<int>((edge_ref & deref_node_index_mask_) >> flag_start_bit_);
287	18.0k	TRIE_NODE_RECORD *node_rec = nodes_[node_index];
288	18.0k	return &(node_rec->forward_edges[edge_index]);
289	18.0k	}
290		/** Constructs EDGE_REF from the given node_index and edge_index. */
291	4.61k	inline EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const {
292	4.61k	return ((node_index << flag_start_bit_) \| (edge_index << LETTER_START_BIT));
293	4.61k	}
294		/** Sets up this edge record to the requested values. */
295		inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end,
296	118	UNICHAR_ID unichar_id) {
297	118	EDGE_RECORD flags = 0;
298	118	if (repeats) {
299	0	flags \|= MARKER_FLAG;
300	0	}
301	118	if (word_end) {
302	18	flags \|= WERD_END_FLAG;
303	18	}
304	118	if (direction == BACKWARD_EDGE) {
305	59	flags \|= DIRECTION_FLAG;
306	59	}
307	118	*edge = ((nxt << next_node_start_bit_) \| (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) \|
308	118	(static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
309	118	}
310		/** Prints the given EDGE_RECORD. */
311	0	inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
312	0	tprintf("\|" REFFORMAT "\|%s%s%s\|%d\|", next_node_from_edge_rec(edge_rec),
313	0	marker_flag_from_edge_rec(edge_rec) ? "R," : "",
314	0	(direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
315	0	end_of_word_from_edge_rec(edge_rec) ? ",E" : "", unichar_id_from_edge_rec(edge_rec));
316	0	}
317		// Returns true if the next node in recorded the given EDGE_RECORD
318		// has exactly one forward edge.
319	0	inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
320	0	NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
321	0	return (node_ref != NO_EDGE && nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
322	0	}
323
324		// Prints the contents of the Trie.
325		// At most max_num_edges will be printed for each node.
326	0	void print_all(const char *msg, int max_num_edges) {
327	0	tprintf("\n__________________________\n%s\n", msg);
328	0	for (size_t i = 0; i < nodes_.size(); ++i) {
329	0	print_node(i, max_num_edges);
330	0	}
331	0	tprintf("__________________________\n");
332	0	}
333
334		// Finds the edge with the given direction, word_end and unichar_id
335		// in the node indicated by node_ref. Fills in the pointer to the
336		// EDGE_RECORD and the index of the edge with the values
337		// corresponding to the edge found. Returns true if an edge was found.
338		bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
339		UNICHAR_ID unichar_id, EDGE_RECORD *edge_ptr, EDGE_INDEX edge_index) const;
340
341		// Adds an single edge linkage between node1 and node2 in the direction
342		// indicated by direction argument.
343		bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end,
344		UNICHAR_ID unichar_id);
345
346		// Adds forward edge linkage from node1 to node2 and the corresponding
347		// backward edge linkage in the other direction.
348		bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end,
349	59	UNICHAR_ID unichar_id) {
350	59	return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, word_end, unichar_id) &&
351	59	add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, word_end, unichar_id));
352	59	}
353
354		// Sets the word ending flags in an already existing edge pair.
355		// Returns true on success.
356		void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats,
357		UNICHAR_ID unichar_id);
358
359		// Allocates space for a new node in the Trie.
360		NODE_REF new_dawg_node();
361
362		// Removes a single edge linkage to between node1 and node2 in the
363		// direction indicated by direction argument.
364		void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
365		UNICHAR_ID unichar_id);
366
367		// Removes forward edge linkage from node1 to node2 and the corresponding
368		// backward edge linkage in the other direction.
369	0	void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) {
370	0	remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
371	0	remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
372	0	}
373
374		// Compares edge1 and edge2 in the given node to see if they point to two
375		// next nodes that could be collapsed. If they do, performs the reduction
376		// and returns true.
377		bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2);
378
379		// Assuming that edge_index indicates the first edge in a group of edges
380		// in this node with a particular letter value, looks through these edges
381		// to see if any of them can be collapsed. If so does it. Returns to the
382		// caller when all edges with this letter have been reduced.
383		// Returns true if further reduction is possible with this same letter.
384		bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
385		EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes);
386
387		/**
388		* Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in
389		* increasing order of unichar ids. This function is normally called
390		* for all edges in a single node, and since number of edges in each node
391		* is usually quite small, selection sort is used.
392		*/
393		void sort_edges(EDGE_VECTOR *edges);
394
395		/** Eliminates any redundant edges from this node in the Trie. */
396		void reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes);
397
398		// Returns the pattern unichar id for the given character class code.
399		UNICHAR_ID character_class_to_pattern(char ch);
400
401		// Member variables
402		TRIE_NODES nodes_; // vector of nodes in the Trie
403		// Freelist of edges in the root backwards node that were previously zeroed.
404		std::vector<EDGE_INDEX> root_back_freelist_;
405		uint64_t num_edges_ = 0; // sum of all edges (forward and backward)
406		uint64_t deref_direction_mask_ = 0; // mask for EDGE_REF to extract direction
407		uint64_t deref_node_index_mask_ = 0; // mask for EDGE_REF to extract node index
408		// Variables for translating character class codes denoted in user patterns
409		// file to the unichar ids used to represent them in a Trie.
410		UNICHAR_ID alpha_pattern_ = 0;
411		UNICHAR_ID digit_pattern_ = 0;
412		UNICHAR_ID alphanum_pattern_ = 0;
413		UNICHAR_ID punc_pattern_ = 0;
414		UNICHAR_ID lower_pattern_ = 0;
415		UNICHAR_ID upper_pattern_ = 0;
416		bool initialized_patterns_ = false;
417		};
418
419		} // namespace tesseract
420
421		#endif