/src/tesseract/src/ccutil/unicharmap.cpp
Line | Count | Source |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: unicharmap.cpp |
3 | | // Description: Unicode character/ligature to integer id class. |
4 | | // Author: Thomas Kielbus |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #include "unicharmap.h" |
20 | | |
21 | | #include <tesseract/unichar.h> |
22 | | |
23 | | #include <cassert> |
24 | | |
25 | | namespace tesseract { |
26 | | |
27 | 12 | UNICHARMAP::UNICHARMAP() : nodes(nullptr) {} |
28 | | |
29 | 4 | UNICHARMAP::~UNICHARMAP() { |
30 | 4 | delete[] nodes; |
31 | 4 | } |
32 | | |
33 | | // Search the given unichar representation in the tree, using length characters |
34 | | // from it maximum. Each character in the string is interpreted as an index in |
35 | | // an array of nodes. |
36 | 1.53M | UNICHAR_ID UNICHARMAP::unichar_to_id(const char *const unichar_repr, int length) const { |
37 | 1.53M | UNICHARMAP_NODE *current_nodes = nodes; |
38 | | |
39 | 1.53M | assert(*unichar_repr != '\0'); |
40 | 1.53M | assert(length > 0 && length <= UNICHAR_LEN); |
41 | | |
42 | 1.53M | int index = 0; |
43 | 1.53M | if (length <= 0 || unichar_repr[index] == '\0') { |
44 | 0 | return INVALID_UNICHAR_ID; |
45 | 0 | } |
46 | 2.98M | do { |
47 | 2.98M | if (index + 1 >= length || unichar_repr[index + 1] == '\0') { |
48 | 1.53M | return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id; |
49 | 1.53M | } |
50 | 1.44M | current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children; |
51 | 1.44M | ++index; |
52 | 1.44M | } while (true); |
53 | 1.53M | } |
54 | | |
55 | | // Search the given unichar representation in the tree, creating the possibly |
56 | | // missing nodes. Once the right place has been found, insert the given id and |
57 | | // update the inserted flag to keep track of the insert. Each character in the |
58 | | // string is interpreted as an index in an array of nodes. |
59 | 2.85k | void UNICHARMAP::insert(const char *const unichar_repr, UNICHAR_ID id) { |
60 | 2.85k | const char *current_char = unichar_repr; |
61 | 2.85k | if (*current_char == '\0') { |
62 | 0 | return; |
63 | 0 | } |
64 | 2.85k | UNICHARMAP_NODE **current_nodes_pointer = &nodes; |
65 | 10.4k | do { |
66 | 10.4k | if (*current_nodes_pointer == nullptr) { |
67 | 3.60k | *current_nodes_pointer = new UNICHARMAP_NODE[256]; |
68 | 3.60k | } |
69 | 10.4k | if (current_char[1] == '\0') { |
70 | 2.85k | (*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].id = id; |
71 | 2.85k | return; |
72 | 2.85k | } |
73 | 7.62k | current_nodes_pointer = |
74 | 7.62k | &((*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].children); |
75 | 7.62k | ++current_char; |
76 | 7.62k | } while (true); |
77 | 2.85k | } |
78 | | |
79 | | // Search the given unichar representation in the tree, using length characters |
80 | | // from it maximum. Each character in the string is interpreted as an index in |
81 | | // an array of nodes. Stop once the tree does not have anymore nodes or once we |
82 | | // found the right unichar_repr. |
83 | 2.06M | bool UNICHARMAP::contains(const char *const unichar_repr, int length) const { |
84 | 2.06M | if (unichar_repr == nullptr || *unichar_repr == '\0') { |
85 | 0 | return false; |
86 | 0 | } |
87 | 2.06M | if (length <= 0 || length > UNICHAR_LEN) { |
88 | 0 | return false; |
89 | 0 | } |
90 | 2.06M | int index = 0; |
91 | 2.06M | if (unichar_repr[index] == '\0') { |
92 | 0 | return false; |
93 | 0 | } |
94 | 2.06M | UNICHARMAP_NODE *current_nodes = nodes; |
95 | | |
96 | 4.95M | while (current_nodes != nullptr && index + 1 < length && unichar_repr[index + 1] != '\0') { |
97 | 2.89M | current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children; |
98 | 2.89M | ++index; |
99 | 2.89M | } |
100 | 2.06M | return current_nodes != nullptr && (index + 1 >= length || unichar_repr[index + 1] == '\0') && |
101 | 2.06M | current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0; |
102 | 2.06M | } |
103 | | |
104 | | // Return the minimum number of characters that must be used from this string |
105 | | // to obtain a match in the UNICHARMAP. |
106 | 613k | int UNICHARMAP::minmatch(const char *const unichar_repr) const { |
107 | 613k | const char *current_char = unichar_repr; |
108 | 613k | if (*current_char == '\0') { |
109 | 0 | return 0; |
110 | 0 | } |
111 | 613k | UNICHARMAP_NODE *current_nodes = nodes; |
112 | | |
113 | 615k | while (current_nodes != nullptr && *current_char != '\0') { |
114 | 614k | if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) { |
115 | 612k | return current_char + 1 - unichar_repr; |
116 | 612k | } |
117 | 1.89k | current_nodes = current_nodes[static_cast<unsigned char>(*current_char)].children; |
118 | 1.89k | ++current_char; |
119 | 1.89k | } |
120 | 1.33k | return 0; |
121 | 613k | } |
122 | | |
123 | 28 | void UNICHARMAP::clear() { |
124 | 28 | delete[] nodes; |
125 | 28 | nodes = nullptr; |
126 | 28 | } |
127 | | |
128 | 923k | UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(nullptr), id(-1) {} |
129 | | |
130 | | // Recursively delete the children |
131 | 72.7k | UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() { |
132 | 72.7k | delete[] children; |
133 | 72.7k | } |
134 | | |
135 | | } // namespace tesseract |