Coverage Report

Created: 2026-05-24 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/ccutil/unicharmap.cpp
Line
Count
Source
1
///////////////////////////////////////////////////////////////////////
2
// File:        unicharmap.cpp
3
// Description: Unicode character/ligature to integer id class.
4
// Author:      Thomas Kielbus
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#include "unicharmap.h"
20
21
#include <tesseract/unichar.h>
22
23
#include <cassert>
24
25
namespace tesseract {
26
27
12
UNICHARMAP::UNICHARMAP() : nodes(nullptr) {}
28
29
4
UNICHARMAP::~UNICHARMAP() {
30
4
  delete[] nodes;
31
4
}
32
33
// Search the given unichar representation in the tree, using length characters
34
// from it maximum. Each character in the string is interpreted as an index in
35
// an array of nodes.
36
1.53M
UNICHAR_ID UNICHARMAP::unichar_to_id(const char *const unichar_repr, int length) const {
37
1.53M
  UNICHARMAP_NODE *current_nodes = nodes;
38
39
1.53M
  assert(*unichar_repr != '\0');
40
1.53M
  assert(length > 0 && length <= UNICHAR_LEN);
41
42
1.53M
  int index = 0;
43
1.53M
  if (length <= 0 || unichar_repr[index] == '\0') {
44
0
    return INVALID_UNICHAR_ID;
45
0
  }
46
2.98M
  do {
47
2.98M
    if (index + 1 >= length || unichar_repr[index + 1] == '\0') {
48
1.53M
      return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
49
1.53M
    }
50
1.44M
    current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
51
1.44M
    ++index;
52
1.44M
  } while (true);
53
1.53M
}
54
55
// Search the given unichar representation in the tree, creating the possibly
56
// missing nodes. Once the right place has been found, insert the given id and
57
// update the inserted flag to keep track of the insert. Each character in the
58
// string is interpreted as an index in an array of nodes.
59
2.85k
void UNICHARMAP::insert(const char *const unichar_repr, UNICHAR_ID id) {
60
2.85k
  const char *current_char = unichar_repr;
61
2.85k
  if (*current_char == '\0') {
62
0
    return;
63
0
  }
64
2.85k
  UNICHARMAP_NODE **current_nodes_pointer = &nodes;
65
10.4k
  do {
66
10.4k
    if (*current_nodes_pointer == nullptr) {
67
3.60k
      *current_nodes_pointer = new UNICHARMAP_NODE[256];
68
3.60k
    }
69
10.4k
    if (current_char[1] == '\0') {
70
2.85k
      (*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].id = id;
71
2.85k
      return;
72
2.85k
    }
73
7.62k
    current_nodes_pointer =
74
7.62k
        &((*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].children);
75
7.62k
    ++current_char;
76
7.62k
  } while (true);
77
2.85k
}
78
79
// Search the given unichar representation in the tree, using length characters
80
// from it maximum. Each character in the string is interpreted as an index in
81
// an array of nodes. Stop once the tree does not have anymore nodes or once we
82
// found the right unichar_repr.
83
2.06M
bool UNICHARMAP::contains(const char *const unichar_repr, int length) const {
84
2.06M
  if (unichar_repr == nullptr || *unichar_repr == '\0') {
85
0
    return false;
86
0
  }
87
2.06M
  if (length <= 0 || length > UNICHAR_LEN) {
88
0
    return false;
89
0
  }
90
2.06M
  int index = 0;
91
2.06M
  if (unichar_repr[index] == '\0') {
92
0
    return false;
93
0
  }
94
2.06M
  UNICHARMAP_NODE *current_nodes = nodes;
95
96
4.95M
  while (current_nodes != nullptr && index + 1 < length && unichar_repr[index + 1] != '\0') {
97
2.89M
    current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
98
2.89M
    ++index;
99
2.89M
  }
100
2.06M
  return current_nodes != nullptr && (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
101
2.06M
         current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
102
2.06M
}
103
104
// Return the minimum number of characters that must be used from this string
105
// to obtain a match in the UNICHARMAP.
106
613k
int UNICHARMAP::minmatch(const char *const unichar_repr) const {
107
613k
  const char *current_char = unichar_repr;
108
613k
  if (*current_char == '\0') {
109
0
    return 0;
110
0
  }
111
613k
  UNICHARMAP_NODE *current_nodes = nodes;
112
113
615k
  while (current_nodes != nullptr && *current_char != '\0') {
114
614k
    if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) {
115
612k
      return current_char + 1 - unichar_repr;
116
612k
    }
117
1.89k
    current_nodes = current_nodes[static_cast<unsigned char>(*current_char)].children;
118
1.89k
    ++current_char;
119
1.89k
  }
120
1.33k
  return 0;
121
613k
}
122
123
28
void UNICHARMAP::clear() {
124
28
  delete[] nodes;
125
28
  nodes = nullptr;
126
28
}
127
128
923k
UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(nullptr), id(-1) {}
129
130
// Recursively delete the children
131
72.7k
UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
132
72.7k
  delete[] children;
133
72.7k
}
134
135
} // namespace tesseract