Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/src/ccutil/unichar.cpp
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        unichar.cpp
3
// Description: Unicode character/ligature class.
4
// Author:      Ray Smith
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#include <tesseract/unichar.h>
20
#include "errcode.h"
21
#include "tprintf.h"
22
23
0
#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
24
25
namespace tesseract {
26
27
// Construct from a utf8 string. If len<0 then the string is null terminated.
28
// If the string is too long to fit in the UNICHAR then it takes only what
29
// will fit. Checks for illegal input and stops at an illegal sequence.
30
// The resulting UNICHAR may be empty.
31
93.4M
UNICHAR::UNICHAR(const char *utf8_str, int len) {
32
93.4M
  int total_len = 0;
33
93.4M
  int step = 0;
34
93.4M
  if (len < 0) {
35
217M
    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
36
124M
      ;
37
124M
    }
38
93.4M
  }
39
187M
  for (total_len = 0; total_len < len; total_len += step) {
40
93.5M
    step = utf8_step(utf8_str + total_len);
41
93.5M
    if (total_len + step > UNICHAR_LEN) {
42
0
      break; // Too long.
43
0
    }
44
93.5M
    if (step == 0) {
45
0
      break; // Illegal first byte.
46
0
    }
47
93.5M
    int i;
48
124M
    for (i = 1; i < step; ++i) {
49
30.7M
      if ((utf8_str[total_len + i] & 0xc0) != 0x80) {
50
0
        break;
51
0
      }
52
30.7M
    }
53
93.5M
    if (i < step) {
54
0
      break; // Illegal surrogate
55
0
    }
56
93.5M
  }
57
93.4M
  memcpy(chars, utf8_str, total_len);
58
93.4M
  if (total_len < UNICHAR_LEN) {
59
93.4M
    chars[UNICHAR_LEN - 1] = total_len;
60
2.67G
    while (total_len < UNICHAR_LEN - 1) {
61
2.58G
      chars[total_len++] = 0;
62
2.58G
    }
63
93.4M
  }
64
93.4M
}
65
66
// Construct from a single UCS4 character. Illegal values are ignored,
67
// resulting in an empty UNICHAR.
68
0
UNICHAR::UNICHAR(int unicode) {
69
0
  const int bytemask = 0xBF;
70
0
  const int bytemark = 0x80;
71
72
0
  if (unicode < 0x80) {
73
0
    chars[UNICHAR_LEN - 1] = 1;
74
0
    chars[2] = 0;
75
0
    chars[1] = 0;
76
0
    chars[0] = static_cast<char>(unicode);
77
0
  } else if (unicode < 0x800) {
78
0
    chars[UNICHAR_LEN - 1] = 2;
79
0
    chars[2] = 0;
80
0
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81
0
    unicode >>= 6;
82
0
    chars[0] = static_cast<char>(unicode | 0xc0);
83
0
  } else if (unicode < 0x10000) {
84
0
    chars[UNICHAR_LEN - 1] = 3;
85
0
    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
86
0
    unicode >>= 6;
87
0
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
88
0
    unicode >>= 6;
89
0
    chars[0] = static_cast<char>(unicode | 0xe0);
90
0
  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
91
0
    chars[UNICHAR_LEN - 1] = 4;
92
0
    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
93
0
    unicode >>= 6;
94
0
    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
95
0
    unicode >>= 6;
96
0
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
97
0
    unicode >>= 6;
98
0
    chars[0] = static_cast<char>(unicode | 0xf0);
99
0
  } else {
100
0
    memset(chars, 0, UNICHAR_LEN);
101
0
  }
102
0
}
103
104
// Get the first character as UCS-4.
105
93.4M
int UNICHAR::first_uni() const {
106
93.4M
  static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};
107
93.4M
  int uni = 0;
108
93.4M
  int len = utf8_step(chars);
109
93.4M
  const char *src = chars;
110
111
93.4M
  switch (len) {
112
0
    default:
113
0
      break;
114
0
    case 4:
115
0
      uni += static_cast<unsigned char>(*src++);
116
0
      uni <<= 6;
117
      // Fall through.
118
14.4M
    case 3:
119
14.4M
      uni += static_cast<unsigned char>(*src++);
120
14.4M
      uni <<= 6;
121
      // Fall through.
122
16.3M
    case 2:
123
16.3M
      uni += static_cast<unsigned char>(*src++);
124
16.3M
      uni <<= 6;
125
      // Fall through.
126
93.4M
    case 1:
127
93.4M
      uni += static_cast<unsigned char>(*src++);
128
93.4M
  }
129
93.4M
  uni -= utf8_offsets[len];
130
93.4M
  return uni;
131
93.4M
}
132
133
// Get a terminated UTF8 string: Must delete[] it after use.
134
0
char *UNICHAR::utf8_str() const {
135
0
  int len = utf8_len();
136
0
  char *str = new char[len + 1];
137
0
  memcpy(str, chars, len);
138
0
  str[len] = 0;
139
0
  return str;
140
0
}
141
142
// Get the number of bytes in the first character of the given utf8 string.
143
187M
int UNICHAR::utf8_step(const char *utf8_str) {
144
187M
  static const char utf8_bytes[256] = {
145
187M
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
146
187M
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
147
187M
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148
187M
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
149
187M
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150
187M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151
187M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
152
187M
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
153
187M
      3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};
154
155
187M
  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
156
187M
}
157
158
0
// Advances to the start of the next character. An illegal first byte is
// reported and skipped as a single byte.
UNICHAR::const_iterator &UNICHAR::const_iterator::operator++() {
  ASSERT_HOST(it_ != nullptr);
  int step = utf8_step(it_);
  if (step == 0) {
    tprintf("ERROR: Illegal UTF8 encountered.\n");
    for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
      // Go through unsigned char so that bytes >= 0x80 print as two hex
      // digits instead of a sign-extended value.
      tprintf("Index %d char = 0x%x\n", i,
              static_cast<unsigned>(static_cast<unsigned char>(it_[i])));
    }
    step = 1;
  }
  // Never step across a NUL byte: a multi-byte sequence that is cut short by
  // the string terminator would otherwise move the iterator beyond the end of
  // the string, and a loop comparing against end() would never stop.
  int advance = 1;
  while (advance < step && it_[advance] != '\0') {
    ++advance;
  }
  it_ += advance;
  return *this;
}
171
172
0
// Returns the character at the current position as UCS-4, or a space (after
// printing a warning) if the first byte is illegal.
int UNICHAR::const_iterator::operator*() const {
  ASSERT_HOST(it_ != nullptr);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    UNICHAR current(it_, num_bytes);
    return current.first_uni();
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  return ' ';
}
182
183
0
int UNICHAR::const_iterator::get_utf8(char *utf8_output) const {
184
0
  ASSERT_HOST(it_ != nullptr);
185
0
  const int len = utf8_step(it_);
186
0
  if (len == 0) {
187
0
    tprintf("WARNING: Illegal UTF8 encountered\n");
188
0
    utf8_output[0] = ' ';
189
0
    return 1;
190
0
  }
191
0
  strncpy(utf8_output, it_, len);
192
0
  return len;
193
0
}
194
195
0
int UNICHAR::const_iterator::utf8_len() const {
196
0
  ASSERT_HOST(it_ != nullptr);
197
0
  const int len = utf8_step(it_);
198
0
  if (len == 0) {
199
0
    tprintf("WARNING: Illegal UTF8 encountered\n");
200
0
    return 1;
201
0
  }
202
0
  return len;
203
0
}
204
205
0
bool UNICHAR::const_iterator::is_legal() const {
206
0
  return utf8_step(it_) > 0;
207
0
}
208
209
0
// Returns an iterator positioned on the first byte of utf8_str. The len
// argument is not used here.
UNICHAR::const_iterator UNICHAR::begin(const char *utf8_str, int len) {
  UNICHAR::const_iterator first(utf8_str);
  return first;
}
212
213
0
// Returns an iterator positioned len bytes after the start of utf8_str.
UNICHAR::const_iterator UNICHAR::end(const char *utf8_str, int len) {
  const char *past_last = utf8_str + len;
  UNICHAR::const_iterator last(past_last);
  return last;
}
216
217
// Converts a utf-8 string to a vector of unicodes.
218
// Returns an empty vector if the input contains invalid UTF-8.
219
/* static */
220
0
std::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) {
221
0
  const int utf8_length = strlen(utf8_str);
222
0
  std::vector<char32> unicodes;
223
0
  unicodes.reserve(utf8_length);
224
0
  const_iterator end_it(end(utf8_str, utf8_length));
225
0
  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
226
0
    if (it.is_legal()) {
227
0
      unicodes.push_back(*it);
228
0
    } else {
229
0
      unicodes.clear();
230
0
      return unicodes;
231
0
    }
232
0
  }
233
0
  return unicodes;
234
0
}
235
236
// Returns an empty string if the input contains an invalid unicode.
237
0
std::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) {
238
0
  std::string utf8_str;
239
0
  for (char32 ch : str32) {
240
0
    UNICHAR uni_ch(ch);
241
0
    int step;
242
0
    if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
243
0
      utf8_str.append(uni_ch.utf8(), step);
244
0
    } else {
245
0
      return "";
246
0
    }
247
0
  }
248
0
  return utf8_str;
249
0
}
250
251
} // namespace tesseract