/src/tesseract/src/ccutil/unichar.cpp

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        unichar.cpp
// Description: Unicode character/ligature class.
// Author:      Ray Smith
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include <tesseract/unichar.h>
#include "errcode.h"
#include "tprintf.h"

#define UNI_MAX_LEGAL_UTF32 0x0010FFFF

namespace tesseract {

// Construct from a utf8 string.
// If len<0 the string is null terminated and at most UNICHAR_LEN bytes of it
// are examined. Otherwise len is the number of bytes available at utf8_str,
// and nothing at or beyond utf8_str[len] is read or copied.
// Only whole characters are taken: copying stops before the first character
// that would not fit in UNICHAR_LEN bytes, that is cut short by len, that
// starts with an illegal lead byte, or that has a bad continuation byte.
// The resulting UNICHAR may be empty.
UNICHAR::UNICHAR(const char *utf8_str, int len) {
  if (len < 0) {
    // Measure the null-terminated input, but never look beyond UNICHAR_LEN.
    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
      ;
    }
  }
  int total_len = 0;
  while (total_len < len) {
    const int step = utf8_step(utf8_str + total_len);
    if (step == 0) {
      break; // Illegal first byte.
    }
    if (total_len + step > UNICHAR_LEN) {
      break; // Too long to store.
    }
    if (total_len + step > len) {
      // The character is truncated by len. Checking or copying its
      // continuation bytes would read outside the caller's buffer.
      break;
    }
    int i;
    for (i = 1; i < step; ++i) {
      // Every byte after the lead byte must have the form 10xxxxxx.
      if ((utf8_str[total_len + i] & 0xc0) != 0x80) {
        break;
      }
    }
    if (i < step) {
      break; // Illegal continuation byte.
    }
    total_len += step;
  }
  if (total_len > 0) {
    memcpy(chars, utf8_str, total_len);
  }
  if (total_len < UNICHAR_LEN) {
    // There is spare room: the last byte stores the length and everything
    // between the data and the length byte is zeroed. When the data fills
    // all UNICHAR_LEN bytes the last byte is a genuine character instead.
    chars[UNICHAR_LEN - 1] = total_len;
    while (total_len < UNICHAR_LEN - 1) {
      chars[total_len++] = 0;
    }
  }
}

// Construct from a single UCS4 character. Illegal values (negative, or
// greater than UNI_MAX_LEGAL_UTF32) are ignored, resulting in an empty
// UNICHAR.
UNICHAR::UNICHAR(int unicode) {
  // (x | bytemark) & bytemask, once narrowed to a char, keeps the low six
  // bits of x and puts the 10xxxxxx continuation marker on top of them.
  const int bytemask = 0xBF;
  const int bytemark = 0x80;

  // Start from an all-zero (empty) value so that every byte that is not part
  // of the encoded character, including the length byte, is well defined.
  memset(chars, 0, UNICHAR_LEN);
  if (unicode < 0) {
    // Negative values are not code points. Without this check they would fall
    // into the single-byte branch below and be stored as an arbitrary byte.
    return;
  }

  if (unicode < 0x80) {
    chars[UNICHAR_LEN - 1] = 1;
    chars[0] = static_cast<char>(unicode);
  } else if (unicode < 0x800) {
    chars[UNICHAR_LEN - 1] = 2;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xc0);
  } else if (unicode < 0x10000) {
    chars[UNICHAR_LEN - 1] = 3;
    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xe0);
  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
    chars[UNICHAR_LEN - 1] = 4;
    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xf0);
  }
  // Anything above UNI_MAX_LEGAL_UTF32 leaves the zero-filled, empty value.
}

// Get the first character as UCS-4.
// The raw bytes of the first character are accumulated six bits at a time,
// then the contribution of the UTF-8 marker bits is subtracted in one go.
int UNICHAR::first_uni() const {
  // Entry n is what the marker bits of an n-byte sequence add up to after the
  // accumulation below, e.g. for 2 bytes: (0xC0 << 6) + 0x80 == 0x3080.
  static const int marker_totals[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};
  const int num_bytes = utf8_step(chars);
  int code = 0;
  for (int i = 0; i < num_bytes; ++i) {
    code = (code << 6) + static_cast<unsigned char>(chars[i]);
  }
  return code - marker_totals[num_bytes];
}

// Get a null-terminated copy of the UTF8 contents.
// The copy is allocated with new[]: the caller must delete[] it after use.
char *UNICHAR::utf8_str() const {
  const int num_bytes = utf8_len();
  char *copy = new char[num_bytes + 1];
  copy[num_bytes] = '\0';
  memcpy(copy, chars, num_bytes);
  return copy;
}

// Get the number of bytes in the first character of the given utf8 string.
// Only the first byte is inspected. The result is 0 when that byte cannot
// start a character.
int UNICHAR::utf8_step(const char *utf8_str) {
  // Sequence length indexed by the value of the first byte, 16 per row.
  static const char lead_byte_length[256] = {
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x0F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10-0x1F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20-0x2F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30-0x3F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40-0x4F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50-0x5F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60-0x6F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70-0x7F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80-0x8F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90-0x9F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0-0xAF
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0-0xBF
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0-0xCF
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0-0xDF
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0-0xEF
      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0-0xFF
  };

  const unsigned char lead = static_cast<unsigned char>(*utf8_str);
  return lead_byte_length[lead];
}

// Advances to the start of the next character. On an illegal first byte an
// error and up to 5 of the following bytes are printed, and the iterator
// moves on by a single byte.
UNICHAR::const_iterator &UNICHAR::const_iterator::operator++() {
  ASSERT_HOST(it_ != nullptr);
  int step = utf8_step(it_);
  if (step == 0) {
    tprintf("ERROR: Illegal UTF8 encountered.\n");
    for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
      // Go through unsigned char so that bytes >= 0x80 print as two hex
      // digits instead of being sign-extended where plain char is signed.
      tprintf("Index %d char = 0x%x\n", i,
              static_cast<unsigned int>(static_cast<unsigned char>(it_[i])));
    }
    step = 1;
  }
  it_ += step;
  return *this;
}

// Returns the current character as UCS-4. If the current byte cannot start
// a character, a warning is printed and a space is returned instead.
int UNICHAR::const_iterator::operator*() const {
  ASSERT_HOST(it_ != nullptr);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    const UNICHAR current(it_, num_bytes);
    return current.first_uni();
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  return ' ';
}

// Copies the bytes of the current character to utf8_output with strncpy and
// returns how many bytes were written (at most 4). If the current byte cannot
// start a character, a warning is printed, a single space is written and 1 is
// returned.
int UNICHAR::const_iterator::get_utf8(char *utf8_output) const {
  ASSERT_HOST(it_ != nullptr);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    strncpy(utf8_output, it_, num_bytes);
    return num_bytes;
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  utf8_output[0] = ' ';
  return 1;
}

// Returns the number of bytes in the current character. If the current byte
// cannot start a character, a warning is printed and 1 is returned.
int UNICHAR::const_iterator::utf8_len() const {
  ASSERT_HOST(it_ != nullptr);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    return num_bytes;
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  return 1;
}

// Returns true if the current byte can start a character. Only that first
// byte is examined.
bool UNICHAR::const_iterator::is_legal() const {
  const int num_bytes = utf8_step(it_);
  return num_bytes != 0;
}

// Returns an iterator positioned on the first byte of utf8_str.
// The len argument is not used here.
UNICHAR::const_iterator UNICHAR::begin(const char *utf8_str, int len) {
  UNICHAR::const_iterator first(utf8_str);
  return first;
}

// Returns an iterator positioned len bytes past the start of utf8_str.
UNICHAR::const_iterator UNICHAR::end(const char *utf8_str, int len) {
  const char *past_last = utf8_str + len;
  UNICHAR::const_iterator last(past_last);
  return last;
}

// Converts a utf-8 string to a vector of unicodes.
// Returns an empty vector if the input contains invalid UTF-8.
/* static */
std::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) {
  const int utf8_length = strlen(utf8_str);
  std::vector<char32> unicodes;
  unicodes.reserve(utf8_length);
  const_iterator end_it(end(utf8_str, utf8_length));
  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
    if (it.is_legal()) {
      unicodes.push_back(*it);
    } else {
      unicodes.clear();
      return unicodes;
    }
  }
  return unicodes;
}

// Converts a vector of unicodes to a utf-8 string.
// Returns an empty string if the input contains an invalid unicode, i.e. a
// value that is negative or greater than UNI_MAX_LEGAL_UTF32.
std::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) {
  std::string utf8_str;
  for (char32 ch : str32) {
    // Check the range before encoding: only the low byte of a negative value
    // would be stored, which could pass for a legitimate lead byte.
    const int code = static_cast<int>(ch);
    if (code < 0 || code > UNI_MAX_LEGAL_UTF32) {
      return std::string();
    }
    UNICHAR uni_ch(code);
    const int step = uni_ch.utf8_len() > 0 ? utf8_step(uni_ch.utf8()) : 0;
    if (step <= 0) {
      return std::string();
    }
    utf8_str.append(uni_ch.utf8(), step);
  }
  return utf8_str;
}

} // namespace tesseract

Coverage Report

Created: 2024-02-28 06:46

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: unichar.cpp
3		// Description: Unicode character/ligature class.
4		// Author: Ray Smith
5		//
6		// (C) Copyright 2006, Google Inc.
7		// Licensed under the Apache License, Version 2.0 (the "License");
8		// you may not use this file except in compliance with the License.
9		// You may obtain a copy of the License at
10		// http://www.apache.org/licenses/LICENSE-2.0
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS,
13		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		// See the License for the specific language governing permissions and
15		// limitations under the License.
16		//
17		///////////////////////////////////////////////////////////////////////
18
19		#include <tesseract/unichar.h>
20		#include "errcode.h"
21		#include "tprintf.h"
22
23	0	#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
24
25		namespace tesseract {
26
27		// Construct from a utf8 string. If len<0 then the string is null terminated.
28		// If the string is too long to fit in the UNICHAR then it takes only what
29		// will fit. Checks for illegal input and stops at an illegal sequence.
30		// The resulting UNICHAR may be empty.
31	93.4M	UNICHAR::UNICHAR(const char *utf8_str, int len) {
32	93.4M	int total_len = 0;
33	93.4M	int step = 0;
34	93.4M	if (len < 0) {
35	217M	for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
36	124M	;
37	124M	}
38	93.4M	}
39	187M	for (total_len = 0; total_len < len; total_len += step) {
40	93.5M	step = utf8_step(utf8_str + total_len);
41	93.5M	if (total_len + step > UNICHAR_LEN) {
42	0	break; // Too long.
43	0	}
44	93.5M	if (step == 0) {
45	0	break; // Illegal first byte.
46	0	}
47	93.5M	int i;
48	124M	for (i = 1; i < step; ++i) {
49	30.7M	if ((utf8_str[total_len + i] & 0xc0) != 0x80) {
50	0	break;
51	0	}
52	30.7M	}
53	93.5M	if (i < step) {
54	0	break; // Illegal surrogate
55	0	}
56	93.5M	}
57	93.4M	memcpy(chars, utf8_str, total_len);
58	93.4M	if (total_len < UNICHAR_LEN) {
59	93.4M	chars[UNICHAR_LEN - 1] = total_len;
60	2.67G	while (total_len < UNICHAR_LEN - 1) {
61	2.58G	chars[total_len++] = 0;
62	2.58G	}
63	93.4M	}
64	93.4M	}
65
66		// Construct from a single UCS4 character. Illegal values are ignored,
67		// resulting in an empty UNICHAR.
68	0	UNICHAR::UNICHAR(int unicode) {
69	0	const int bytemask = 0xBF;
70	0	const int bytemark = 0x80;
71
72	0	if (unicode < 0x80) {
73	0	chars[UNICHAR_LEN - 1] = 1;
74	0	chars[2] = 0;
75	0	chars[1] = 0;
76	0	chars[0] = static_cast<char>(unicode);
77	0	} else if (unicode < 0x800) {
78	0	chars[UNICHAR_LEN - 1] = 2;
79	0	chars[2] = 0;
80	0	chars[1] = static_cast<char>((unicode \| bytemark) & bytemask);
81	0	unicode >>= 6;
82	0	chars[0] = static_cast<char>(unicode \| 0xc0);
83	0	} else if (unicode < 0x10000) {
84	0	chars[UNICHAR_LEN - 1] = 3;
85	0	chars[2] = static_cast<char>((unicode \| bytemark) & bytemask);
86	0	unicode >>= 6;
87	0	chars[1] = static_cast<char>((unicode \| bytemark) & bytemask);
88	0	unicode >>= 6;
89	0	chars[0] = static_cast<char>(unicode \| 0xe0);
90	0	} else if (unicode <= UNI_MAX_LEGAL_UTF32) {
91	0	chars[UNICHAR_LEN - 1] = 4;
92	0	chars[3] = static_cast<char>((unicode \| bytemark) & bytemask);
93	0	unicode >>= 6;
94	0	chars[2] = static_cast<char>((unicode \| bytemark) & bytemask);
95	0	unicode >>= 6;
96	0	chars[1] = static_cast<char>((unicode \| bytemark) & bytemask);
97	0	unicode >>= 6;
98	0	chars[0] = static_cast<char>(unicode \| 0xf0);
99	0	} else {
100	0	memset(chars, 0, UNICHAR_LEN);
101	0	}
102	0	}
103
104		// Get the first character as UCS-4.
105	93.4M	int UNICHAR::first_uni() const {
106	93.4M	static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};
107	93.4M	int uni = 0;
108	93.4M	int len = utf8_step(chars);
109	93.4M	const char *src = chars;
110
111	93.4M	switch (len) {
112	0	default:
113	0	break;
114	0	case 4:
115	0	uni += static_cast<unsigned char>(*src++);
116	0	uni <<= 6;
117		// Fall through.
118	14.4M	case 3:
119	14.4M	uni += static_cast<unsigned char>(*src++);
120	14.4M	uni <<= 6;
121		// Fall through.
122	16.3M	case 2:
123	16.3M	uni += static_cast<unsigned char>(*src++);
124	16.3M	uni <<= 6;
125		// Fall through.
126	93.4M	case 1:
127	93.4M	uni += static_cast<unsigned char>(*src++);
128	93.4M	}
129	93.4M	uni -= utf8_offsets[len];
130	93.4M	return uni;
131	93.4M	}
132
133		// Get a terminated UTF8 string: Must delete[] it after use.
134	0	char *UNICHAR::utf8_str() const {
135	0	int len = utf8_len();
136	0	char *str = new char[len + 1];
137	0	memcpy(str, chars, len);
138	0	str[len] = 0;
139	0	return str;
140	0	}
141
142		// Get the number of bytes in the first character of the given utf8 string.
143	187M	int UNICHAR::utf8_step(const char *utf8_str) {
144	187M	static const char utf8_bytes[256] = {
145	187M	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
146	187M	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
147	187M	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148	187M	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
149	187M	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150	187M	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151	187M	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
152	187M	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
153	187M	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};
154
155	187M	return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
156	187M	}
157
158	0	UNICHAR::const_iterator &UNICHAR::const_iterator::operator++() {
159	0	ASSERT_HOST(it_ != nullptr);
160	0	int step = utf8_step(it_);
161	0	if (step == 0) {
162	0	tprintf("ERROR: Illegal UTF8 encountered.\n");
163	0	for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
164	0	tprintf("Index %d char = 0x%x\n", i, it_[i]);
165	0	}
166	0	step = 1;
167	0	}
168	0	it_ += step;
169	0	return *this;
170	0	}
171
172	0	int UNICHAR::const_iterator::operator*() const {
173	0	ASSERT_HOST(it_ != nullptr);
174	0	const int len = utf8_step(it_);
175	0	if (len == 0) {
176	0	tprintf("WARNING: Illegal UTF8 encountered\n");
177	0	return ' ';
178	0	}
179	0	UNICHAR uch(it_, len);
180	0	return uch.first_uni();
181	0	}
182
183	0	int UNICHAR::const_iterator::get_utf8(char *utf8_output) const {
184	0	ASSERT_HOST(it_ != nullptr);
185	0	const int len = utf8_step(it_);
186	0	if (len == 0) {
187	0	tprintf("WARNING: Illegal UTF8 encountered\n");
188	0	utf8_output[0] = ' ';
189	0	return 1;
190	0	}
191	0	strncpy(utf8_output, it_, len);
192	0	return len;
193	0	}
194
195	0	int UNICHAR::const_iterator::utf8_len() const {
196	0	ASSERT_HOST(it_ != nullptr);
197	0	const int len = utf8_step(it_);
198	0	if (len == 0) {
199	0	tprintf("WARNING: Illegal UTF8 encountered\n");
200	0	return 1;
201	0	}
202	0	return len;
203	0	}
204
205	0	bool UNICHAR::const_iterator::is_legal() const {
206	0	return utf8_step(it_) > 0;
207	0	}
208
209	0	UNICHAR::const_iterator UNICHAR::begin(const char *utf8_str, int len) {
210	0	return UNICHAR::const_iterator(utf8_str);
211	0	}
212
213	0	UNICHAR::const_iterator UNICHAR::end(const char *utf8_str, int len) {
214	0	return UNICHAR::const_iterator(utf8_str + len);
215	0	}
216
217		// Converts a utf-8 string to a vector of unicodes.
218		// Returns an empty vector if the input contains invalid UTF-8.
219		/* static */
220	0	std::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) {
221	0	const int utf8_length = strlen(utf8_str);
222	0	std::vector<char32> unicodes;
223	0	unicodes.reserve(utf8_length);
224	0	const_iterator end_it(end(utf8_str, utf8_length));
225	0	for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
226	0	if (it.is_legal()) {
227	0	unicodes.push_back(*it);
228	0	} else {
229	0	unicodes.clear();
230	0	return unicodes;
231	0	}
232	0	}
233	0	return unicodes;
234	0	}
235
236		// Returns an empty string if the input contains an invalid unicode.
237	0	std::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) {
238	0	std::string utf8_str;
239	0	for (char32 ch : str32) {
240	0	UNICHAR uni_ch(ch);
241	0	int step;
242	0	if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
243	0	utf8_str.append(uni_ch.utf8(), step);
244	0	} else {
245	0	return "";
246	0	}
247	0	}
248	0	return utf8_str;
249	0	}
250
251		} // namespace tesseract