/src/harfbuzz/src/hb-utf.hh
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright © 2011,2012,2014 Google, Inc. |
3 | | * |
4 | | * This is part of HarfBuzz, a text shaping library. |
5 | | * |
6 | | * Permission is hereby granted, without written agreement and without |
7 | | * license or royalty fees, to use, copy, modify, and distribute this |
8 | | * software and its documentation for any purpose, provided that the |
9 | | * above copyright notice and the following two paragraphs appear in |
10 | | * all copies of this software. |
11 | | * |
12 | | * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
13 | | * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
14 | | * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
15 | | * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
16 | | * DAMAGE. |
17 | | * |
18 | | * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
19 | | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
20 | | * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
21 | | * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
22 | | * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
23 | | * |
24 | | * Google Author(s): Behdad Esfahbod |
25 | | */ |
26 | | |
27 | | #ifndef HB_UTF_HH |
28 | | #define HB_UTF_HH |
29 | | |
30 | | #include "hb.hh" |
31 | | |
32 | | #include "hb-open-type.hh" |
33 | | |
34 | | |
35 | | struct hb_utf8_t |
36 | | { |
37 | | typedef uint8_t codepoint_t; |
38 | | |
39 | | static const codepoint_t * |
40 | | next (const codepoint_t *text, |
41 | | const codepoint_t *end, |
42 | | hb_codepoint_t *unicode, |
43 | | hb_codepoint_t replacement) |
44 | 91.8M | { |
45 | | /* Written to only accept well-formed sequences. |
46 | | * Based on ideas from ICU's U8_NEXT. |
47 | | * Generates one "replacement" for each ill-formed byte. */ |
48 | | |
49 | 91.8M | hb_codepoint_t c = *text++; |
50 | | |
51 | 91.8M | if (c > 0x7Fu) |
52 | 0 | { |
53 | 0 | if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */ |
54 | 0 | { |
55 | 0 | unsigned int t1; |
56 | 0 | if (likely (text < end && |
57 | 0 | (t1 = text[0] - 0x80u) <= 0x3Fu)) |
58 | 0 | { |
59 | 0 | c = ((c&0x1Fu)<<6) | t1; |
60 | 0 | text++; |
61 | 0 | } |
62 | 0 | else |
63 | 0 | goto error; |
64 | 0 | } |
65 | 0 | else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */ |
66 | 0 | { |
67 | 0 | unsigned int t1, t2; |
68 | 0 | if (likely (1 < end - text && |
69 | 0 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
70 | 0 | (t2 = text[1] - 0x80u) <= 0x3Fu)) |
71 | 0 | { |
72 | 0 | c = ((c&0xFu)<<12) | (t1<<6) | t2; |
73 | 0 | if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
74 | 0 | goto error; |
75 | 0 | text += 2; |
76 | 0 | } |
77 | 0 | else |
78 | 0 | goto error; |
79 | 0 | } |
80 | 0 | else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */ |
81 | 0 | { |
82 | 0 | unsigned int t1, t2, t3; |
83 | 0 | if (likely (2 < end - text && |
84 | 0 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
85 | 0 | (t2 = text[1] - 0x80u) <= 0x3Fu && |
86 | 0 | (t3 = text[2] - 0x80u) <= 0x3Fu)) |
87 | 0 | { |
88 | 0 | c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; |
89 | 0 | if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu))) |
90 | 0 | goto error; |
91 | 0 | text += 3; |
92 | 0 | } |
93 | 0 | else |
94 | 0 | goto error; |
95 | 0 | } |
96 | 0 | else |
97 | 0 | goto error; |
98 | 0 | } |
99 | | |
100 | 91.8M | *unicode = c; |
101 | 91.8M | return text; |
102 | | |
103 | 0 | error: |
104 | 0 | *unicode = replacement; |
105 | 0 | return text; |
106 | 91.8M | } |
107 | | |
108 | | static const codepoint_t * |
109 | | prev (const codepoint_t *text, |
110 | | const codepoint_t *start, |
111 | | hb_codepoint_t *unicode, |
112 | | hb_codepoint_t replacement) |
113 | 0 | { |
114 | 0 | const codepoint_t *end = text--; |
115 | 0 | while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) |
116 | 0 | text--; |
117 | |
|
118 | 0 | if (likely (next (text, end, unicode, replacement) == end)) |
119 | 0 | return text; |
120 | | |
121 | 0 | *unicode = replacement; |
122 | 0 | return end - 1; |
123 | 0 | } |
124 | | |
125 | | static unsigned int |
126 | | strlen (const codepoint_t *text) |
127 | 6.38M | { return ::strlen ((const char *) text); } |
128 | | |
129 | | static unsigned int |
130 | | encode_len (hb_codepoint_t unicode) |
131 | 7.85M | { |
132 | 7.85M | if (unicode < 0x0080u) return 1; |
133 | 402k | if (unicode < 0x0800u) return 2; |
134 | 382k | if (unicode < 0x10000u) return 3; |
135 | 61 | if (unicode < 0x110000u) return 4; |
136 | 0 | return 3; |
137 | 61 | } |
138 | | |
139 | | static codepoint_t * |
140 | | encode (codepoint_t *text, |
141 | | const codepoint_t *end, |
142 | | hb_codepoint_t unicode) |
143 | 45.1k | { |
144 | 45.1k | if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
145 | 0 | unicode = 0xFFFDu; |
146 | 45.1k | if (unicode < 0x0080u) |
147 | 41.2k | *text++ = unicode; |
148 | 3.90k | else if (unicode < 0x0800u) |
149 | 2.81k | { |
150 | 2.81k | if (end - text >= 2) |
151 | 2.74k | { |
152 | 2.74k | *text++ = 0xC0u + (0x1Fu & (unicode >> 6)); |
153 | 2.74k | *text++ = 0x80u + (0x3Fu & (unicode )); |
154 | 2.74k | } |
155 | 2.81k | } |
156 | 1.09k | else if (unicode < 0x10000u) |
157 | 1.09k | { |
158 | 1.09k | if (end - text >= 3) |
159 | 558 | { |
160 | 558 | *text++ = 0xE0u + (0x0Fu & (unicode >> 12)); |
161 | 558 | *text++ = 0x80u + (0x3Fu & (unicode >> 6)); |
162 | 558 | *text++ = 0x80u + (0x3Fu & (unicode )); |
163 | 558 | } |
164 | 1.09k | } |
165 | 2 | else |
166 | 2 | { |
167 | 2 | if (end - text >= 4) |
168 | 1 | { |
169 | 1 | *text++ = 0xF0u + (0x07u & (unicode >> 18)); |
170 | 1 | *text++ = 0x80u + (0x3Fu & (unicode >> 12)); |
171 | 1 | *text++ = 0x80u + (0x3Fu & (unicode >> 6)); |
172 | 1 | *text++ = 0x80u + (0x3Fu & (unicode )); |
173 | 1 | } |
174 | 2 | } |
175 | 45.1k | return text; |
176 | 45.1k | } |
177 | | }; |
178 | | |
179 | | |
180 | | template <typename TCodepoint> |
181 | | struct hb_utf16_xe_t |
182 | | { |
183 | | static_assert (sizeof (TCodepoint) == 2, ""); |
184 | | typedef TCodepoint codepoint_t; |
185 | | |
186 | | static const codepoint_t * |
187 | | next (const codepoint_t *text, |
188 | | const codepoint_t *end, |
189 | | hb_codepoint_t *unicode, |
190 | | hb_codepoint_t replacement) |
191 | 22.8M | { |
192 | 22.8M | hb_codepoint_t c = *text++; |
193 | | |
194 | 22.8M | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
195 | 22.8M | { |
196 | 22.8M | *unicode = c; |
197 | 22.8M | return text; |
198 | 22.8M | } |
199 | | |
200 | 7.29k | if (likely (c <= 0xDBFFu && text < end)) |
201 | 3.91k | { |
202 | | /* High-surrogate in c */ |
203 | 3.91k | hb_codepoint_t l = *text; |
204 | 3.91k | if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) |
205 | 187 | { |
206 | | /* Low-surrogate in l */ |
207 | 187 | *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
208 | 187 | text++; |
209 | 187 | return text; |
210 | 187 | } |
211 | 3.91k | } |
212 | | |
213 | | /* Lonely / out-of-order surrogate. */ |
214 | 7.10k | *unicode = replacement; |
215 | 7.10k | return text; |
216 | 7.29k | } Unexecuted instantiation: hb_utf16_xe_t<unsigned short>::next(unsigned short const*, unsigned short const*, unsigned int*, unsigned int) hb_utf16_xe_t<OT::IntType<unsigned short, 2u> >::next(OT::IntType<unsigned short, 2u> const*, OT::IntType<unsigned short, 2u> const*, unsigned int*, unsigned int) Line | Count | Source | 191 | 22.8M | { | 192 | 22.8M | hb_codepoint_t c = *text++; | 193 | | | 194 | 22.8M | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) | 195 | 22.8M | { | 196 | 22.8M | *unicode = c; | 197 | 22.8M | return text; | 198 | 22.8M | } | 199 | | | 200 | 7.29k | if (likely (c <= 0xDBFFu && text < end)) | 201 | 3.91k | { | 202 | | /* High-surrogate in c */ | 203 | 3.91k | hb_codepoint_t l = *text; | 204 | 3.91k | if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) | 205 | 187 | { | 206 | | /* Low-surrogate in l */ | 207 | 187 | *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); | 208 | 187 | text++; | 209 | 187 | return text; | 210 | 187 | } | 211 | 3.91k | } | 212 | | | 213 | | /* Lonely / out-of-order surrogate. */ | 214 | 7.10k | *unicode = replacement; | 215 | 7.10k | return text; | 216 | 7.29k | } |
|
217 | | |
218 | | static const codepoint_t * |
219 | | prev (const codepoint_t *text, |
220 | | const codepoint_t *start, |
221 | | hb_codepoint_t *unicode, |
222 | | hb_codepoint_t replacement) |
223 | 0 | { |
224 | 0 | hb_codepoint_t c = *--text; |
225 | |
|
226 | 0 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
227 | 0 | { |
228 | 0 | *unicode = c; |
229 | 0 | return text; |
230 | 0 | } |
231 | | |
232 | 0 | if (likely (c >= 0xDC00u && start < text)) |
233 | 0 | { |
234 | | /* Low-surrogate in c */ |
235 | 0 | hb_codepoint_t h = text[-1]; |
236 | 0 | if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu))) |
237 | 0 | { |
238 | | /* High-surrogate in h */ |
239 | 0 | *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
240 | 0 | text--; |
241 | 0 | return text; |
242 | 0 | } |
243 | 0 | } |
244 | | |
245 | | /* Lonely / out-of-order surrogate. */ |
246 | 0 | *unicode = replacement; |
247 | 0 | return text; |
248 | 0 | } |
249 | | |
250 | | |
251 | | static unsigned int |
252 | | strlen (const codepoint_t *text) |
253 | 0 | { |
254 | 0 | unsigned int l = 0; |
255 | 0 | while (*text++) l++; |
256 | 0 | return l; |
257 | 0 | } |
258 | | |
259 | | static unsigned int |
260 | | encode_len (hb_codepoint_t unicode) |
261 | 7.90M | { |
262 | 7.90M | return unicode < 0x10000 ? 1 : 2; |
263 | 7.90M | } |
264 | | |
265 | | static codepoint_t * |
266 | | encode (codepoint_t *text, |
267 | | const codepoint_t *end, |
268 | | hb_codepoint_t unicode) |
269 | 0 | { |
270 | 0 | if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
271 | 0 | unicode = 0xFFFDu; |
272 | 0 | if (unicode < 0x10000u) |
273 | 0 | *text++ = unicode; |
274 | 0 | else if (end - text >= 2) |
275 | 0 | { |
276 | 0 | unicode -= 0x10000u; |
277 | 0 | *text++ = 0xD800u + (unicode >> 10); |
278 | 0 | *text++ = 0xDC00u + (unicode & 0x03FFu); |
279 | 0 | } |
280 | 0 | return text; |
281 | 0 | } |
282 | | }; |
283 | | |
284 | | typedef hb_utf16_xe_t<uint16_t> hb_utf16_t; |
285 | | typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t; |
286 | | |
287 | | |
288 | | template <typename TCodepoint, bool validate=true> |
289 | | struct hb_utf32_xe_t |
290 | | { |
291 | | static_assert (sizeof (TCodepoint) == 4, ""); |
292 | | typedef TCodepoint codepoint_t; |
293 | | |
294 | | static const TCodepoint * |
295 | | next (const TCodepoint *text, |
296 | | const TCodepoint *end HB_UNUSED, |
297 | | hb_codepoint_t *unicode, |
298 | | hb_codepoint_t replacement) |
299 | 102M | { |
300 | 102M | hb_codepoint_t c = *unicode = *text++; |
301 | 102M | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
302 | 10.3M | *unicode = replacement; |
303 | 102M | return text; |
304 | 102M | } hb_utf32_xe_t<unsigned int, true>::next(unsigned int const*, unsigned int const*, unsigned int*, unsigned int) Line | Count | Source | 299 | 102M | { | 300 | 102M | hb_codepoint_t c = *unicode = *text++; | 301 | 102M | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) | 302 | 10.3M | *unicode = replacement; | 303 | 102M | return text; | 304 | 102M | } |
Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::next(unsigned int const*, unsigned int const*, unsigned int*, unsigned int) |
305 | | |
306 | | static const TCodepoint * |
307 | | prev (const TCodepoint *text, |
308 | | const TCodepoint *start HB_UNUSED, |
309 | | hb_codepoint_t *unicode, |
310 | | hb_codepoint_t replacement) |
311 | 0 | { |
312 | 0 | hb_codepoint_t c = *unicode = *--text; |
313 | 0 | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
314 | 0 | *unicode = replacement; |
315 | 0 | return text; |
316 | 0 | } Unexecuted instantiation: hb_utf32_xe_t<unsigned int, true>::prev(unsigned int const*, unsigned int const*, unsigned int*, unsigned int) Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::prev(unsigned int const*, unsigned int const*, unsigned int*, unsigned int) |
317 | | |
318 | | static unsigned int |
319 | | strlen (const TCodepoint *text) |
320 | 0 | { |
321 | 0 | unsigned int l = 0; |
322 | 0 | while (*text++) l++; |
323 | 0 | return l; |
324 | 0 | } Unexecuted instantiation: hb_utf32_xe_t<unsigned int, true>::strlen(unsigned int const*) Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::strlen(unsigned int const*) |
325 | | |
326 | | static unsigned int |
327 | | encode_len (hb_codepoint_t unicode HB_UNUSED) |
328 | 7.90M | { |
329 | 7.90M | return 1; |
330 | 7.90M | } |
331 | | |
332 | | static codepoint_t * |
333 | | encode (codepoint_t *text, |
334 | | const codepoint_t *end HB_UNUSED, |
335 | | hb_codepoint_t unicode) |
336 | 0 | { |
337 | 0 | if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
338 | 0 | unicode = 0xFFFDu; |
339 | 0 | *text++ = unicode; |
340 | 0 | return text; |
341 | 0 | } |
342 | | }; |
343 | | |
344 | | typedef hb_utf32_xe_t<uint32_t> hb_utf32_t; |
345 | | typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t; |
346 | | |
347 | | |
348 | | struct hb_latin1_t |
349 | | { |
350 | | typedef uint8_t codepoint_t; |
351 | | |
352 | | static const codepoint_t * |
353 | | next (const codepoint_t *text, |
354 | | const codepoint_t *end HB_UNUSED, |
355 | | hb_codepoint_t *unicode, |
356 | | hb_codepoint_t replacement HB_UNUSED) |
357 | 0 | { |
358 | 0 | *unicode = *text++; |
359 | 0 | return text; |
360 | 0 | } |
361 | | |
362 | | static const codepoint_t * |
363 | | prev (const codepoint_t *text, |
364 | | const codepoint_t *start HB_UNUSED, |
365 | | hb_codepoint_t *unicode, |
366 | | hb_codepoint_t replacement HB_UNUSED) |
367 | 0 | { |
368 | 0 | *unicode = *--text; |
369 | 0 | return text; |
370 | 0 | } |
371 | | |
372 | | static unsigned int |
373 | | strlen (const codepoint_t *text) |
374 | 0 | { |
375 | 0 | unsigned int l = 0; |
376 | 0 | while (*text++) l++; |
377 | 0 | return l; |
378 | 0 | } |
379 | | |
380 | | static unsigned int |
381 | | encode_len (hb_codepoint_t unicode HB_UNUSED) |
382 | 0 | { |
383 | 0 | return 1; |
384 | 0 | } |
385 | | |
386 | | static codepoint_t * |
387 | | encode (codepoint_t *text, |
388 | | const codepoint_t *end HB_UNUSED, |
389 | | hb_codepoint_t unicode) |
390 | 0 | { |
391 | 0 | if (unlikely (unicode >= 0x0100u)) |
392 | 0 | unicode = '?'; |
393 | 0 | *text++ = unicode; |
394 | 0 | return text; |
395 | 0 | } |
396 | | }; |
397 | | |
398 | | |
399 | | struct hb_ascii_t |
400 | | { |
401 | | typedef uint8_t codepoint_t; |
402 | | |
403 | | static const codepoint_t * |
404 | | next (const codepoint_t *text, |
405 | | const codepoint_t *end HB_UNUSED, |
406 | | hb_codepoint_t *unicode, |
407 | | hb_codepoint_t replacement HB_UNUSED) |
408 | 824k | { |
409 | 824k | *unicode = *text++; |
410 | 824k | if (*unicode >= 0x0080u) |
411 | 1.16k | *unicode = replacement; |
412 | 824k | return text; |
413 | 824k | } |
414 | | |
415 | | static const codepoint_t * |
416 | | prev (const codepoint_t *text, |
417 | | const codepoint_t *start HB_UNUSED, |
418 | | hb_codepoint_t *unicode, |
419 | | hb_codepoint_t replacement) |
420 | 0 | { |
421 | 0 | *unicode = *--text; |
422 | 0 | if (*unicode >= 0x0080u) |
423 | 0 | *unicode = replacement; |
424 | 0 | return text; |
425 | 0 | } |
426 | | |
427 | | static unsigned int |
428 | | strlen (const codepoint_t *text) |
429 | 0 | { |
430 | 0 | unsigned int l = 0; |
431 | 0 | while (*text++) l++; |
432 | 0 | return l; |
433 | 0 | } |
434 | | |
435 | | static unsigned int |
436 | | encode_len (hb_codepoint_t unicode HB_UNUSED) |
437 | 0 | { |
438 | 0 | return 1; |
439 | 0 | } |
440 | | |
441 | | static codepoint_t * |
442 | | encode (codepoint_t *text, |
443 | | const codepoint_t *end HB_UNUSED, |
444 | | hb_codepoint_t unicode) |
445 | 0 | { |
446 | 0 | if (unlikely (unicode >= 0x0080u)) |
447 | 0 | unicode = '?'; |
448 | 0 | *text++ = unicode; |
449 | 0 | return text; |
450 | 0 | } |
451 | | }; |
452 | | |
453 | | #endif /* HB_UTF_HH */ |