/src/harfbuzz/src/hb-utf.hh
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright © 2011,2012,2014 Google, Inc. |
3 | | * |
4 | | * This is part of HarfBuzz, a text shaping library. |
5 | | * |
6 | | * Permission is hereby granted, without written agreement and without |
7 | | * license or royalty fees, to use, copy, modify, and distribute this |
8 | | * software and its documentation for any purpose, provided that the |
9 | | * above copyright notice and the following two paragraphs appear in |
10 | | * all copies of this software. |
11 | | * |
12 | | * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
13 | | * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
14 | | * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
15 | | * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
16 | | * DAMAGE. |
17 | | * |
18 | | * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
19 | | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
20 | | * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
21 | | * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
22 | | * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
23 | | * |
24 | | * Google Author(s): Behdad Esfahbod |
25 | | */ |
26 | | |
27 | | #ifndef HB_UTF_HH |
28 | | #define HB_UTF_HH |
29 | | |
30 | | #include "hb.hh" |
31 | | |
32 | | #include "hb-open-type.hh" |
33 | | |
34 | | |
35 | | struct hb_utf8_t |
36 | | { |
37 | | typedef uint8_t codepoint_t; |
38 | | static constexpr unsigned max_len = 4; |
39 | | |
40 | | static const codepoint_t * |
41 | | next (const codepoint_t *text, |
42 | | const codepoint_t *end, |
43 | | hb_codepoint_t *unicode, |
44 | | hb_codepoint_t replacement) |
45 | 0 | { |
46 | | /* Written to only accept well-formed sequences. |
47 | | * Based on ideas from ICU's U8_NEXT. |
48 | | * Generates one "replacement" for each ill-formed byte. */ |
49 | |
|
50 | 0 | hb_codepoint_t c = *text++; |
51 | |
|
52 | 0 | if (c > 0x7Fu) |
53 | 0 | { |
54 | 0 | if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */ |
55 | 0 | { |
56 | 0 | unsigned int t1; |
57 | 0 | if (likely (text < end && |
58 | 0 | (t1 = text[0] - 0x80u) <= 0x3Fu)) |
59 | 0 | { |
60 | 0 | c = ((c&0x1Fu)<<6) | t1; |
61 | 0 | text++; |
62 | 0 | } |
63 | 0 | else |
64 | 0 | goto error; |
65 | 0 | } |
66 | 0 | else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */ |
67 | 0 | { |
68 | 0 | unsigned int t1, t2; |
69 | 0 | if (likely (1 < end - text && |
70 | 0 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
71 | 0 | (t2 = text[1] - 0x80u) <= 0x3Fu)) |
72 | 0 | { |
73 | 0 | c = ((c&0xFu)<<12) | (t1<<6) | t2; |
74 | 0 | if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
75 | 0 | goto error; |
76 | 0 | text += 2; |
77 | 0 | } |
78 | 0 | else |
79 | 0 | goto error; |
80 | 0 | } |
81 | 0 | else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */ |
82 | 0 | { |
83 | 0 | unsigned int t1, t2, t3; |
84 | 0 | if (likely (2 < end - text && |
85 | 0 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
86 | 0 | (t2 = text[1] - 0x80u) <= 0x3Fu && |
87 | 0 | (t3 = text[2] - 0x80u) <= 0x3Fu)) |
88 | 0 | { |
89 | 0 | c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; |
90 | 0 | if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu))) |
91 | 0 | goto error; |
92 | 0 | text += 3; |
93 | 0 | } |
94 | 0 | else |
95 | 0 | goto error; |
96 | 0 | } |
97 | 0 | else |
98 | 0 | goto error; |
99 | 0 | } |
100 | | |
101 | 0 | *unicode = c; |
102 | 0 | return text; |
103 | | |
104 | 0 | error: |
105 | 0 | *unicode = replacement; |
106 | 0 | return text; |
107 | 0 | } |
108 | | |
109 | | static const codepoint_t * |
110 | | prev (const codepoint_t *text, |
111 | | const codepoint_t *start, |
112 | | hb_codepoint_t *unicode, |
113 | | hb_codepoint_t replacement) |
114 | 0 | { |
115 | 0 | const codepoint_t *end = text--; |
116 | 0 | while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) |
117 | 0 | text--; |
118 | |
|
119 | 0 | if (likely (next (text, end, unicode, replacement) == end)) |
120 | 0 | return text; |
121 | | |
122 | 0 | *unicode = replacement; |
123 | 0 | return end - 1; |
124 | 0 | } |
125 | | |
126 | | static unsigned int |
127 | | strlen (const codepoint_t *text) |
128 | 0 | { return ::strlen ((const char *) text); } |
129 | | |
130 | | static unsigned int |
131 | | encode_len (hb_codepoint_t unicode) |
132 | 0 | { |
133 | 0 | if (unicode < 0x0080u) return 1; |
134 | 0 | if (unicode < 0x0800u) return 2; |
135 | 0 | if (unicode < 0x10000u) return 3; |
136 | 0 | if (unicode < 0x110000u) return 4; |
137 | 0 | return 3; |
138 | 0 | } |
139 | | |
140 | | static codepoint_t * |
141 | | encode (codepoint_t *text, |
142 | | const codepoint_t *end, |
143 | | hb_codepoint_t unicode) |
144 | 0 | { |
145 | 0 | if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
146 | 0 | unicode = 0xFFFDu; |
147 | 0 | if (unicode < 0x0080u) |
148 | 0 | *text++ = unicode; |
149 | 0 | else if (unicode < 0x0800u) |
150 | 0 | { |
151 | 0 | if (end - text >= 2) |
152 | 0 | { |
153 | 0 | *text++ = 0xC0u + (0x1Fu & (unicode >> 6)); |
154 | 0 | *text++ = 0x80u + (0x3Fu & (unicode )); |
155 | 0 | } |
156 | 0 | } |
157 | 0 | else if (unicode < 0x10000u) |
158 | 0 | { |
159 | 0 | if (end - text >= 3) |
160 | 0 | { |
161 | 0 | *text++ = 0xE0u + (0x0Fu & (unicode >> 12)); |
162 | 0 | *text++ = 0x80u + (0x3Fu & (unicode >> 6)); |
163 | 0 | *text++ = 0x80u + (0x3Fu & (unicode )); |
164 | 0 | } |
165 | 0 | } |
166 | 0 | else |
167 | 0 | { |
168 | 0 | if (end - text >= 4) |
169 | 0 | { |
170 | 0 | *text++ = 0xF0u + (0x07u & (unicode >> 18)); |
171 | 0 | *text++ = 0x80u + (0x3Fu & (unicode >> 12)); |
172 | 0 | *text++ = 0x80u + (0x3Fu & (unicode >> 6)); |
173 | 0 | *text++ = 0x80u + (0x3Fu & (unicode )); |
174 | 0 | } |
175 | 0 | } |
176 | 0 | return text; |
177 | 0 | } |
178 | | }; |
179 | | |
180 | | |
181 | | template <typename TCodepoint> |
182 | | struct hb_utf16_xe_t |
183 | | { |
184 | | static_assert (sizeof (TCodepoint) == 2, ""); |
185 | | typedef TCodepoint codepoint_t; |
186 | | static constexpr unsigned max_len = 2; |
187 | | |
188 | | static const codepoint_t * |
189 | | next (const codepoint_t *text, |
190 | | const codepoint_t *end, |
191 | | hb_codepoint_t *unicode, |
192 | | hb_codepoint_t replacement) |
193 | 0 | { |
194 | 0 | hb_codepoint_t c = *text++; |
195 | |
|
196 | 0 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
197 | 0 | { |
198 | 0 | *unicode = c; |
199 | 0 | return text; |
200 | 0 | } |
201 | | |
202 | 0 | if (likely (c <= 0xDBFFu && text < end)) |
203 | 0 | { |
204 | | /* High-surrogate in c */ |
205 | 0 | hb_codepoint_t l = *text; |
206 | 0 | if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) |
207 | 0 | { |
208 | | /* Low-surrogate in l */ |
209 | 0 | *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
210 | 0 | text++; |
211 | 0 | return text; |
212 | 0 | } |
213 | 0 | } |
214 | | |
215 | | /* Lonely / out-of-order surrogate. */ |
216 | 0 | *unicode = replacement; |
217 | 0 | return text; |
218 | 0 | } |
219 | | |
220 | | static const codepoint_t * |
221 | | prev (const codepoint_t *text, |
222 | | const codepoint_t *start, |
223 | | hb_codepoint_t *unicode, |
224 | | hb_codepoint_t replacement) |
225 | 0 | { |
226 | 0 | hb_codepoint_t c = *--text; |
227 | |
|
228 | 0 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
229 | 0 | { |
230 | 0 | *unicode = c; |
231 | 0 | return text; |
232 | 0 | } |
233 | | |
234 | 0 | if (likely (c >= 0xDC00u && start < text)) |
235 | 0 | { |
236 | | /* Low-surrogate in c */ |
237 | 0 | hb_codepoint_t h = text[-1]; |
238 | 0 | if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu))) |
239 | 0 | { |
240 | | /* High-surrogate in h */ |
241 | 0 | *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
242 | 0 | text--; |
243 | 0 | return text; |
244 | 0 | } |
245 | 0 | } |
246 | | |
247 | | /* Lonely / out-of-order surrogate. */ |
248 | 0 | *unicode = replacement; |
249 | 0 | return text; |
250 | 0 | } |
251 | | |
252 | | |
253 | | static unsigned int |
254 | | strlen (const codepoint_t *text) |
255 | 0 | { |
256 | 0 | unsigned int l = 0; |
257 | 0 | while (*text++) l++; |
258 | 0 | return l; |
259 | 0 | } |
260 | | |
261 | | static unsigned int |
262 | | encode_len (hb_codepoint_t unicode) |
263 | | { |
264 | | return unicode < 0x10000 ? 1 : 2; |
265 | | } |
266 | | |
267 | | static codepoint_t * |
268 | | encode (codepoint_t *text, |
269 | | const codepoint_t *end, |
270 | | hb_codepoint_t unicode) |
271 | | { |
272 | | if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
273 | | unicode = 0xFFFDu; |
274 | | if (unicode < 0x10000u) |
275 | | *text++ = unicode; |
276 | | else if (end - text >= 2) |
277 | | { |
278 | | unicode -= 0x10000u; |
279 | | *text++ = 0xD800u + (unicode >> 10); |
280 | | *text++ = 0xDC00u + (unicode & 0x03FFu); |
281 | | } |
282 | | return text; |
283 | | } |
284 | | }; |
285 | | |
286 | | typedef hb_utf16_xe_t<uint16_t> hb_utf16_t; |
287 | | typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t; |
288 | | |
289 | | |
290 | | template <typename TCodepoint, bool validate=true> |
291 | | struct hb_utf32_xe_t |
292 | | { |
293 | | static_assert (sizeof (TCodepoint) == 4, ""); |
294 | | typedef TCodepoint codepoint_t; |
295 | | static constexpr unsigned max_len = 1; |
296 | | |
297 | | static const TCodepoint * |
298 | | next (const TCodepoint *text, |
299 | | const TCodepoint *end HB_UNUSED, |
300 | | hb_codepoint_t *unicode, |
301 | | hb_codepoint_t replacement) |
302 | 2.93M | { |
303 | 2.93M | hb_codepoint_t c = *unicode = *text++; |
304 | 2.93M | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
305 | 0 | *unicode = replacement; |
306 | 2.93M | return text; |
307 | 2.93M | } hb_utf32_xe_t<unsigned int, true>::next(unsigned int const*, unsigned int const*, unsigned int*, unsigned int) Line | Count | Source | 302 | 2.93M | { | 303 | 2.93M | hb_codepoint_t c = *unicode = *text++; | 304 | 2.93M | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) | 305 | 0 | *unicode = replacement; | 306 | 2.93M | return text; | 307 | 2.93M | } |
Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::next(unsigned int const*, unsigned int const*, unsigned int*, unsigned int) |
308 | | |
309 | | static const TCodepoint * |
310 | | prev (const TCodepoint *text, |
311 | | const TCodepoint *start HB_UNUSED, |
312 | | hb_codepoint_t *unicode, |
313 | | hb_codepoint_t replacement) |
314 | 0 | { |
315 | 0 | hb_codepoint_t c = *unicode = *--text; |
316 | 0 | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
317 | 0 | *unicode = replacement; |
318 | 0 | return text; |
319 | 0 | } Unexecuted instantiation: hb_utf32_xe_t<unsigned int, true>::prev(unsigned int const*, unsigned int const*, unsigned int*, unsigned int) Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::prev(unsigned int const*, unsigned int const*, unsigned int*, unsigned int) |
320 | | |
321 | | static unsigned int |
322 | | strlen (const TCodepoint *text) |
323 | 0 | { |
324 | 0 | unsigned int l = 0; |
325 | 0 | while (*text++) l++; |
326 | 0 | return l; |
327 | 0 | } Unexecuted instantiation: hb_utf32_xe_t<unsigned int, true>::strlen(unsigned int const*) Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::strlen(unsigned int const*) |
328 | | |
329 | | static unsigned int |
330 | | encode_len (hb_codepoint_t unicode HB_UNUSED) |
331 | | { |
332 | | return 1; |
333 | | } |
334 | | |
335 | | static codepoint_t * |
336 | | encode (codepoint_t *text, |
337 | | const codepoint_t *end HB_UNUSED, |
338 | | hb_codepoint_t unicode) |
339 | | { |
340 | | if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
341 | | unicode = 0xFFFDu; |
342 | | *text++ = unicode; |
343 | | return text; |
344 | | } |
345 | | }; |
346 | | |
347 | | typedef hb_utf32_xe_t<uint32_t> hb_utf32_t; |
348 | | typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t; |
349 | | |
350 | | |
351 | | struct hb_latin1_t |
352 | | { |
353 | | typedef uint8_t codepoint_t; |
354 | | static constexpr unsigned max_len = 1; |
355 | | |
356 | | static const codepoint_t * |
357 | | next (const codepoint_t *text, |
358 | | const codepoint_t *end HB_UNUSED, |
359 | | hb_codepoint_t *unicode, |
360 | | hb_codepoint_t replacement HB_UNUSED) |
361 | 0 | { |
362 | 0 | *unicode = *text++; |
363 | 0 | return text; |
364 | 0 | } |
365 | | |
366 | | static const codepoint_t * |
367 | | prev (const codepoint_t *text, |
368 | | const codepoint_t *start HB_UNUSED, |
369 | | hb_codepoint_t *unicode, |
370 | | hb_codepoint_t replacement HB_UNUSED) |
371 | 0 | { |
372 | 0 | *unicode = *--text; |
373 | 0 | return text; |
374 | 0 | } |
375 | | |
376 | | static unsigned int |
377 | | strlen (const codepoint_t *text) |
378 | 0 | { |
379 | 0 | unsigned int l = 0; |
380 | 0 | while (*text++) l++; |
381 | 0 | return l; |
382 | 0 | } |
383 | | |
384 | | static unsigned int |
385 | | encode_len (hb_codepoint_t unicode HB_UNUSED) |
386 | 0 | { |
387 | 0 | return 1; |
388 | 0 | } |
389 | | |
390 | | static codepoint_t * |
391 | | encode (codepoint_t *text, |
392 | | const codepoint_t *end HB_UNUSED, |
393 | | hb_codepoint_t unicode) |
394 | 0 | { |
395 | 0 | if (unlikely (unicode >= 0x0100u)) |
396 | 0 | unicode = '?'; |
397 | 0 | *text++ = unicode; |
398 | 0 | return text; |
399 | 0 | } |
400 | | }; |
401 | | |
402 | | |
403 | | struct hb_ascii_t |
404 | | { |
405 | | typedef uint8_t codepoint_t; |
406 | | static constexpr unsigned max_len = 1; |
407 | | |
408 | | static const codepoint_t * |
409 | | next (const codepoint_t *text, |
410 | | const codepoint_t *end HB_UNUSED, |
411 | | hb_codepoint_t *unicode, |
412 | | hb_codepoint_t replacement) |
413 | 0 | { |
414 | 0 | *unicode = *text++; |
415 | 0 | if (*unicode >= 0x0080u) |
416 | 0 | *unicode = replacement; |
417 | 0 | return text; |
418 | 0 | } |
419 | | |
420 | | static const codepoint_t * |
421 | | prev (const codepoint_t *text, |
422 | | const codepoint_t *start HB_UNUSED, |
423 | | hb_codepoint_t *unicode, |
424 | | hb_codepoint_t replacement) |
425 | 0 | { |
426 | 0 | *unicode = *--text; |
427 | 0 | if (*unicode >= 0x0080u) |
428 | 0 | *unicode = replacement; |
429 | 0 | return text; |
430 | 0 | } |
431 | | |
432 | | static unsigned int |
433 | | strlen (const codepoint_t *text) |
434 | 0 | { |
435 | 0 | unsigned int l = 0; |
436 | 0 | while (*text++) l++; |
437 | 0 | return l; |
438 | 0 | } |
439 | | |
440 | | static unsigned int |
441 | | encode_len (hb_codepoint_t unicode HB_UNUSED) |
442 | 0 | { |
443 | 0 | return 1; |
444 | 0 | } |
445 | | |
446 | | static codepoint_t * |
447 | | encode (codepoint_t *text, |
448 | | const codepoint_t *end HB_UNUSED, |
449 | | hb_codepoint_t unicode) |
450 | 0 | { |
451 | 0 | if (unlikely (unicode >= 0x0080u)) |
452 | 0 | unicode = '?'; |
453 | 0 | *text++ = unicode; |
454 | 0 | return text; |
455 | 0 | } |
456 | | }; |
457 | | |
458 | | template <typename utf_t> |
459 | | static inline const typename utf_t::codepoint_t * |
460 | | hb_utf_offset_to_pointer (const typename utf_t::codepoint_t *start, |
461 | | signed offset) |
462 | | { |
463 | | hb_codepoint_t unicode; |
464 | | |
465 | | while (offset-- > 0) |
466 | | start = utf_t::next (start, |
467 | | start + utf_t::max_len, |
468 | | &unicode, |
469 | | HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT); |
470 | | |
471 | | while (offset++ < 0) |
472 | | start = utf_t::prev (start, |
473 | | start - utf_t::max_len, |
474 | | &unicode, |
475 | | HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT); |
476 | | |
477 | | return start; |
478 | | } |
479 | | |
480 | | |
481 | | #endif /* HB_UTF_HH */ |