Coverage Report

Created: 2023-09-25 06:24

/src/harfbuzz/src/hb-utf.hh
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright © 2011,2012,2014  Google, Inc.
3
 *
4
 *  This is part of HarfBuzz, a text shaping library.
5
 *
6
 * Permission is hereby granted, without written agreement and without
7
 * license or royalty fees, to use, copy, modify, and distribute this
8
 * software and its documentation for any purpose, provided that the
9
 * above copyright notice and the following two paragraphs appear in
10
 * all copies of this software.
11
 *
12
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13
 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14
 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15
 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16
 * DAMAGE.
17
 *
18
 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19
 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20
 * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
21
 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22
 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23
 *
24
 * Google Author(s): Behdad Esfahbod
25
 */
26
27
#ifndef HB_UTF_HH
28
#define HB_UTF_HH
29
30
#include "hb.hh"
31
32
#include "hb-open-type.hh"
33
34
35
struct hb_utf8_t
36
{
37
  typedef uint8_t codepoint_t;
38
  static constexpr unsigned max_len = 4;
39
40
  static const codepoint_t *
41
  next (const codepoint_t *text,
42
  const codepoint_t *end,
43
  hb_codepoint_t *unicode,
44
  hb_codepoint_t replacement)
45
0
  {
46
    /* Written to only accept well-formed sequences.
47
     * Based on ideas from ICU's U8_NEXT.
48
     * Generates one "replacement" for each ill-formed byte. */
49
50
0
    hb_codepoint_t c = *text++;
51
52
0
    if (c > 0x7Fu)
53
0
    {
54
0
      if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */
55
0
      {
56
0
  unsigned int t1;
57
0
  if (likely (text < end &&
58
0
        (t1 = text[0] - 0x80u) <= 0x3Fu))
59
0
  {
60
0
    c = ((c&0x1Fu)<<6) | t1;
61
0
    text++;
62
0
  }
63
0
  else
64
0
    goto error;
65
0
      }
66
0
      else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */
67
0
      {
68
0
  unsigned int t1, t2;
69
0
  if (likely (1 < end - text &&
70
0
        (t1 = text[0] - 0x80u) <= 0x3Fu &&
71
0
        (t2 = text[1] - 0x80u) <= 0x3Fu))
72
0
  {
73
0
    c = ((c&0xFu)<<12) | (t1<<6) | t2;
74
0
    if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
75
0
      goto error;
76
0
    text += 2;
77
0
  }
78
0
  else
79
0
    goto error;
80
0
      }
81
0
      else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */
82
0
      {
83
0
  unsigned int t1, t2, t3;
84
0
  if (likely (2 < end - text &&
85
0
        (t1 = text[0] - 0x80u) <= 0x3Fu &&
86
0
        (t2 = text[1] - 0x80u) <= 0x3Fu &&
87
0
        (t3 = text[2] - 0x80u) <= 0x3Fu))
88
0
  {
89
0
    c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
90
0
    if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu)))
91
0
      goto error;
92
0
    text += 3;
93
0
  }
94
0
  else
95
0
    goto error;
96
0
      }
97
0
      else
98
0
  goto error;
99
0
    }
100
101
0
    *unicode = c;
102
0
    return text;
103
104
0
  error:
105
0
    *unicode = replacement;
106
0
    return text;
107
0
  }
108
109
  static const codepoint_t *
110
  prev (const codepoint_t *text,
111
  const codepoint_t *start,
112
  hb_codepoint_t *unicode,
113
  hb_codepoint_t replacement)
114
0
  {
115
0
    const codepoint_t *end = text--;
116
0
    while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
117
0
      text--;
118
119
0
    if (likely (next (text, end, unicode, replacement) == end))
120
0
      return text;
121
122
0
    *unicode = replacement;
123
0
    return end - 1;
124
0
  }
125
126
  static unsigned int
127
  strlen (const codepoint_t *text)
128
0
  { return ::strlen ((const char *) text); }
129
130
  static unsigned int
131
  encode_len (hb_codepoint_t unicode)
132
0
  {
133
0
    if (unicode <   0x0080u) return 1;
134
0
    if (unicode <   0x0800u) return 2;
135
0
    if (unicode <  0x10000u) return 3;
136
0
    if (unicode < 0x110000u) return 4;
137
0
    return 3;
138
0
  }
139
140
  static codepoint_t *
141
  encode (codepoint_t *text,
142
    const codepoint_t *end,
143
    hb_codepoint_t unicode)
144
0
  {
145
0
    if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
146
0
      unicode = 0xFFFDu;
147
0
    if (unicode < 0x0080u)
148
0
     *text++ = unicode;
149
0
    else if (unicode < 0x0800u)
150
0
    {
151
0
      if (end - text >= 2)
152
0
      {
153
0
  *text++ =  0xC0u + (0x1Fu & (unicode >>  6));
154
0
  *text++ =  0x80u + (0x3Fu & (unicode      ));
155
0
      }
156
0
    }
157
0
    else if (unicode < 0x10000u)
158
0
    {
159
0
      if (end - text >= 3)
160
0
      {
161
0
  *text++ =  0xE0u + (0x0Fu & (unicode >> 12));
162
0
  *text++ =  0x80u + (0x3Fu & (unicode >>  6));
163
0
  *text++ =  0x80u + (0x3Fu & (unicode      ));
164
0
      }
165
0
    }
166
0
    else
167
0
    {
168
0
      if (end - text >= 4)
169
0
      {
170
0
  *text++ =  0xF0u + (0x07u & (unicode >> 18));
171
0
  *text++ =  0x80u + (0x3Fu & (unicode >> 12));
172
0
  *text++ =  0x80u + (0x3Fu & (unicode >>  6));
173
0
  *text++ =  0x80u + (0x3Fu & (unicode      ));
174
0
      }
175
0
    }
176
0
    return text;
177
0
  }
178
};
179
180
181
template <typename TCodepoint>
182
struct hb_utf16_xe_t
183
{
184
  static_assert (sizeof (TCodepoint) == 2, "");
185
  typedef TCodepoint codepoint_t;
186
  static constexpr unsigned max_len = 2;
187
188
  static const codepoint_t *
189
  next (const codepoint_t *text,
190
  const codepoint_t *end,
191
  hb_codepoint_t *unicode,
192
  hb_codepoint_t replacement)
193
0
  {
194
0
    hb_codepoint_t c = *text++;
195
196
0
    if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
197
0
    {
198
0
      *unicode = c;
199
0
      return text;
200
0
    }
201
202
0
    if (likely (c <= 0xDBFFu && text < end))
203
0
    {
204
      /* High-surrogate in c */
205
0
      hb_codepoint_t l = *text;
206
0
      if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu)))
207
0
      {
208
  /* Low-surrogate in l */
209
0
  *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
210
0
   text++;
211
0
   return text;
212
0
      }
213
0
    }
214
215
    /* Lonely / out-of-order surrogate. */
216
0
    *unicode = replacement;
217
0
    return text;
218
0
  }
219
220
  static const codepoint_t *
221
  prev (const codepoint_t *text,
222
  const codepoint_t *start,
223
  hb_codepoint_t *unicode,
224
  hb_codepoint_t replacement)
225
0
  {
226
0
    hb_codepoint_t c = *--text;
227
228
0
    if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
229
0
    {
230
0
      *unicode = c;
231
0
      return text;
232
0
    }
233
234
0
    if (likely (c >= 0xDC00u && start < text))
235
0
    {
236
      /* Low-surrogate in c */
237
0
      hb_codepoint_t h = text[-1];
238
0
      if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu)))
239
0
      {
240
  /* High-surrogate in h */
241
0
  *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u);
242
0
  text--;
243
0
  return text;
244
0
      }
245
0
    }
246
247
    /* Lonely / out-of-order surrogate. */
248
0
    *unicode = replacement;
249
0
    return text;
250
0
  }
251
252
253
  static unsigned int
254
  strlen (const codepoint_t *text)
255
0
  {
256
0
    unsigned int l = 0;
257
0
    while (*text++) l++;
258
0
    return l;
259
0
  }
260
261
  static unsigned int
262
  encode_len (hb_codepoint_t unicode)
263
  {
264
    return unicode < 0x10000 ? 1 : 2;
265
  }
266
267
  static codepoint_t *
268
  encode (codepoint_t *text,
269
    const codepoint_t *end,
270
    hb_codepoint_t unicode)
271
  {
272
    if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
273
      unicode = 0xFFFDu;
274
    if (unicode < 0x10000u)
275
     *text++ = unicode;
276
    else if (end - text >= 2)
277
    {
278
      unicode -= 0x10000u;
279
      *text++ =  0xD800u + (unicode >> 10);
280
      *text++ =  0xDC00u + (unicode & 0x03FFu);
281
    }
282
    return text;
283
  }
284
};
285
286
/* UTF-16 stored in plain uint16_t code units. */
using hb_utf16_t = hb_utf16_xe_t<uint16_t>;
/* UTF-16 stored in OT::HBUINT16 code units. */
using hb_utf16_be_t = hb_utf16_xe_t<OT::HBUINT16>;
288
289
290
template <typename TCodepoint, bool validate=true>
291
struct hb_utf32_xe_t
292
{
293
  static_assert (sizeof (TCodepoint) == 4, "");
294
  typedef TCodepoint codepoint_t;
295
  static constexpr unsigned max_len = 1;
296
297
  static const TCodepoint *
298
  next (const TCodepoint *text,
299
  const TCodepoint *end HB_UNUSED,
300
  hb_codepoint_t *unicode,
301
  hb_codepoint_t replacement)
302
2.93M
  {
303
2.93M
    hb_codepoint_t c = *unicode = *text++;
304
2.93M
    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
305
0
      *unicode = replacement;
306
2.93M
    return text;
307
2.93M
  }
hb_utf32_xe_t<unsigned int, true>::next(unsigned int const*, unsigned int const*, unsigned int*, unsigned int)
Line
Count
Source
302
2.93M
  {
303
2.93M
    hb_codepoint_t c = *unicode = *text++;
304
2.93M
    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
305
0
      *unicode = replacement;
306
2.93M
    return text;
307
2.93M
  }
Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::next(unsigned int const*, unsigned int const*, unsigned int*, unsigned int)
308
309
  static const TCodepoint *
310
  prev (const TCodepoint *text,
311
  const TCodepoint *start HB_UNUSED,
312
  hb_codepoint_t *unicode,
313
  hb_codepoint_t replacement)
314
0
  {
315
0
    hb_codepoint_t c = *unicode = *--text;
316
0
    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
317
0
      *unicode = replacement;
318
0
    return text;
319
0
  }
Unexecuted instantiation: hb_utf32_xe_t<unsigned int, true>::prev(unsigned int const*, unsigned int const*, unsigned int*, unsigned int)
Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::prev(unsigned int const*, unsigned int const*, unsigned int*, unsigned int)
320
321
  static unsigned int
322
  strlen (const TCodepoint *text)
323
0
  {
324
0
    unsigned int l = 0;
325
0
    while (*text++) l++;
326
0
    return l;
327
0
  }
Unexecuted instantiation: hb_utf32_xe_t<unsigned int, true>::strlen(unsigned int const*)
Unexecuted instantiation: hb_utf32_xe_t<unsigned int, false>::strlen(unsigned int const*)
328
329
  static unsigned int
330
  encode_len (hb_codepoint_t unicode HB_UNUSED)
331
  {
332
    return 1;
333
  }
334
335
  static codepoint_t *
336
  encode (codepoint_t *text,
337
    const codepoint_t *end HB_UNUSED,
338
    hb_codepoint_t unicode)
339
  {
340
    if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
341
      unicode = 0xFFFDu;
342
    *text++ = unicode;
343
    return text;
344
  }
345
};
346
347
/* UTF-32 in uint32_t units, with validation (the template default). */
using hb_utf32_t = hb_utf32_xe_t<uint32_t>;
/* UTF-32 in uint32_t units, with validation switched off. */
using hb_utf32_novalidate_t = hb_utf32_xe_t<uint32_t, false>;
349
350
351
struct hb_latin1_t
352
{
353
  typedef uint8_t codepoint_t;
354
  static constexpr unsigned max_len = 1;
355
356
  static const codepoint_t *
357
  next (const codepoint_t *text,
358
  const codepoint_t *end HB_UNUSED,
359
  hb_codepoint_t *unicode,
360
  hb_codepoint_t replacement HB_UNUSED)
361
0
  {
362
0
    *unicode = *text++;
363
0
    return text;
364
0
  }
365
366
  static const codepoint_t *
367
  prev (const codepoint_t *text,
368
  const codepoint_t *start HB_UNUSED,
369
  hb_codepoint_t *unicode,
370
  hb_codepoint_t replacement HB_UNUSED)
371
0
  {
372
0
    *unicode = *--text;
373
0
    return text;
374
0
  }
375
376
  static unsigned int
377
  strlen (const codepoint_t *text)
378
0
  {
379
0
    unsigned int l = 0;
380
0
    while (*text++) l++;
381
0
    return l;
382
0
  }
383
384
  static unsigned int
385
  encode_len (hb_codepoint_t unicode HB_UNUSED)
386
0
  {
387
0
    return 1;
388
0
  }
389
390
  static codepoint_t *
391
  encode (codepoint_t *text,
392
    const codepoint_t *end HB_UNUSED,
393
    hb_codepoint_t unicode)
394
0
  {
395
0
    if (unlikely (unicode >= 0x0100u))
396
0
      unicode = '?';
397
0
    *text++ = unicode;
398
0
    return text;
399
0
  }
400
};
401
402
403
struct hb_ascii_t
404
{
405
  typedef uint8_t codepoint_t;
406
  static constexpr unsigned max_len = 1;
407
408
  static const codepoint_t *
409
  next (const codepoint_t *text,
410
  const codepoint_t *end HB_UNUSED,
411
  hb_codepoint_t *unicode,
412
  hb_codepoint_t replacement)
413
0
  {
414
0
    *unicode = *text++;
415
0
    if (*unicode >= 0x0080u)
416
0
      *unicode = replacement;
417
0
    return text;
418
0
  }
419
420
  static const codepoint_t *
421
  prev (const codepoint_t *text,
422
  const codepoint_t *start HB_UNUSED,
423
  hb_codepoint_t *unicode,
424
  hb_codepoint_t replacement)
425
0
  {
426
0
    *unicode = *--text;
427
0
    if (*unicode >= 0x0080u)
428
0
      *unicode = replacement;
429
0
    return text;
430
0
  }
431
432
  static unsigned int
433
  strlen (const codepoint_t *text)
434
0
  {
435
0
    unsigned int l = 0;
436
0
    while (*text++) l++;
437
0
    return l;
438
0
  }
439
440
  static unsigned int
441
  encode_len (hb_codepoint_t unicode HB_UNUSED)
442
0
  {
443
0
    return 1;
444
0
  }
445
446
  static codepoint_t *
447
  encode (codepoint_t *text,
448
    const codepoint_t *end HB_UNUSED,
449
    hb_codepoint_t unicode)
450
0
  {
451
0
    if (unlikely (unicode >= 0x0080u))
452
0
      unicode = '?';
453
0
    *text++ = unicode;
454
0
    return text;
455
0
  }
456
};
457
458
template <typename utf_t>
459
static inline const typename utf_t::codepoint_t *
460
hb_utf_offset_to_pointer (const typename utf_t::codepoint_t *start,
461
        signed offset)
462
{
463
  hb_codepoint_t unicode;
464
465
  while (offset-- > 0)
466
    start = utf_t::next (start,
467
       start + utf_t::max_len,
468
       &unicode,
469
       HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT);
470
471
  while (offset++ < 0)
472
    start = utf_t::prev (start,
473
       start - utf_t::max_len,
474
       &unicode,
475
       HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT);
476
477
  return start;
478
}
479
480
481
#endif /* HB_UTF_HH */