Coverage Report

Created: 2025-10-13 06:34

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/utf8proc/utf8proc.c
Line
Count
Source
1
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2
/*
3
 *  Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
4
 *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
5
 *
6
 *  Permission is hereby granted, free of charge, to any person obtaining a
7
 *  copy of this software and associated documentation files (the "Software"),
8
 *  to deal in the Software without restriction, including without limitation
9
 *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 *  and/or sell copies of the Software, and to permit persons to whom the
11
 *  Software is furnished to do so, subject to the following conditions:
12
 *
13
 *  The above copyright notice and this permission notice shall be included in
14
 *  all copies or substantial portions of the Software.
15
 *
16
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
 *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22
 *  DEALINGS IN THE SOFTWARE.
23
 */
24
25
/*
26
 *  This library contains derived data from a modified version of the
27
 *  Unicode data files.
28
 *
29
 *  The original data files are available at
30
 *  https://www.unicode.org/Public/UNIDATA/
31
 *
32
 *  Please notice the copyright statement in the file "utf8proc_data.c".
33
 */
34
35
36
/*
37
 *  File name:    utf8proc.c
38
 *
39
 *  Description:
40
 *  Implementation of libutf8proc.
41
 */
42
43
44
#include "utf8proc.h"
45
46
#ifndef SSIZE_MAX
47
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
48
#endif
49
#ifndef UINT16_MAX
50
#  define UINT16_MAX 65535U
51
#endif
52
53
#include "utf8proc_data.c"
54
55
56
UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
57
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
59
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
60
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
61
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
62
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
63
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
64
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
65
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
70
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
71
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
72
  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
73
74
55.5M
#define UTF8PROC_HANGUL_SBASE 0xAC00
75
12.3M
#define UTF8PROC_HANGUL_LBASE 0x1100
76
32.9k
#define UTF8PROC_HANGUL_VBASE 0x1161
77
26.3k
#define UTF8PROC_HANGUL_TBASE 0x11A7
78
570k
#define UTF8PROC_HANGUL_LCOUNT 19
79
26.4k
#define UTF8PROC_HANGUL_VCOUNT 21
80
98.2k
#define UTF8PROC_HANGUL_TCOUNT 28
81
30.3k
#define UTF8PROC_HANGUL_NCOUNT 588
82
15.9M
#define UTF8PROC_HANGUL_SCOUNT 11172
83
/* END is exclusive */
84
#define UTF8PROC_HANGUL_L_START  0x1100
85
#define UTF8PROC_HANGUL_L_END    0x115A
86
#define UTF8PROC_HANGUL_L_FILLER 0x115F
87
#define UTF8PROC_HANGUL_V_START  0x1160
88
#define UTF8PROC_HANGUL_V_END    0x11A3
89
#define UTF8PROC_HANGUL_T_START  0x11A8
90
#define UTF8PROC_HANGUL_T_END    0x11FA
91
#define UTF8PROC_HANGUL_S_START  0xAC00
92
#define UTF8PROC_HANGUL_S_END    0xD7A4
93
94
/* Should follow semantic-versioning rules (semver.org) based on API
95
   compatibility.  (Note that the shared-library version number will
96
   be different, being based on ABI compatibility.): */
97
0
#define STRINGIZEx(x) #x
98
0
#define STRINGIZE(x) STRINGIZEx(x)
99
0
UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
100
0
  return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
101
0
}
102
103
0
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
104
0
  return "17.0.0";
105
0
}
106
107
0
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
108
0
  switch (errcode) {
109
0
    case UTF8PROC_ERROR_NOMEM:
110
0
    return "Memory for processing UTF-8 data could not be allocated.";
111
0
    case UTF8PROC_ERROR_OVERFLOW:
112
0
    return "UTF-8 string is too long to be processed.";
113
0
    case UTF8PROC_ERROR_INVALIDUTF8:
114
0
    return "Invalid UTF-8 string";
115
0
    case UTF8PROC_ERROR_NOTASSIGNED:
116
0
    return "Unassigned Unicode code point found in UTF-8 string.";
117
0
    case UTF8PROC_ERROR_INVALIDOPTS:
118
0
    return "Invalid options for UTF-8 processing chosen.";
119
0
    default:
120
0
    return "An unknown error occurred while processing UTF-8 data.";
121
0
  }
122
0
}
123
124
13.9M
#define utf_cont(ch)  (((ch) & 0xc0) == 0x80)
125
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
126
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
127
16.9M
) {
128
16.9M
  utf8proc_int32_t uc;
129
16.9M
  const utf8proc_uint8_t *end;
130
131
16.9M
  *dst = -1;
132
16.9M
  if (!strlen) return 0;
133
16.9M
  end = str + ((strlen < 0) ? 4 : strlen);
134
16.9M
  uc = *str++;
135
16.9M
  if (uc < 0x80) {
136
12.2M
    *dst = uc;
137
12.2M
    return 1;
138
12.2M
  }
139
  // Must be between 0xc2 and 0xf4 inclusive to be valid
140
4.70M
  if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
141
4.70M
  if (uc < 0xe0) {         // 2-byte sequence
142
     // Must have valid continuation character
143
135k
     if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
144
135k
     *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
145
135k
     return 2;
146
135k
  }
147
4.57M
  if (uc < 0xf0) {        // 3-byte sequence
148
4.51M
     if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
149
567
        return UTF8PROC_ERROR_INVALIDUTF8;
150
     // Check for surrogate chars
151
4.51M
     if (uc == 0xed && *str > 0x9f)
152
27
         return UTF8PROC_ERROR_INVALIDUTF8;
153
4.51M
     uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
154
4.51M
     if (uc < 0x800)
155
63
         return UTF8PROC_ERROR_INVALIDUTF8;
156
4.51M
     *dst = uc;
157
4.51M
     return 3;
158
4.51M
  }
159
  // 4-byte sequence
160
  // Must have 3 valid continuation characters
161
56.6k
  if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
162
405
     return UTF8PROC_ERROR_INVALIDUTF8;
163
  // Make sure in correct range (0x10000 - 0x10ffff)
164
56.2k
  if (uc == 0xf0) {
165
34.4k
    if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
166
34.4k
  } else if (uc == 0xf4) {
167
5.66k
    if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
168
5.66k
  }
169
56.1k
  *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
170
56.1k
  return 4;
171
56.2k
}
172
173
1.02M
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
174
1.02M
    return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
175
1.02M
}
176
177
19.4M
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
178
19.4M
  if (uc < 0x00) {
179
0
    return 0;
180
19.4M
  } else if (uc < 0x80) {
181
7.09M
    dst[0] = (utf8proc_uint8_t) uc;
182
7.09M
    return 1;
183
12.3M
  } else if (uc < 0x800) {
184
11.2M
    dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
185
11.2M
    dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
186
11.2M
    return 2;
187
  // Note: we allow encoding 0xd800-0xdfff here, so as not to change
188
  // the API, however, these are actually invalid in UTF-8
189
11.2M
  } else if (uc < 0x10000) {
190
1.09M
    dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
191
1.09M
    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
192
1.09M
    dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
193
1.09M
    return 3;
194
1.09M
  } else if (uc < 0x110000) {
195
16.7k
    dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
196
16.7k
    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
197
16.7k
    dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
198
16.7k
    dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
199
16.7k
    return 4;
200
16.7k
  } else return 0;
201
19.4M
}
202
203
/* internal version used for inserting 0xff bytes between graphemes */
204
1.91M
static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
205
1.91M
   if (uc < 0x00) {
206
947k
      if (uc == -1) { /* internal value used for grapheme breaks */
207
947k
        dst[0] = (utf8proc_uint8_t)0xFF;
208
947k
        return 1;
209
947k
      }
210
0
      return 0;
211
963k
   } else if (uc < 0x80) {
212
692k
      dst[0] = (utf8proc_uint8_t)uc;
213
692k
      return 1;
214
692k
   } else if (uc < 0x800) {
215
6.61k
      dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
216
6.61k
      dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
217
6.61k
      return 2;
218
264k
   } else if (uc < 0x10000) {
219
262k
      dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
220
262k
      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
221
262k
      dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
222
262k
      return 3;
223
262k
   } else if (uc < 0x110000) {
224
2.36k
      dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
225
2.36k
      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
226
2.36k
      dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
227
2.36k
      dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
228
2.36k
      return 4;
229
2.36k
   } else return 0;
230
1.91M
}
231
232
/* internal "unsafe" version that does not check whether uc is in range */
233
117M
static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
234
  /* ASSERT: uc >= 0 && uc < 0x110000 */
235
117M
  return utf8proc_properties + (
236
117M
    utf8proc_stage2table[
237
117M
      utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
238
117M
    ]
239
117M
  );
240
117M
}
241
242
12.2M
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
243
12.2M
  return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
244
12.2M
}
245
246
/* return whether there is a grapheme break between boundclasses lbc and tbc
247
   (according to the definition of extended grapheme clusters)
248
249
  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
250
  http://www.unicode.org/reports/tr29/tr29-29.html
251
252
  CAVEATS:
253
   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
254
   and GB 12/13 (regional indicator code points) require knowledge of previous characters
255
   and are thus not handled by this function. This may result in an incorrect break before
256
   an E_Modifier class codepoint and an incorrectly missing break between two
257
   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
258
259
   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
260
*/
261
4.02M
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
262
4.02M
  return
263
4.02M
    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       // GB1
264
4.02M
    (lbc == UTF8PROC_BOUNDCLASS_CR &&                 // GB3
265
4.01M
     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         // ---
266
4.01M
    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4
267
3.98M
    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5
268
2.25M
    (lbc == UTF8PROC_BOUNDCLASS_L &&                  // GB6
269
7.39k
     (tbc == UTF8PROC_BOUNDCLASS_L ||                 // ---
270
6.99k
      tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
271
6.65k
      tbc == UTF8PROC_BOUNDCLASS_LV ||                // ---
272
2.21M
      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      // ---
273
2.21M
    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                // GB7
274
2.20M
      lbc == UTF8PROC_BOUNDCLASS_V) &&                // ---
275
4.11k
     (tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
276
2.20M
      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        // ---
277
2.20M
    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               // GB8
278
2.19M
      lbc == UTF8PROC_BOUNDCLASS_T) &&                // ---
279
2.20M
     tbc == UTF8PROC_BOUNDCLASS_T) ? false :          // ---
280
2.20M
    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             // GB9
281
2.18M
     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // ---
282
2.17M
     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a
283
2.20M
     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b
284
2.20M
    (lbc == UTF8PROC_BOUNDCLASS_E_ZWG &&              // GB11 (requires additional handling below)
285
2.17M
     tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
286
2.17M
    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below)
287
2.16M
     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ----
288
2.16M
    true; // GB999
289
4.02M
}
290
291
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
292
4.02M
{
293
4.02M
  if (state) {
294
3.00M
    int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
295
3.00M
    if (*state == 0) { /* state initialization */
296
5.34k
      state_bc = lbc;
297
5.34k
      state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
298
5.34k
    }
299
2.99M
    else { /* lbc and licb are already encoded in *state */
300
2.99M
      state_bc = *state & 0xff;  // 1st byte of state is bound class
301
2.99M
      state_icb = *state >> 8;   // 2nd byte of state is indic conjunct break
302
2.99M
    }
303
304
3.00M
    utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
305
2.94M
       !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
306
1.74k
        && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c
307
308
    // Special support for GB9c.  Don't break between two consonants
309
    // separated 1+ linker characters and 0+ extend characters in any order.
310
    // After a consonant, we enter LINKER state after at least one linker.
311
3.00M
    if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
312
2.99M
        || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
313
2.99M
        || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
314
6.87k
      state_icb = ticb;
315
2.99M
    else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
316
2.59k
      state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
317
1.73k
                  UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
318
319
    // Special support for GB 12/13 made possible by GB999. After two RI
320
    // class codepoints we want to force a break. Do this by resetting the
321
    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
322
    // after that character according to GB999 (unless of course such a break is
323
    // forbidden by a different rule such as GB9).
324
3.00M
    if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
325
1.26k
      state_bc = UTF8PROC_BOUNDCLASS_OTHER;
326
    // Special support for GB11 (emoji extend* zwj / emoji)
327
3.00M
    else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
328
6.40k
      if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
329
1.72k
        state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
330
4.68k
      else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
331
2.27k
        state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
332
2.40k
      else
333
2.40k
        state_bc = tbc;
334
6.40k
    }
335
2.99M
    else
336
2.99M
      state_bc = tbc;
337
338
3.00M
    *state = state_bc + (state_icb << 8);
339
3.00M
    return break_permitted;
340
3.00M
  }
341
1.02M
  else
342
1.02M
    return grapheme_break_simple(lbc, tbc);
343
4.02M
}
344
345
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
346
2.04M
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
347
348
2.04M
  const utf8proc_property_t *p1 = utf8proc_get_property(c1);
349
2.04M
  const utf8proc_property_t *p2 = utf8proc_get_property(c2);
350
2.04M
  return grapheme_break_extended(p1->boundclass,
351
2.04M
                                 p2->boundclass,
352
2.04M
                                 p1->indic_conjunct_break,
353
2.04M
                                 p2->indic_conjunct_break,
354
2.04M
                                 state);
355
2.04M
}
356
357
358
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
359
1.02M
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
360
1.02M
  return utf8proc_grapheme_break_stateful(c1, c2, NULL);
361
1.02M
}
362
363
static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
364
27.4M
{
365
27.4M
  utf8proc_int32_t entry_cp = **entry;
366
27.4M
  if ((entry_cp & 0xF800) == 0xD800) {
367
6.67k
    *entry = *entry + 1;
368
6.67k
    entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
369
6.67k
    entry_cp += 0x10000;
370
6.67k
  }
371
27.4M
  return entry_cp;
372
27.4M
}
373
374
static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
375
173k
{
376
173k
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
377
173k
  return seqindex_decode_entry(&entry);
378
173k
}
379
380
1.69M
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
381
1.69M
  utf8proc_ssize_t written = 0;
382
1.69M
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF];
383
1.69M
  int len = seqindex >> 14;
384
1.69M
  if (len >= 3) {
385
1.50M
    len = *entry;
386
1.50M
    entry++;
387
1.50M
  }
388
28.9M
  for (; len >= 0; entry++, len--) {
389
27.2M
    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
390
391
27.2M
    written += utf8proc_decompose_char(entry_cp, dst ? dst+written : dst,
392
27.2M
      (bufsize > written) ? (bufsize - written) : 0, options,
393
27.2M
    last_boundclass);
394
27.2M
    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
395
27.2M
  }
396
1.69M
  return written;
397
1.69M
}
398
399
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
400
1.02M
{
401
1.02M
  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
402
1.02M
  return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c;
403
1.02M
}
404
405
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
406
1.02M
{
407
1.02M
  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
408
1.02M
  return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
409
1.02M
}
410
411
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
412
1.02M
{
413
1.02M
  utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
414
1.02M
  return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
415
1.02M
}
416
417
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
418
1.02M
{
419
1.02M
  const utf8proc_property_t *p = utf8proc_get_property(c);
420
1.02M
  return p->lowercase_seqindex != p->uppercase_seqindex && p->lowercase_seqindex == UINT16_MAX;
421
1.02M
}
422
423
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
424
1.02M
{
425
1.02M
  const utf8proc_property_t *p = utf8proc_get_property(c);
426
1.02M
  return p->lowercase_seqindex != p->uppercase_seqindex && p->uppercase_seqindex == UINT16_MAX && p->category != UTF8PROC_CATEGORY_LT;
427
1.02M
}
428
429
/* return a character width analogous to wcwidth (except portable and
430
   hopefully less buggy than most system wcwidth functions). */
431
1.02M
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
432
1.02M
  return utf8proc_get_property(c)->charwidth;
433
1.02M
}
434
435
0
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t c) {
436
0
  return utf8proc_get_property(c)->ambiguous_width;
437
0
}
438
439
2.04M
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
440
2.04M
  return (utf8proc_category_t) utf8proc_get_property(c)->category;
441
2.04M
}
442
443
1.02M
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
444
1.02M
  static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
445
1.02M
  return s[utf8proc_category(c)];
446
1.02M
}
447
448
#define utf8proc_decompose_lump(replacement_uc) \
449
93.8k
  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
450
93.8k
  (utf8proc_option_t)(options & ~(unsigned int)UTF8PROC_LUMP), last_boundclass)
451
452
43.2M
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
453
43.2M
  const utf8proc_property_t *property;
454
43.2M
  utf8proc_propval_t category;
455
43.2M
  utf8proc_int32_t hangul_sindex;
456
43.2M
  if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
457
43.2M
  property = unsafe_get_property(uc);
458
43.2M
  category = property->category;
459
43.2M
  hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
460
43.2M
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
461
39.1M
    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
462
37.5k
      utf8proc_int32_t hangul_tindex;
463
37.5k
      if (bufsize >= 1) {
464
15.1k
        dst[0] = UTF8PROC_HANGUL_LBASE +
465
15.1k
          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
466
15.1k
        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
467
15.1k
          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
468
15.1k
      }
469
37.5k
      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
470
37.5k
      if (!hangul_tindex) return 2;
471
31.3k
      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
472
31.3k
      return 3;
473
37.5k
    }
474
39.1M
  }
475
43.1M
  if (options & UTF8PROC_REJECTNA) {
476
0
    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
477
0
  }
478
43.1M
  if (options & UTF8PROC_IGNORE) {
479
11.1M
    if (property->ignorable) return 0;
480
11.1M
  }
481
43.1M
  if (options & UTF8PROC_STRIPNA) {
482
1.98M
    if (!category) return 0;
483
1.98M
  }
484
43.1M
  if (options & UTF8PROC_LUMP) {
485
1.98M
    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
486
1.98M
    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
487
776
      utf8proc_decompose_lump(0x0027);
488
1.98M
    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
489
64.6k
      utf8proc_decompose_lump(0x002D);
490
1.91M
    if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
491
1.91M
    if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
492
1.91M
    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
493
582
      utf8proc_decompose_lump(0x003C);
494
1.91M
    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
495
976
      utf8proc_decompose_lump(0x003E);
496
1.91M
    if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
497
1.91M
    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
498
1.06k
      utf8proc_decompose_lump(0x005E);
499
1.91M
    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
500
22.7k
      utf8proc_decompose_lump(0x005F);
501
1.89M
    if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
502
1.89M
    if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
503
1.89M
    if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
504
1.89M
    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
505
1.89M
      if (category == UTF8PROC_CATEGORY_ZL ||
506
1.89M
          category == UTF8PROC_CATEGORY_ZP)
507
431
        utf8proc_decompose_lump(0x000A);
508
1.89M
    }
509
1.89M
  }
510
43.0M
  if (options & UTF8PROC_STRIPMARK) {
511
1.98M
    if (category == UTF8PROC_CATEGORY_MN ||
512
1.97M
      category == UTF8PROC_CATEGORY_MC ||
513
1.96M
      category == UTF8PROC_CATEGORY_ME) return 0;
514
1.98M
  }
515
43.0M
  if (options & UTF8PROC_CASEFOLD) {
516
11.1M
    if (property->casefold_seqindex != UINT16_MAX) {
517
169k
      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
518
169k
    }
519
11.1M
  }
520
42.8M
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
521
38.9M
    if (property->decomp_seqindex != UINT16_MAX &&
522
3.03M
        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
523
1.53M
      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
524
1.53M
    }
525
38.9M
  }
526
41.3M
  if (options & UTF8PROC_CHARBOUND) {
527
1.98M
    utf8proc_bool boundary;
528
1.98M
    boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
529
1.98M
                                       last_boundclass);
530
1.98M
    if (boundary) {
531
1.94M
      if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
532
1.94M
      if (bufsize >= 2) dst[1] = uc;
533
1.94M
      return 2;
534
1.94M
    }
535
1.98M
  }
536
39.4M
  if (bufsize >= 1) *dst = uc;
537
39.4M
  return 1;
538
41.3M
}
539
540
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
541
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
542
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
543
0
) {
544
0
    return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
545
0
}
546
547
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
548
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
549
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
550
  utf8proc_custom_func custom_func, void *custom_data
551
29.0k
) {
552
  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
553
29.0k
  utf8proc_ssize_t wpos = 0;
554
29.0k
  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
555
0
    return UTF8PROC_ERROR_INVALIDOPTS;
556
29.0k
  if ((options & UTF8PROC_STRIPMARK) &&
557
3.62k
      !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
558
0
    return UTF8PROC_ERROR_INVALIDOPTS;
559
29.0k
  {
560
29.0k
    utf8proc_int32_t uc;
561
29.0k
    utf8proc_ssize_t rpos = 0;
562
29.0k
    utf8proc_ssize_t decomp_result;
563
29.0k
    int boundclass = UTF8PROC_BOUNDCLASS_START;
564
15.9M
    while (1) {
565
15.9M
      if (options & UTF8PROC_NULLTERM) {
566
9.93M
        rpos += utf8proc_iterate(str + rpos, -1, &uc);
567
        /* checking of return value is not necessary,
568
           as 'uc' is < 0 in case of error */
569
9.93M
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
570
9.93M
        if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
571
9.93M
        if (uc == 0) break;
572
9.93M
      } else {
573
5.96M
        if (rpos >= strlen) break;
574
5.95M
        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
575
5.95M
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
576
5.95M
      }
577
15.8M
      if (custom_func != NULL) {
578
0
        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
579
0
      }
580
15.8M
      decomp_result = utf8proc_decompose_char(
581
15.8M
        uc, buffer ? buffer+wpos : buffer, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
582
15.8M
        &boundclass
583
15.8M
      );
584
15.8M
      if (decomp_result < 0) return decomp_result;
585
15.8M
      wpos += decomp_result;
586
      /* prohibiting integer overflows due to too long strings: */
587
15.8M
      if (wpos < 0 ||
588
15.8M
          wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
589
0
        return UTF8PROC_ERROR_OVERFLOW;
590
15.8M
    }
591
29.0k
  }
592
26.4k
  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
593
9.97k
    utf8proc_ssize_t pos = 0;
594
18.6M
    while (pos < wpos-1) {
595
18.6M
      utf8proc_int32_t uc1, uc2;
596
18.6M
      const utf8proc_property_t *property1, *property2;
597
18.6M
      uc1 = buffer[pos];
598
18.6M
      uc2 = buffer[pos+1];
599
18.6M
      property1 = unsafe_get_property(uc1);
600
18.6M
      property2 = unsafe_get_property(uc2);
601
18.6M
      if (property1->combining_class > property2->combining_class &&
602
49.1k
          property2->combining_class > 0) {
603
39.5k
        buffer[pos] = uc2;
604
39.5k
        buffer[pos+1] = uc1;
605
39.5k
        if (pos > 0) pos--; else pos++;
606
18.5M
      } else {
607
18.5M
        pos++;
608
18.5M
      }
609
18.6M
    }
610
9.97k
  }
611
26.4k
  return wpos;
612
29.0k
}
613
614
14.9k
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
615
  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
616
14.9k
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
617
2.79k
    utf8proc_ssize_t rpos;
618
2.79k
    utf8proc_ssize_t wpos = 0;
619
2.79k
    utf8proc_int32_t uc;
620
959k
    for (rpos = 0; rpos < length; rpos++) {
621
956k
      uc = buffer[rpos];
622
956k
      if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
623
956k
      if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
624
564k
          ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
625
391k
        if (options & UTF8PROC_NLF2LS) {
626
391k
          if (options & UTF8PROC_NLF2PS) {
627
391k
            buffer[wpos++] = 0x000A;
628
391k
          } else {
629
0
            buffer[wpos++] = 0x2028;
630
0
          }
631
391k
        } else {
632
0
          if (options & UTF8PROC_NLF2PS) {
633
0
            buffer[wpos++] = 0x2029;
634
0
          } else {
635
0
            buffer[wpos++] = 0x0020;
636
0
          }
637
0
        }
638
564k
      } else if ((options & UTF8PROC_STRIPCC) &&
639
0
          (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
640
0
        if (uc == 0x0009) buffer[wpos++] = 0x0020;
641
564k
      } else {
642
564k
        buffer[wpos++] = uc;
643
564k
      }
644
956k
    }
645
2.79k
    length = wpos;
646
2.79k
  }
647
14.9k
  if (options & UTF8PROC_COMPOSE) {
648
6.60k
    utf8proc_int32_t *starter = NULL;
649
6.60k
    const utf8proc_property_t *starter_property = NULL;
650
6.60k
    utf8proc_propval_t max_combining_class = -1;
651
6.60k
    utf8proc_ssize_t rpos;
652
6.60k
    utf8proc_ssize_t wpos = 0;
653
12.3M
    for (rpos = 0; rpos < length; rpos++) {
654
12.3M
      utf8proc_int32_t current_char = buffer[rpos];
655
12.3M
      const utf8proc_property_t *current_property = unsafe_get_property(current_char);
656
12.3M
      if (starter && current_property->combining_class > max_combining_class) {
657
        /* combination perhaps possible */
658
12.3M
        utf8proc_int32_t hangul_lindex;
659
12.3M
        utf8proc_int32_t hangul_sindex;
660
12.3M
        hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
661
12.3M
        if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
662
17.8k
          utf8proc_int32_t hangul_vindex;
663
17.8k
          hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
664
17.8k
          if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
665
12.9k
            *starter = UTF8PROC_HANGUL_SBASE +
666
12.9k
              (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
667
12.9k
              UTF8PROC_HANGUL_TCOUNT;
668
12.9k
            starter_property = NULL;
669
12.9k
            continue;
670
12.9k
          }
671
17.8k
        }
672
12.3M
        hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
673
12.3M
        if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
674
22.4k
            (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
675
13.0k
          utf8proc_int32_t hangul_tindex;
676
13.0k
          hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
677
13.0k
          if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
678
9.60k
            *starter += hangul_tindex;
679
9.60k
            starter_property = NULL;
680
9.60k
            continue;
681
9.60k
          }
682
13.0k
        }
683
12.3M
        if (!starter_property) {
684
12.3M
          starter_property = unsafe_get_property(*starter);
685
12.3M
        }
686
12.3M
        int idx = starter_property->comb_index;
687
12.3M
        if (idx < 0x3FF && current_property->comb_issecond) {
688
4.42k
          int len = starter_property->comb_length;
689
4.42k
          utf8proc_int32_t max_second = utf8proc_combinations_second[idx + len - 1];
690
4.42k
          if (current_char <= max_second) {
691
3.66k
            int off;
692
            // TODO: binary search? arithmetic search?
693
15.1k
            for (off = 0; off < len; ++off) {
694
15.1k
              utf8proc_int32_t second = utf8proc_combinations_second[idx + off];
695
15.1k
              if (current_char < second) {
696
                /* not found */
697
835
                break;
698
835
              }
699
14.3k
              if (current_char == second) {
700
                /* found */
701
2.82k
                utf8proc_int32_t composition = utf8proc_combinations_combined[idx + off];
702
2.82k
                *starter = composition;
703
2.82k
                starter_property = NULL;
704
2.82k
                break;
705
2.82k
              }
706
14.3k
            }
707
3.66k
            if (starter_property == NULL) {
708
              /* found */
709
2.82k
              continue;
710
2.82k
            }
711
3.66k
          }
712
4.42k
        }
713
12.3M
      }
714
12.3M
      buffer[wpos] = current_char;
715
12.3M
      if (current_property->combining_class) {
716
13.3k
        if (current_property->combining_class > max_combining_class) {
717
7.54k
          max_combining_class = current_property->combining_class;
718
7.54k
        }
719
12.3M
      } else {
720
12.3M
        starter = buffer + wpos;
721
12.3M
        starter_property = NULL;
722
12.3M
        max_combining_class = -1;
723
12.3M
      }
724
12.3M
      wpos++;
725
12.3M
    }
726
6.60k
    length = wpos;
727
6.60k
  }
728
14.9k
  return length;
729
14.9k
}
730
731
13.2k
/*
 *  Normalizes the 'length' code points stored in 'buffer' by calling
 *  utf8proc_normalize_utf32() and then re-encodes them in place as bytes,
 *  writing from the start of the very same storage.
 *
 *  Each code point is encoded with charbound_encode_char() when
 *  UTF8PROC_CHARBOUND is set in 'options' and with utf8proc_encode_char()
 *  otherwise. A zero byte is stored right after the last encoded byte, so
 *  'buffer' must have one spare byte of free space at the end.
 *  The UTF8PROC_NULLTERM option is ignored here; 'length' is never ignored.
 *
 *  Returns the number of bytes written (the trailing zero byte is not
 *  counted), or the negative value reported by utf8proc_normalize_utf32().
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  /* byte view of the code point buffer; encoded output overwrites the input */
  utf8proc_uint8_t *out = (utf8proc_uint8_t *)buffer;
  utf8proc_ssize_t nbytes = 0;
  utf8proc_ssize_t i = 0;
  utf8proc_ssize_t count;

  count = utf8proc_normalize_utf32(buffer, length, options);
  if (count < 0) return count;

  /* NOTE(review): in-place encoding presumably relies on an encoded code
     point never taking more bytes than one buffer element - confirm against
     the two encoders. Each code point is read before its bytes are written. */
  if (options & UTF8PROC_CHARBOUND) {
    while (i < count) {
      utf8proc_int32_t cp = buffer[i++];
      nbytes += charbound_encode_char(cp, out + nbytes);
    }
  } else {
    while (i < count) {
      utf8proc_int32_t cp = buffer[i++];
      nbytes += utf8proc_encode_char(cp, out + nbytes);
    }
  }

  out[nbytes] = 0;
  return nbytes;
}
754
755
/*
 *  Convenience wrapper around utf8proc_map_custom() that supplies no custom
 *  mapping function: both the callback and its data pointer are NULL.
 *  All other arguments and the return value are passed through unchanged.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
  utf8proc_ssize_t outcome;

  outcome = utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
  return outcome;
}
760
761
/*
 *  Maps the string 'str' according to 'options' and stores a pointer to a
 *  freshly allocated, zero-terminated result in '*dstptr'.
 *
 *  The work is done in three steps:
 *    1. utf8proc_decompose_custom() is called without an output buffer to
 *       obtain the number of code points required;
 *    2. a buffer of that many code points plus one spare byte is allocated
 *       with malloc() and filled by a second utf8proc_decompose_custom() call;
 *    3. utf8proc_reencode() turns the code points into bytes in place, and
 *       the allocation is shrunk to the encoded size plus the terminator.
 *
 *  'custom_func' and 'custom_data' are forwarded untouched to
 *  utf8proc_decompose_custom().
 *
 *  Returns the byte count reported by utf8proc_reencode() on success. On
 *  failure a negative error code is returned (UTF8PROC_ERROR_NOMEM if the
 *  allocation fails, otherwise whatever the failing step reported), the
 *  buffer is released and '*dstptr' is left as NULL.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  utf8proc_int32_t *work;
  utf8proc_int32_t *shrunk;
  utf8proc_ssize_t count;

  *dstptr = NULL;

  /* sizing pass: no output buffer, only the required length is computed */
  count = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
  if (count < 0) return count;

  /* the extra byte is the spare room utf8proc_reencode() needs for its terminator */
  work = (utf8proc_int32_t *) malloc(((utf8proc_size_t)count) * sizeof(utf8proc_int32_t) + 1);
  if (!work) return UTF8PROC_ERROR_NOMEM;

  /* filling pass, followed by in-place re-encoding if it succeeded */
  count = utf8proc_decompose_custom(str, strlen, work, count, options, custom_func, custom_data);
  if (count >= 0) {
    count = utf8proc_reencode(work, count, options);
  }
  if (count < 0) {
    free(work);
    return count;
  }

  /* give back the unused tail; if shrinking fails the original block is kept */
  shrunk = (utf8proc_int32_t *) realloc(work, (size_t)count+1);
  if (shrunk) work = shrunk;

  *dstptr = (utf8proc_uint8_t *)work;
  return count;
}
790
791
1.97k
/*
 *  Runs utf8proc_map() on the zero-terminated string 'str' with the options
 *  UTF8PROC_NULLTERM, UTF8PROC_STABLE and UTF8PROC_DECOMPOSE.
 *
 *  Returns the newly allocated result string, or NULL if the mapping failed
 *  (the error code itself is discarded).
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
  const utf8proc_option_t opts = (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_DECOMPOSE);
  utf8proc_uint8_t *mapped = NULL;

  /* length argument 0: the input length is not supplied, NULLTERM is set */
  utf8proc_map(str, 0, &mapped, opts);
  return mapped;
}
797
798
1.97k
/*
 *  Runs utf8proc_map() on the zero-terminated string 'str' with the options
 *  UTF8PROC_NULLTERM, UTF8PROC_STABLE and UTF8PROC_COMPOSE.
 *
 *  Returns the newly allocated result string, or NULL if the mapping failed
 *  (the error code itself is discarded).
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
  const utf8proc_option_t opts = (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE);
  utf8proc_uint8_t *mapped = NULL;

  /* length argument 0: the input length is not supplied, NULLTERM is set */
  utf8proc_map(str, 0, &mapped, opts);
  return mapped;
}
804
805
1.97k
/*
 *  Runs utf8proc_map() on the zero-terminated string 'str' with the options
 *  UTF8PROC_NULLTERM, UTF8PROC_STABLE, UTF8PROC_DECOMPOSE and UTF8PROC_COMPAT.
 *
 *  Returns the newly allocated result string, or NULL if the mapping failed
 *  (the error code itself is discarded).
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
  const utf8proc_option_t opts = (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
  utf8proc_uint8_t *mapped = NULL;

  /* length argument 0: the input length is not supplied, NULLTERM is set */
  utf8proc_map(str, 0, &mapped, opts);
  return mapped;
}
811
812
1.97k
/*
 *  Runs utf8proc_map() on the zero-terminated string 'str' with the options
 *  UTF8PROC_NULLTERM, UTF8PROC_STABLE, UTF8PROC_COMPOSE and UTF8PROC_COMPAT.
 *
 *  Returns the newly allocated result string, or NULL if the mapping failed
 *  (the error code itself is discarded).
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
  const utf8proc_option_t opts = (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  utf8proc_uint8_t *mapped = NULL;

  /* length argument 0: the input length is not supplied, NULLTERM is set */
  utf8proc_map(str, 0, &mapped, opts);
  return mapped;
}
818
819
1.97k
/*
 *  Runs utf8proc_map() on the zero-terminated string 'str' with the options
 *  UTF8PROC_NULLTERM, UTF8PROC_STABLE, UTF8PROC_COMPOSE, UTF8PROC_COMPAT,
 *  UTF8PROC_CASEFOLD and UTF8PROC_IGNORE.
 *
 *  Returns the newly allocated result string, or NULL if the mapping failed
 *  (the error code itself is discarded).
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
  const utf8proc_option_t opts = (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
  utf8proc_uint8_t *mapped = NULL;

  /* length argument 0: the input length is not supplied, NULLTERM is set */
  utf8proc_map(str, 0, &mapped, opts);
  return mapped;
}