Coverage Report

Created: 2025-08-26 06:54

/src/utf8proc/utf8proc.c
Line
Count
Source (jump to first uncovered line)
1
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2
/*
3
 *  Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
4
 *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
5
 *
6
 *  Permission is hereby granted, free of charge, to any person obtaining a
7
 *  copy of this software and associated documentation files (the "Software"),
8
 *  to deal in the Software without restriction, including without limitation
9
 *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 *  and/or sell copies of the Software, and to permit persons to whom the
11
 *  Software is furnished to do so, subject to the following conditions:
12
 *
13
 *  The above copyright notice and this permission notice shall be included in
14
 *  all copies or substantial portions of the Software.
15
 *
16
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
 *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22
 *  DEALINGS IN THE SOFTWARE.
23
 */
24
25
/*
26
 *  This library contains derived data from a modified version of the
27
 *  Unicode data files.
28
 *
29
 *  The original data files are available at
30
 *  https://www.unicode.org/Public/UNIDATA/
31
 *
32
 *  Please notice the copyright statement in the file "utf8proc_data.c".
33
 */
34
35
36
/*
37
 *  File name:    utf8proc.c
38
 *
39
 *  Description:
40
 *  Implementation of libutf8proc.
41
 */
42
43
44
#include "utf8proc.h"
45
46
#ifndef SSIZE_MAX
47
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
48
#endif
49
#ifndef UINT16_MAX
50
#  define UINT16_MAX 65535U
51
#endif
52
53
#include "utf8proc_data.c"
54
55
56
/* Lookup table indexed by byte value.  Entries are 1 for 0x00-0x7F,
   2 for 0xC0-0xDF, 3 for 0xE0-0xEF, 4 for 0xF0-0xF7 and 0 for every
   other byte (0x80-0xBF and 0xF8-0xFF). */
UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
73
74
60.6M
/* Base code points used by the arithmetic Hangul (de)composition below. */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7

/* Counts used by the same arithmetic; NCOUNT and SCOUNT are derived
   (21 * 28 = 588 and 19 * 588 = 11172). */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
#define UTF8PROC_HANGUL_SCOUNT (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/* Code point ranges; every *_END value is exclusive. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
93
94
/* Should follow semantic-versioning rules (semver.org) based on API
95
   compatibility.  (Note that the shared-library version number will
96
   be different, being based on ABI compatibility.): */
97
0
#define STRINGIZEx(x) #x
98
0
#define STRINGIZE(x) STRINGIZEx(x)
99
0
UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
100
0
  return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
101
0
}
102
103
0
/* Returns the Unicode version string this build reports ("17.0.0"). */
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
  static const char unicode_version[] = "17.0.0";
  return unicode_version;
}
106
107
0
/* Maps a UTF8PROC_ERROR_* code to a static, human-readable message.
   Any code not listed below yields a generic "unknown error" text. */
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
  const char *msg;

  switch (errcode) {
    case UTF8PROC_ERROR_NOMEM:
      msg = "Memory for processing UTF-8 data could not be allocated.";
      break;
    case UTF8PROC_ERROR_OVERFLOW:
      msg = "UTF-8 string is too long to be processed.";
      break;
    case UTF8PROC_ERROR_INVALIDUTF8:
      msg = "Invalid UTF-8 string";
      break;
    case UTF8PROC_ERROR_NOTASSIGNED:
      msg = "Unassigned Unicode code point found in UTF-8 string.";
      break;
    case UTF8PROC_ERROR_INVALIDOPTS:
      msg = "Invalid options for UTF-8 processing chosen.";
      break;
    default:
      msg = "An unknown error occurred while processing UTF-8 data.";
      break;
  }
  return msg;
}
123
124
16.1M
#define utf_cont(ch)  (((ch) & 0xc0) == 0x80)
125
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
126
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
127
16.5M
) {
128
16.5M
  utf8proc_int32_t uc;
129
16.5M
  const utf8proc_uint8_t *end;
130
131
16.5M
  *dst = -1;
132
16.5M
  if (!strlen) return 0;
133
16.5M
  end = str + ((strlen < 0) ? 4 : strlen);
134
16.5M
  uc = *str++;
135
16.5M
  if (uc < 0x80) {
136
10.9M
    *dst = uc;
137
10.9M
    return 1;
138
10.9M
  }
139
  // Must be between 0xc2 and 0xf4 inclusive to be valid
140
5.53M
  if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
141
5.53M
  if (uc < 0xe0) {         // 2-byte sequence
142
     // Must have valid continuation character
143
281k
     if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
144
280k
     *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
145
280k
     return 2;
146
281k
  }
147
5.24M
  if (uc < 0xf0) {        // 3-byte sequence
148
5.19M
     if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
149
513
        return UTF8PROC_ERROR_INVALIDUTF8;
150
     // Check for surrogate chars
151
5.19M
     if (uc == 0xed && *str > 0x9f)
152
45
         return UTF8PROC_ERROR_INVALIDUTF8;
153
5.19M
     uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
154
5.19M
     if (uc < 0x800)
155
117
         return UTF8PROC_ERROR_INVALIDUTF8;
156
5.19M
     *dst = uc;
157
5.19M
     return 3;
158
5.19M
  }
159
  // 4-byte sequence
160
  // Must have 3 valid continuation characters
161
57.9k
  if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
162
270
     return UTF8PROC_ERROR_INVALIDUTF8;
163
  // Make sure in correct range (0x10000 - 0x10ffff)
164
57.6k
  if (uc == 0xf0) {
165
34.3k
    if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
166
34.3k
  } else if (uc == 0xf4) {
167
6.24k
    if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
168
6.24k
  }
169
57.5k
  *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
170
57.5k
  return 4;
171
57.6k
}
172
173
1.01M
/* Returns nonzero iff uc lies in 0..0x10FFFF and is not a surrogate
   (0xD800..0xDFFF).  Negative inputs become large unsigned values and
   are therefore rejected by the upper-bound test. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
    const utf8proc_uint32_t u = (utf8proc_uint32_t)uc;
    return (u < 0xd800 || u > 0xdfff) && (u < 0x110000);
}
176
177
21.1M
/* Encodes code point uc as UTF-8 into dst and returns the number of bytes
   written (1-4).  Returns 0 and writes nothing when uc is negative or
   >= 0x110000.  dst must have room for the bytes written. */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000)
    return 0;

  if (uc < 0x80) {
    dst[0] = (utf8proc_uint8_t) uc;
    return 1;
  }

  if (uc < 0x800) {
    dst[0] = (utf8proc_uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (utf8proc_uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }

  // Note: we allow encoding 0xd800-0xdfff here, so as not to change
  // the API, however, these are actually invalid in UTF-8
  if (uc < 0x10000) {
    dst[0] = (utf8proc_uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (utf8proc_uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (utf8proc_uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }

  dst[0] = (utf8proc_uint8_t)(0xF0 | (uc >> 18));
  dst[1] = (utf8proc_uint8_t)(0x80 | ((uc >> 12) & 0x3F));
  dst[2] = (utf8proc_uint8_t)(0x80 | ((uc >> 6) & 0x3F));
  dst[3] = (utf8proc_uint8_t)(0x80 | (uc & 0x3F));
  return 4;
}
202
203
/* internal version used for inserting 0xff bytes between graphemes */
204
1.82M
static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
205
1.82M
   if (uc < 0x00) {
206
900k
      if (uc == -1) { /* internal value used for grapheme breaks */
207
900k
        dst[0] = (utf8proc_uint8_t)0xFF;
208
900k
        return 1;
209
900k
      }
210
0
      return 0;
211
920k
   } else if (uc < 0x80) {
212
610k
      dst[0] = (utf8proc_uint8_t)uc;
213
610k
      return 1;
214
610k
   } else if (uc < 0x800) {
215
6.00k
      dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
216
6.00k
      dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
217
6.00k
      return 2;
218
303k
   } else if (uc < 0x10000) {
219
300k
      dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
220
300k
      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
221
300k
      dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
222
300k
      return 3;
223
300k
   } else if (uc < 0x110000) {
224
2.66k
      dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
225
2.66k
      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
226
2.66k
      dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
227
2.66k
      dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
228
2.66k
      return 4;
229
2.66k
   } else return 0;
230
1.82M
}
231
232
/* internal "unsafe" version that does not check whether uc is in range */
233
126M
static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
234
  /* ASSERT: uc >= 0 && uc < 0x110000 */
235
126M
  return utf8proc_properties + (
236
126M
    utf8proc_stage2table[
237
126M
      utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
238
126M
    ]
239
126M
  );
240
126M
}
241
242
12.1M
/* Range-checked property lookup.  Values outside 0..0x10FFFF yield the
   first entry of utf8proc_properties instead of indexing the tables. */
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
  if (uc < 0 || uc >= 0x110000)
    return utf8proc_properties;
  return unsafe_get_property(uc);
}
245
246
/* return whether there is a grapheme break between boundclasses lbc and tbc
247
   (according to the definition of extended grapheme clusters)
248
249
  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
250
  http://www.unicode.org/reports/tr29/tr29-29.html
251
252
  CAVEATS:
253
   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
254
   and GB 12/13 (regional indicator code points) require knowledge of previous characters
255
   and are thus not handled by this function. This may result in an incorrect break before
256
   an E_Modifier class codepoint and an incorrectly missing break between two
257
   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
258
259
   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
260
*/
261
3.96M
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
262
3.96M
  return
263
3.96M
    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       // GB1
264
3.96M
    (lbc == UTF8PROC_BOUNDCLASS_CR &&                 // GB3
265
3.96M
     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         // ---
266
3.96M
    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4
267
3.91M
    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5
268
2.68M
    (lbc == UTF8PROC_BOUNDCLASS_L &&                  // GB6
269
2.63M
     (tbc == UTF8PROC_BOUNDCLASS_L ||                 // ---
270
6.91k
      tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
271
6.91k
      tbc == UTF8PROC_BOUNDCLASS_LV ||                // ---
272
6.91k
      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      // ---
273
2.63M
    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                // GB7
274
2.63M
      lbc == UTF8PROC_BOUNDCLASS_V) &&                // ---
275
2.63M
     (tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
276
4.80k
      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        // ---
277
2.63M
    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               // GB8
278
2.63M
      lbc == UTF8PROC_BOUNDCLASS_T) &&                // ---
279
2.63M
     tbc == UTF8PROC_BOUNDCLASS_T) ? false :          // ---
280
2.63M
    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             // GB9
281
2.63M
     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // ---
282
2.63M
     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a
283
2.63M
     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b
284
2.63M
    (lbc == UTF8PROC_BOUNDCLASS_E_ZWG &&              // GB11 (requires additional handling below)
285
2.55M
     tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
286
2.55M
    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below)
287
2.55M
     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ----
288
2.55M
    true; // GB999
289
3.96M
}
290
291
/* Stateful grapheme-break test.
 *
 *   lbc, licb - boundclass / indic_conjunct_break of the left character;
 *               only consulted when state is NULL or *state is 0
 *   tbc, ticb - boundclass / indic_conjunct_break of the right character
 *   state     - optional; the low byte holds the effective left boundclass
 *               and the bits above it the indic-conjunct-break state.
 *               Updated on every call.  When NULL, the result is simply
 *               grapheme_break_simple(lbc, tbc).
 *
 * Returns whether a break is permitted between the two characters.
 */
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
{
  int prev_bc, prev_icb; /* effective left-hand boundclass / icb */
  int next_bc;           /* boundclass to remember for the next call */
  utf8proc_bool break_permitted;

  if (!state)
    return grapheme_break_simple(lbc, tbc);

  if (*state == 0) { /* state initialization */
    prev_bc = lbc;
    prev_icb = (licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT)
                 ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
  } else { /* lbc and licb are already encoded in *state */
    prev_bc = *state & 0xff;  // 1st byte of state is bound class
    prev_icb = *state >> 8;   // 2nd byte of state is indic conjunct break
  }

  // GB9c: the simple rules decide, except that no break is allowed when a
  // consonant follows while we are in LINKER state.
  break_permitted = grapheme_break_simple(prev_bc, tbc) &&
     !(prev_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
      && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT);

  // Special support for GB9c.  Don't break between two consonants
  // separated 1+ linker characters and 0+ extend characters in any order.
  // After a consonant, we enter LINKER state after at least one linker.
  if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
      || prev_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
      || prev_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) {
    prev_icb = ticb;
  } else if (prev_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
             && ticb != UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) {
    // an EXTEND after a linker keeps us in LINKER state; anything else
    // replaces it
    prev_icb = ticb;
  }

  if (prev_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) {
    // Special support for GB 12/13 made possible by GB999. After two RI
    // class codepoints we want to force a break. Do this by resetting the
    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
    // after that character according to GB999 (unless of course such a break
    // is forbidden by a different rule such as GB9).
    next_bc = UTF8PROC_BOUNDCLASS_OTHER;
  } else if (prev_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
             && tbc == UTF8PROC_BOUNDCLASS_EXTEND) {
    // Special support for GB11 (emoji extend* zwj / emoji):
    // fold EXTEND codepoints into emoji
    next_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
  } else if (prev_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
             && tbc == UTF8PROC_BOUNDCLASS_ZWJ) {
    // state to record emoji+zwg combo
    next_bc = UTF8PROC_BOUNDCLASS_E_ZWG;
  } else {
    next_bc = tbc;
  }

  *state = next_bc + (prev_icb << 8);
  return break_permitted;
}
344
345
/* Returns whether a grapheme break is permitted between code points c1
   and c2.  Looks up both characters' properties and forwards their
   boundclass and indic_conjunct_break values, together with the optional
   state pointer, to grapheme_break_extended. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  const utf8proc_property_t *left = utf8proc_get_property(c1);
  const utf8proc_property_t *right = utf8proc_get_property(c2);

  return grapheme_break_extended(left->boundclass, right->boundclass,
                                 left->indic_conjunct_break,
                                 right->indic_conjunct_break,
                                 state);
}
356
357
358
/* Stateless variant of utf8proc_grapheme_break_stateful: passes a NULL
   state, so only the boundclasses of c1 and c2 are considered. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
  utf8proc_int32_t *no_state = NULL;
  return utf8proc_grapheme_break_stateful(c1, c2, no_state);
}
362
363
/* Decodes one code point from the 16-bit sequence data at *entry.
   A unit in 0xD800..0xDFFF is combined with the following unit as a
   surrogate-style pair (10 bits each, plus 0x10000); in that case *entry
   is advanced by one so that it points at the second unit.  Otherwise
   *entry is left untouched and the unit itself is returned. */
static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
{
  const utf8proc_int32_t first = **entry;
  utf8proc_int32_t second;

  if ((first & 0xF800) != 0xD800)
    return first;

  ++*entry;
  second = **entry;
  return (((first & 0x03FF) << 10) | (second & 0x03FF)) + 0x10000;
}
373
374
static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
375
330k
{
376
330k
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
377
330k
  return seqindex_decode_entry(&entry);
378
330k
}
379
380
1.98M
/* Recursively decomposes every code point of the sequence identified by
 * seqindex and appends the results to dst.
 *
 *   seqindex - low 14 bits: start offset into utf8proc_sequences;
 *              top 2 bits: sequence length minus one, where the value 3
 *              means the length is stored in the first sequence unit
 *   dst      - output buffer (may be NULL when bufsize is 0)
 *   bufsize  - capacity of dst in code points; once exhausted, the
 *              remaining calls are made with capacity 0 and only counted
 *   options, last_boundclass - forwarded to utf8proc_decompose_char
 *
 * Returns the total number of code points produced, the negative error
 * code of a failing utf8proc_decompose_char call, or
 * UTF8PROC_ERROR_OVERFLOW if the total would not fit in utf8proc_ssize_t.
 */
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  utf8proc_ssize_t written = 0;
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF];
  int len = seqindex >> 14;
  if (len >= 3) {
    len = *entry;
    entry++;
  }
  for (; len >= 0; entry++, len--) {
    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
    utf8proc_ssize_t result = utf8proc_decompose_char(
      entry_cp, dst ? dst+written : dst,
      (bufsize > written) ? (bufsize - written) : 0, options,
      last_boundclass);

    /* Propagate errors unchanged; adding a negative code to the running
       total could hide it or report it as a different error. */
    if (result < 0) return result;
    /* Check before adding so the signed sum cannot overflow. */
    if (written > (utf8proc_ssize_t)SSIZE_MAX - result)
      return UTF8PROC_ERROR_OVERFLOW;
    written += result;
  }
  return written;
}
398
399
/* Returns the code point decoded from c's lowercase_seqindex entry, or c
   itself when that entry is UINT16_MAX (no mapping recorded). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
  const utf8proc_int32_t seq = utf8proc_get_property(c)->lowercase_seqindex;
  if (seq == UINT16_MAX)
    return c;
  return seqindex_decode_index((utf8proc_uint32_t)seq);
}
404
405
/* Returns the code point decoded from c's uppercase_seqindex entry, or c
   itself when that entry is UINT16_MAX (no mapping recorded). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
  const utf8proc_int32_t seq = utf8proc_get_property(c)->uppercase_seqindex;
  if (seq == UINT16_MAX)
    return c;
  return seqindex_decode_index((utf8proc_uint32_t)seq);
}
410
411
/* Returns the code point decoded from c's titlecase_seqindex entry, or c
   itself when that entry is UINT16_MAX (no mapping recorded). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
{
  const utf8proc_int32_t seq = utf8proc_get_property(c)->titlecase_seqindex;
  if (seq == UINT16_MAX)
    return c;
  return seqindex_decode_index((utf8proc_uint32_t)seq);
}
416
417
/* Returns 1 when c has no lowercase mapping (lowercase_seqindex is
   UINT16_MAX) but its uppercase_seqindex differs from that, i.e. an
   uppercase mapping exists; returns 0 otherwise. */
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);

  if (prop->lowercase_seqindex != UINT16_MAX)
    return 0;
  return prop->uppercase_seqindex != prop->lowercase_seqindex;
}
422
423
/* Returns 1 when c has no uppercase mapping (uppercase_seqindex is
   UINT16_MAX), has a lowercase mapping, and its category is not
   UTF8PROC_CATEGORY_LT; returns 0 otherwise. */
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);

  if (prop->uppercase_seqindex != UINT16_MAX)
    return 0;
  if (prop->lowercase_seqindex == prop->uppercase_seqindex)
    return 0;
  return prop->category != UTF8PROC_CATEGORY_LT;
}
428
429
/* return a character width analogous to wcwidth (except portable and
430
   hopefully less buggy than most system wcwidth functions). */
431
1.01M
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
432
1.01M
  return utf8proc_get_property(c)->charwidth;
433
1.01M
}
434
435
0
/* Returns the ambiguous_width flag recorded in c's property entry. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t c) {
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  return prop->ambiguous_width;
}
438
439
2.03M
/* Returns the general category recorded in c's property entry, converted
   to utf8proc_category_t. */
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  return (utf8proc_category_t) prop->category;
}
442
443
1.01M
/* Returns the two-letter abbreviation of c's general category as a
   pointer into a static table indexed by utf8proc_category(c). */
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
  static const char abbrev[][3] = {
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co"
  };
  const utf8proc_category_t cat = utf8proc_category(c);
  return abbrev[cat];
}
447
448
#define utf8proc_decompose_lump(replacement_uc) \
449
82.7k
  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
450
82.7k
  options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass)
451
452
47.1M
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
453
47.1M
  const utf8proc_property_t *property;
454
47.1M
  utf8proc_propval_t category;
455
47.1M
  utf8proc_int32_t hangul_sindex;
456
47.1M
  if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
457
47.1M
  property = unsafe_get_property(uc);
458
47.1M
  category = property->category;
459
47.1M
  hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
460
47.1M
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
461
43.1M
    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
462
36.1k
      utf8proc_int32_t hangul_tindex;
463
36.1k
      if (bufsize >= 1) {
464
13.9k
        dst[0] = UTF8PROC_HANGUL_LBASE +
465
13.9k
          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
466
13.9k
        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
467
13.9k
          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
468
13.9k
      }
469
36.1k
      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
470
36.1k
      if (!hangul_tindex) return 2;
471
29.1k
      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
472
29.1k
      return 3;
473
36.1k
    }
474
43.1M
  }
475
47.1M
  if (options & UTF8PROC_REJECTNA) {
476
0
    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
477
0
  }
478
47.1M
  if (options & UTF8PROC_IGNORE) {
479
12.5M
    if (property->ignorable) return 0;
480
12.5M
  }
481
47.1M
  if (options & UTF8PROC_STRIPNA) {
482
1.93M
    if (!category) return 0;
483
1.93M
  }
484
47.1M
  if (options & UTF8PROC_LUMP) {
485
1.93M
    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
486
1.93M
    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
487
891
      utf8proc_decompose_lump(0x0027);
488
1.93M
    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
489
61.5k
      utf8proc_decompose_lump(0x002D);
490
1.87M
    if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
491
1.87M
    if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
492
1.87M
    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
493
583
      utf8proc_decompose_lump(0x003C);
494
1.87M
    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
495
732
      utf8proc_decompose_lump(0x003E);
496
1.87M
    if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
497
1.87M
    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
498
1.31k
      utf8proc_decompose_lump(0x005E);
499
1.86M
    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
500
14.9k
      utf8proc_decompose_lump(0x005F);
501
1.85M
    if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
502
1.85M
    if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
503
1.85M
    if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
504
1.85M
    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
505
1.85M
      if (category == UTF8PROC_CATEGORY_ZL ||
506
1.85M
          category == UTF8PROC_CATEGORY_ZP)
507
388
        utf8proc_decompose_lump(0x000A);
508
1.85M
    }
509
1.85M
  }
510
47.0M
  if (options & UTF8PROC_STRIPMARK) {
511
1.93M
    if (category == UTF8PROC_CATEGORY_MN ||
512
1.93M
      category == UTF8PROC_CATEGORY_MC ||
513
1.93M
      category == UTF8PROC_CATEGORY_ME) return 0;
514
1.93M
  }
515
46.9M
  if (options & UTF8PROC_CASEFOLD) {
516
12.5M
    if (property->casefold_seqindex != UINT16_MAX) {
517
221k
      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
518
221k
    }
519
12.5M
  }
520
46.7M
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
521
42.8M
    if (property->decomp_seqindex != UINT16_MAX &&
522
42.8M
        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
523
1.76M
      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
524
1.76M
    }
525
42.8M
  }
526
45.0M
  if (options & UTF8PROC_CHARBOUND) {
527
1.93M
    utf8proc_bool boundary;
528
1.93M
    boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
529
1.93M
                                       last_boundclass);
530
1.93M
    if (boundary) {
531
1.87M
      if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
532
1.87M
      if (bufsize >= 2) dst[1] = uc;
533
1.87M
      return 2;
534
1.87M
    }
535
1.93M
  }
536
43.1M
  if (bufsize >= 1) *dst = uc;
537
43.1M
  return 1;
538
45.0M
}
539
540
/* Convenience wrapper: utf8proc_decompose_custom without a custom mapping
   function (both custom_func and custom_data are NULL). */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) {
  utf8proc_custom_func no_mapping = NULL;
  return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options,
                                   no_mapping, NULL);
}
546
547
/*
 * Decodes the UTF-8 string `str`, optionally maps each code point through
 * custom_func, decomposes it with utf8proc_decompose_char and stores the
 * resulting code points in `buffer`.
 *
 *   str, strlen  - input; strlen is ignored when UTF8PROC_NULLTERM is set
 *   buffer       - output buffer of code points (may be NULL if bufsize is 0)
 *   bufsize      - capacity of buffer in code points
 *   options      - UTF8PROC_* option flags
 *   custom_func  - optional per-code-point mapping, called with custom_data
 *
 * Returns the number of code points the full result consists of (which may
 * exceed bufsize, in which case the canonical reordering pass is skipped),
 * or a negative UTF8PROC_ERROR_* code:
 *   INVALIDOPTS  - COMPOSE and DECOMPOSE both set, or STRIPMARK without either
 *   INVALIDUTF8  - malformed input
 *   OVERFLOW     - result too long
 *   any error returned by utf8proc_decompose_char
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
  utf8proc_ssize_t wpos = 0;
  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
    return UTF8PROC_ERROR_INVALIDOPTS;
  if ((options & UTF8PROC_STRIPMARK) &&
      !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
    return UTF8PROC_ERROR_INVALIDOPTS;
  {
    utf8proc_int32_t uc;
    utf8proc_ssize_t rpos = 0;
    utf8proc_ssize_t decomp_result;
    int boundclass = UTF8PROC_BOUNDCLASS_START;
    while (1) {
      if (options & UTF8PROC_NULLTERM) {
        rpos += utf8proc_iterate(str + rpos, -1, &uc);
        /* checking of return value is not necessary,
           as 'uc' is < 0 in case of error */
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
        if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
        if (uc == 0) break;
      } else {
        if (rpos >= strlen) break;
        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
      }
      if (custom_func != NULL) {
        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
      }
      /* once the buffer is full, keep counting with a capacity of 0 */
      decomp_result = utf8proc_decompose_char(
        uc, buffer ? buffer+wpos : buffer, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
        &boundclass
      );
      if (decomp_result < 0) return decomp_result;
      wpos += decomp_result;
      /* prohibiting integer overflows due to too long strings: */
      if (wpos < 0 ||
          wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
        return UTF8PROC_ERROR_OVERFLOW;
    }
  }
  /* Canonical reordering: swap adjacent code points until combining marks
     are sorted by combining class (only when the whole result fits). */
  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
    utf8proc_ssize_t pos = 0;
    while (pos < wpos-1) {
      utf8proc_int32_t uc1, uc2;
      const utf8proc_property_t *property1, *property2;
      uc1 = buffer[pos];
      uc2 = buffer[pos+1];
      /* With UTF8PROC_CHARBOUND the buffer also holds -1 grapheme-break
         sentinels, which must not reach the unchecked table lookup.  The
         range-checked lookup maps them to the first entry of
         utf8proc_properties.
         NOTE(review): that entry is expected to have combining class 0,
         so sentinels are never swapped -- confirm against
         utf8proc_data.c. */
      property1 = utf8proc_get_property(uc1);
      property2 = utf8proc_get_property(uc2);
      if (property1->combining_class > property2->combining_class &&
          property2->combining_class > 0) {
        buffer[pos] = uc2;
        buffer[pos+1] = uc1;
        /* step back to re-check the pair preceding the swapped one */
        if (pos > 0) pos--; else pos++;
      } else {
        pos++;
      }
    }
  }
  return wpos;
}
613
614
15.5k
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
615
  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
616
15.5k
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
617
2.85k
    utf8proc_ssize_t rpos;
618
2.85k
    utf8proc_ssize_t wpos = 0;
619
2.85k
    utf8proc_int32_t uc;
620
911k
    for (rpos = 0; rpos < length; rpos++) {
621
908k
      uc = buffer[rpos];
622
908k
      if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
623
908k
      if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
624
908k
          ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
625
250k
        if (options & UTF8PROC_NLF2LS) {
626
250k
          if (options & UTF8PROC_NLF2PS) {
627
250k
            buffer[wpos++] = 0x000A;
628
250k
          } else {
629
0
            buffer[wpos++] = 0x2028;
630
0
          }
631
250k
        } else {
632
0
          if (options & UTF8PROC_NLF2PS) {
633
0
            buffer[wpos++] = 0x2029;
634
0
          } else {
635
0
            buffer[wpos++] = 0x0020;
636
0
          }
637
0
        }
638
658k
      } else if ((options & UTF8PROC_STRIPCC) &&
639
658k
          (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
640
0
        if (uc == 0x0009) buffer[wpos++] = 0x0020;
641
658k
      } else {
642
658k
        buffer[wpos++] = uc;
643
658k
      }
644
908k
    }
645
2.85k
    length = wpos;
646
2.85k
  }
647
15.5k
  if (options & UTF8PROC_COMPOSE) {
648
6.92k
    utf8proc_int32_t *starter = NULL;
649
6.92k
    const utf8proc_property_t *starter_property = NULL;
650
6.92k
    utf8proc_propval_t max_combining_class = -1;
651
6.92k
    utf8proc_ssize_t rpos;
652
6.92k
    utf8proc_ssize_t wpos = 0;
653
13.5M
    for (rpos = 0; rpos < length; rpos++) {
654
13.4M
      utf8proc_int32_t current_char = buffer[rpos];
655
13.4M
      const utf8proc_property_t *current_property = unsafe_get_property(current_char);
656
13.4M
      if (starter && current_property->combining_class > max_combining_class) {
657
        /* combination perhaps possible */
658
13.4M
        utf8proc_int32_t hangul_lindex;
659
13.4M
        utf8proc_int32_t hangul_sindex;
660
13.4M
        hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
661
13.4M
        if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
662
18.4k
          utf8proc_int32_t hangul_vindex;
663
18.4k
          hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
664
18.4k
          if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
665
13.3k
            *starter = UTF8PROC_HANGUL_SBASE +
666
13.3k
              (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
667
13.3k
              UTF8PROC_HANGUL_TCOUNT;
668
13.3k
            starter_property = NULL;
669
13.3k
            continue;
670
13.3k
          }
671
18.4k
        }
672
13.4M
        hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
673
13.4M
        if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
674
13.4M
            (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
675
13.4k
          utf8proc_int32_t hangul_tindex;
676
13.4k
          hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
677
13.4k
          if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
678
9.46k
            *starter += hangul_tindex;
679
9.46k
            starter_property = NULL;
680
9.46k
            continue;
681
9.46k
          }
682
13.4k
        }
683
13.4M
        if (!starter_property) {
684
13.4M
          starter_property = unsafe_get_property(*starter);
685
13.4M
        }
686
13.4M
        int idx = starter_property->comb_index;
687
13.4M
        if (idx < 0x3FF && current_property->comb_issecond) {
688
3.29k
          int len = starter_property->comb_length;
689
3.29k
          utf8proc_int32_t max_second = utf8proc_combinations_second[idx + len - 1];
690
3.29k
          if (current_char <= max_second) {
691
2.92k
            int off;
692
            // TODO: binary search? arithmetic search?
693
10.3k
            for (off = 0; off < len; ++off) {
694
10.3k
              utf8proc_int32_t second = utf8proc_combinations_second[idx + off];
695
10.3k
              if (current_char < second) {
696
                /* not found */
697
662
                break;
698
662
              }
699
9.70k
              if (current_char == second) {
700
                /* found */
701
2.26k
                utf8proc_int32_t composition = utf8proc_combinations_combined[idx + off];
702
2.26k
                *starter = composition;
703
2.26k
                starter_property = NULL;
704
2.26k
                break;
705
2.26k
              }
706
9.70k
            }
707
2.92k
            if (starter_property == NULL) {
708
              /* found */
709
2.26k
              continue;
710
2.26k
            }
711
2.92k
          }
712
3.29k
        }
713
13.4M
      }
714
13.4M
      buffer[wpos] = current_char;
715
13.4M
      if (current_property->combining_class) {
716
11.5k
        if (current_property->combining_class > max_combining_class) {
717
6.75k
          max_combining_class = current_property->combining_class;
718
6.75k
        }
719
13.4M
      } else {
720
13.4M
        starter = buffer + wpos;
721
13.4M
        starter_property = NULL;
722
13.4M
        max_combining_class = -1;
723
13.4M
      }
724
13.4M
      wpos++;
725
13.4M
    }
726
6.92k
    length = wpos;
727
6.92k
  }
728
15.5k
  return length;
729
15.5k
}
730
731
13.8k
/*
 * Runs utf8proc_normalize_utf32 over the code points in 'buffer' and then
 * re-encodes them as bytes, in place, over the very same storage.
 *
 * The UTF8PROC_NULLTERM option is ignored here; 'length' is never ignored.
 * ASSERT: 'buffer' has one spare byte of free space at the end! That byte
 * receives the terminating zero written after the encoded data.
 *
 * When UTF8PROC_CHARBOUND is set, each code point is written with
 * charbound_encode_char; otherwise utf8proc_encode_char is used.
 *
 * Returns the number of bytes written (terminator not counted), or the
 * negative value reported by utf8proc_normalize_utf32.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  utf8proc_uint8_t *out = (utf8proc_uint8_t *)buffer;
  utf8proc_ssize_t nbytes = 0;
  utf8proc_ssize_t i = 0;

  length = utf8proc_normalize_utf32(buffer, length, options);
  if (length < 0) return length;

  /* The option test stays outside the loops so it is evaluated only once.
     In both loops the code point is fetched before the encoder writes to
     out + nbytes, which aliases the same memory. */
  if (options & UTF8PROC_CHARBOUND) {
    while (i < length) {
      utf8proc_int32_t codepoint = buffer[i];
      nbytes += charbound_encode_char(codepoint, out + nbytes);
      i++;
    }
  } else {
    while (i < length) {
      utf8proc_int32_t codepoint = buffer[i];
      nbytes += utf8proc_encode_char(codepoint, out + nbytes);
      i++;
    }
  }

  out[nbytes] = 0;
  return nbytes;
}
754
755
/*
 * Convenience form of utf8proc_map_custom: forwards all arguments with no
 * custom mapping function and no custom data, and returns that call's
 * result unchanged.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
  utf8proc_ssize_t status;

  status = utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
  return status;
}
760
761
/*
 * Maps 'str' into a newly allocated byte buffer stored in '*dstptr'.
 *
 * Steps:
 *   1. '*dstptr' is cleared to NULL up front, so it stays NULL on any
 *      failure.
 *   2. utf8proc_decompose_custom is called with no output buffer to learn
 *      how many code points are needed.
 *   3. A buffer of that many utf8proc_int32_t plus one extra byte is
 *      allocated, and utf8proc_decompose_custom is called again to fill it.
 *   4. utf8proc_reencode converts the buffer to bytes in place.
 *   5. The allocation is shrunk to the encoded size plus one byte; if that
 *      realloc fails, the original (larger) allocation is kept.
 *
 * Returns the value produced by utf8proc_reencode on success. On failure
 * returns a negative value: UTF8PROC_ERROR_NOMEM when malloc fails, or
 * whatever negative value utf8proc_decompose_custom / utf8proc_reencode
 * reported. On success '*dstptr' points to memory obtained from
 * malloc/realloc.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  utf8proc_int32_t *codepoints;
  utf8proc_int32_t *shrunk;
  utf8proc_ssize_t count;

  *dstptr = NULL;

  /* Sizing pass: NULL buffer with zero capacity. */
  count = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
  if (count < 0) return count;

  /* The trailing "+ 1" is the spare byte utf8proc_reencode asks for. */
  codepoints = (utf8proc_int32_t *) malloc(((utf8proc_size_t)count) * sizeof(utf8proc_int32_t) + 1);
  if (!codepoints) return UTF8PROC_ERROR_NOMEM;

  /* Filling pass, followed by in-place re-encoding if it succeeded. */
  count = utf8proc_decompose_custom(str, strlen, codepoints, count, options, custom_func, custom_data);
  if (count >= 0) {
    count = utf8proc_reencode(codepoints, count, options);
  }
  if (count < 0) {
    free(codepoints);
    return count;
  }

  /* Give back the unused tail; a failed realloc leaves 'codepoints' valid. */
  shrunk = (utf8proc_int32_t *) realloc(codepoints, (size_t)count+1);
  if (shrunk) codepoints = shrunk;

  *dstptr = (utf8proc_uint8_t *)codepoints;
  return count;
}
790
791
2.06k
/*
 * Calls utf8proc_map on 'str' with UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 * UTF8PROC_DECOMPOSE and a length argument of 0, and returns the pointer
 * that utf8proc_map stored.
 *
 * The status returned by utf8proc_map is deliberately not inspected here.
 * utf8proc_map_custom sets the destination pointer to NULL before doing any
 * work, so NULL is returned when the mapping fails.
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
  const utf8proc_option_t flags = (utf8proc_option_t)(
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}
797
798
2.06k
/*
 * Calls utf8proc_map on 'str' with UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 * UTF8PROC_COMPOSE and a length argument of 0, and returns the pointer
 * that utf8proc_map stored.
 *
 * The status returned by utf8proc_map is deliberately not inspected here.
 * utf8proc_map_custom sets the destination pointer to NULL before doing any
 * work, so NULL is returned when the mapping fails.
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
  const utf8proc_option_t flags = (utf8proc_option_t)(
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}
804
805
2.06k
/*
 * Calls utf8proc_map on 'str' with UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 * UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT and a length argument of 0, and
 * returns the pointer that utf8proc_map stored.
 *
 * The status returned by utf8proc_map is deliberately not inspected here.
 * utf8proc_map_custom sets the destination pointer to NULL before doing any
 * work, so NULL is returned when the mapping fails.
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
  const utf8proc_option_t flags = (utf8proc_option_t)(
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}
811
812
2.06k
/*
 * Calls utf8proc_map on 'str' with UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 * UTF8PROC_COMPOSE | UTF8PROC_COMPAT and a length argument of 0, and
 * returns the pointer that utf8proc_map stored.
 *
 * The status returned by utf8proc_map is deliberately not inspected here.
 * utf8proc_map_custom sets the destination pointer to NULL before doing any
 * work, so NULL is returned when the mapping fails.
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
  const utf8proc_option_t flags = (utf8proc_option_t)(
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}
818
819
2.06k
/*
 * Calls utf8proc_map on 'str' with UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 * UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE
 * and a length argument of 0, and returns the pointer that utf8proc_map
 * stored.
 *
 * The status returned by utf8proc_map is deliberately not inspected here.
 * utf8proc_map_custom sets the destination pointer to NULL before doing any
 * work, so NULL is returned when the mapping fails.
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
  const utf8proc_option_t flags = (utf8proc_option_t)(
    UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}