Coverage Report

Created: 2026-02-26 06:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/utf8proc/utf8proc.c
Line
Count
Source
1
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2
/*
3
 *  Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
4
 *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
5
 *
6
 *  Permission is hereby granted, free of charge, to any person obtaining a
7
 *  copy of this software and associated documentation files (the "Software"),
8
 *  to deal in the Software without restriction, including without limitation
9
 *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 *  and/or sell copies of the Software, and to permit persons to whom the
11
 *  Software is furnished to do so, subject to the following conditions:
12
 *
13
 *  The above copyright notice and this permission notice shall be included in
14
 *  all copies or substantial portions of the Software.
15
 *
16
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
 *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22
 *  DEALINGS IN THE SOFTWARE.
23
 */
24
25
/*
26
 *  This library contains derived data from a modified version of the
27
 *  Unicode data files.
28
 *
29
 *  The original data files are available at
30
 *  https://www.unicode.org/Public/UNIDATA/
31
 *
32
 *  Please notice the copyright statement in the file "utf8proc_data.c".
33
 */
34
35
36
/*
37
 *  File name:    utf8proc.c
38
 *
39
 *  Description:
40
 *  Implementation of libutf8proc.
41
 */
42
43
44
#include "utf8proc.h"
45
46
#ifndef SSIZE_MAX
47
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
48
#endif
49
#ifndef UINT16_MAX
50
#  define UINT16_MAX 65535U
51
#endif
52
53
#include "utf8proc_data.c"
54
55
56
/* Per-byte class table, indexed by byte value:
 *   0x00-0x7F -> 1,  0x80-0xBF -> 0,  0xC0-0xDF -> 2,
 *   0xE0-0xEF -> 3,  0xF0-0xF7 -> 4,  0xF8-0xFF -> 0.
 * NOTE(review): the non-zero values look like the UTF-8 sequence length
 * announced by a lead byte in that range -- confirm against the header. */
UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
73
74
60.6M
/* Constants for arithmetic Hangul syllable (de)composition. */

/* First code point of each block taking part in the arithmetic. */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7

/* Element counts; NCOUNT == VCOUNT * TCOUNT and SCOUNT == LCOUNT * NCOUNT. */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172

/* Code point ranges; every *_END value is exclusive. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
93
94
/* Should follow semantic-versioning rules (semver.org) based on API
95
   compatibility.  (Note that the shared-library version number will
96
   be different, being based on ABI compatibility.): */
97
0
#define STRINGIZEx(x) #x
98
0
#define STRINGIZE(x) STRINGIZEx(x)
99
0
UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
100
0
  return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
101
0
}
102
103
0
/* Returns the Unicode version string reported by this build. */
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
  static const char unicode_version[] = "17.0.0";
  return unicode_version;
}
106
107
0
/* Maps a UTF8PROC_ERROR_* code to a static, human-readable message.
   Any code without a dedicated message yields a generic
   "unknown error" text. The returned string must not be freed. */
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
  const char *message;

  switch (errcode) {
    case UTF8PROC_ERROR_NOMEM:
      message = "Memory for processing UTF-8 data could not be allocated.";
      break;
    case UTF8PROC_ERROR_OVERFLOW:
      message = "UTF-8 string is too long to be processed.";
      break;
    case UTF8PROC_ERROR_INVALIDUTF8:
      message = "Invalid UTF-8 string";
      break;
    case UTF8PROC_ERROR_NOTASSIGNED:
      message = "Unassigned Unicode code point found in UTF-8 string.";
      break;
    case UTF8PROC_ERROR_INVALIDOPTS:
      message = "Invalid options for UTF-8 processing chosen.";
      break;
    default:
      message = "An unknown error occurred while processing UTF-8 data.";
      break;
  }
  return message;
}
123
124
19.4M
#define utf_cont(ch)  (((ch) & 0xc0) == 0x80)
125
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
126
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
127
19.3M
) {
128
19.3M
  utf8proc_int32_t uc;
129
19.3M
  const utf8proc_uint8_t *end;
130
131
19.3M
  *dst = -1;
132
19.3M
  if (!strlen) return 0;
133
19.3M
  end = str + ((strlen < 0) ? 4 : strlen);
134
19.3M
  uc = *str++;
135
19.3M
  if (uc < 0x80) {
136
12.7M
    *dst = uc;
137
12.7M
    return 1;
138
12.7M
  }
139
  // Must be between 0xc2 and 0xf4 inclusive to be valid
140
6.61M
  if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
141
6.60M
  if (uc < 0xe0) {         // 2-byte sequence
142
     // Must have valid continuation character
143
276k
     if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
144
275k
     *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
145
275k
     return 2;
146
276k
  }
147
6.33M
  if (uc < 0xf0) {        // 3-byte sequence
148
6.22M
     if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
149
803
        return UTF8PROC_ERROR_INVALIDUTF8;
150
     // Check for surrogate chars
151
6.22M
     if (uc == 0xed && *str > 0x9f)
152
22
         return UTF8PROC_ERROR_INVALIDUTF8;
153
6.22M
     uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
154
6.22M
     if (uc < 0x800)
155
77
         return UTF8PROC_ERROR_INVALIDUTF8;
156
6.22M
     *dst = uc;
157
6.22M
     return 3;
158
6.22M
  }
159
  // 4-byte sequence
160
  // Must have 3 valid continuation characters
161
108k
  if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
162
528
     return UTF8PROC_ERROR_INVALIDUTF8;
163
  // Make sure in correct range (0x10000 - 0x10ffff)
164
107k
  if (uc == 0xf0) {
165
64.5k
    if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
166
64.5k
  } else if (uc == 0xf4) {
167
10.2k
    if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
168
10.2k
  }
169
107k
  *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
170
107k
  return 4;
171
107k
}
172
173
1.07M
/* Returns non-zero when uc lies in 0..0x10FFFF and is not in the
   surrogate range 0xD800..0xDFFF. Negative inputs are rejected because
   they become large values once viewed as unsigned. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
  const utf8proc_uint32_t u = (utf8proc_uint32_t)uc;
  return (u < 0x110000) && (u < 0xd800 || u > 0xdfff);
}
176
177
18.8M
/* Encodes the code point uc as UTF-8 into dst.
 *
 * dst must have room for up to 4 bytes. Returns the number of bytes
 * written (1..4), or 0 (nothing written) when uc is negative or
 * >= 0x110000.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000) return 0;

  if (uc < 0x80) {
    dst[0] = (utf8proc_uint8_t) uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = (utf8proc_uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (utf8proc_uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }
  // Note: 0xd800-0xdfff are encoded here so as not to change the API,
  // even though these values are actually invalid in UTF-8
  if (uc < 0x10000) {
    dst[0] = (utf8proc_uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (utf8proc_uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (utf8proc_uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }
  dst[0] = (utf8proc_uint8_t)(0xF0 | (uc >> 18));
  dst[1] = (utf8proc_uint8_t)(0x80 | ((uc >> 12) & 0x3F));
  dst[2] = (utf8proc_uint8_t)(0x80 | ((uc >> 6) & 0x3F));
  dst[3] = (utf8proc_uint8_t)(0x80 | (uc & 0x3F));
  return 4;
}
202
203
/* internal version used for inserting 0xff bytes between graphemes */
204
3.72M
static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
205
3.72M
   if (uc < 0x00) {
206
1.46M
      if (uc == -1) { /* internal value used for grapheme breaks */
207
1.46M
        dst[0] = (utf8proc_uint8_t)0xFF;
208
1.46M
        return 1;
209
1.46M
      }
210
0
      return 0;
211
2.25M
   } else if (uc < 0x80) {
212
1.34M
      dst[0] = (utf8proc_uint8_t)uc;
213
1.34M
      return 1;
214
1.34M
   } else if (uc < 0x800) {
215
33.1k
      dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
216
33.1k
      dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
217
33.1k
      return 2;
218
880k
   } else if (uc < 0x10000) {
219
869k
      dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
220
869k
      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
221
869k
      dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
222
869k
      return 3;
223
869k
   } else if (uc < 0x110000) {
224
10.9k
      dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
225
10.9k
      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
226
10.9k
      dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
227
10.9k
      dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
228
10.9k
      return 4;
229
10.9k
   } else return 0;
230
3.72M
}
231
232
/* internal "unsafe" version that does not check whether uc is in range */
233
123M
static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
234
  /* ASSERT: uc >= 0 && uc < 0x110000 */
235
123M
  return utf8proc_properties + (
236
123M
    utf8proc_stage2table[
237
123M
      utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
238
123M
    ]
239
123M
  );
240
123M
}
241
242
12.9M
/* Range-checked property lookup. Code points outside 0..0x10FFFF map to
   the first entry of utf8proc_properties. */
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
  if (uc < 0 || uc >= 0x110000) {
    return utf8proc_properties;
  }
  return unsafe_get_property(uc);
}
245
246
/* return whether there is a grapheme break between boundclasses lbc and tbc
247
   (according to the definition of extended grapheme clusters)
248
249
  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
250
  http://www.unicode.org/reports/tr29/tr29-29.html
251
252
  CAVEATS:
253
   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
254
   and GB 12/13 (regional indicator code points) require knowledge of previous characters
255
   and are thus not handled by this function. This may result in an incorrect break before
256
   an E_Modifier class codepoint and an incorrectly missing break between two
257
   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
258
259
   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
260
*/
261
7.61M
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
262
7.61M
  return
263
7.61M
    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       // GB1
264
7.61M
    (lbc == UTF8PROC_BOUNDCLASS_CR &&                 // GB3
265
7.60M
     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         // ---
266
7.60M
    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4
267
7.58M
    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5
268
6.22M
    (lbc == UTF8PROC_BOUNDCLASS_L &&                  // GB6
269
39.8k
     (tbc == UTF8PROC_BOUNDCLASS_L ||                 // ---
270
38.3k
      tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
271
37.3k
      tbc == UTF8PROC_BOUNDCLASS_LV ||                // ---
272
6.09M
      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      // ---
273
6.09M
    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                // GB7
274
6.08M
      lbc == UTF8PROC_BOUNDCLASS_V) &&                // ---
275
12.9k
     (tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
276
6.09M
      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        // ---
277
6.09M
    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               // GB8
278
6.06M
      lbc == UTF8PROC_BOUNDCLASS_T) &&                // ---
279
6.09M
     tbc == UTF8PROC_BOUNDCLASS_T) ? false :          // ---
280
6.09M
    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             // GB9
281
6.00M
     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // ---
282
5.98M
     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a
283
6.08M
     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b
284
6.08M
    (lbc == UTF8PROC_BOUNDCLASS_E_ZWG &&              // GB11 (requires additional handling below)
285
5.96M
     tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
286
5.96M
    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below)
287
5.95M
     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ----
288
5.95M
    true; // GB999
289
7.61M
}
290
291
/* Stateful grapheme break test.
 *
 * lbc/tbc   bound class of the left / right code point
 * licb/ticb indic_conjunct_break property of the left / right code point
 * state     optional caller-owned state; NULL selects the stateless
 *           grapheme_break_simple(lbc, tbc)
 *
 * When state is non-NULL, *state == 0 means "uninitialised": the left
 * hand values are then taken from lbc/licb. Otherwise lbc/licb are
 * ignored and the values stored in *state are used instead. On return
 * *state holds the updated bound class in its low byte and the updated
 * indic conjunct break value in the bits above it.
 */
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
{
  int prev_bc, prev_icb;   /* left-hand values actually used */
  int next_bc, next_icb;   /* values stored back into *state */
  utf8proc_bool break_permitted;

  if (!state)
    return grapheme_break_simple(lbc, tbc);

  if (*state == 0) {
    /* state initialization */
    prev_bc = lbc;
    prev_icb = (licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT)
                 ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
  } else {
    /* lbc and licb are already encoded in *state */
    prev_bc = *state & 0xff;   // 1st byte of state is bound class
    prev_icb = *state >> 8;    // 2nd byte of state is indic conjunct break
  }

  // GB9c: no break between LINKER state and a following consonant
  break_permitted = grapheme_break_simple(prev_bc, tbc) &&
     !(prev_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
       && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT);

  // Special support for GB9c.  Don't break between two consonants
  // separated by 1+ linker characters and 0+ extend characters in any
  // order.  After a consonant, the LINKER state is entered once at least
  // one linker has been seen; extend characters do not leave it.
  next_icb = prev_icb;
  if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
      || prev_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
      || prev_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) {
    next_icb = ticb;
  } else if (prev_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
             && ticb != UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) {
    next_icb = ticb;
  }

  if (prev_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) {
    // Special support for GB 12/13 made possible by GB999. After two RI
    // class codepoints we want to force a break. Do this by resetting the
    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a
    // break after that character according to GB999 (unless of course
    // such a break is forbidden by a different rule such as GB9).
    next_bc = UTF8PROC_BOUNDCLASS_OTHER;
  } else if (prev_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
             && tbc == UTF8PROC_BOUNDCLASS_EXTEND) {
    // Special support for GB11 (emoji extend* zwj / emoji):
    // fold EXTEND codepoints into the emoji
    next_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
  } else if (prev_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
             && tbc == UTF8PROC_BOUNDCLASS_ZWJ) {
    // state to record the emoji+zwj combination
    next_bc = UTF8PROC_BOUNDCLASS_E_ZWG;
  } else {
    next_bc = tbc;
  }

  *state = next_bc + (next_icb << 8);
  return break_permitted;
}
344
345
/* Returns whether a grapheme break is permitted between code points c1
   and c2. Looks up both property records and forwards their bound class
   and indic_conjunct_break values, together with the caller's state
   pointer (may be NULL), to grapheme_break_extended. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {

  const utf8proc_property_t *left = utf8proc_get_property(c1);
  const utf8proc_property_t *right = utf8proc_get_property(c2);

  return grapheme_break_extended(left->boundclass, right->boundclass,
                                 left->indic_conjunct_break,
                                 right->indic_conjunct_break,
                                 state);
}
356
357
358
/* Stateless convenience wrapper: same as
   utf8proc_grapheme_break_stateful with a NULL state pointer. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
  utf8proc_int32_t *const no_state = NULL;
  return utf8proc_grapheme_break_stateful(c1, c2, no_state);
}
362
363
/* Decodes the code point stored at *entry.
 *
 * A 16-bit unit whose top five bits are 11011 (0xD800..0xDFFF) starts a
 * two-unit pair: *entry is advanced to the second unit and the result is
 * 0x10000 plus the 20 bits taken from both units. Any other unit is
 * returned as-is and *entry is left untouched. The caller is responsible
 * for stepping past the last unit consumed.
 */
static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
{
  const utf8proc_uint16_t *cursor = *entry;
  const utf8proc_int32_t first = cursor[0];

  if ((first & 0xF800) != 0xD800)
    return first;

  *entry = cursor + 1;
  return 0x10000 + (((first & 0x03FF) << 10) | (cursor[1] & 0x03FF));
}
373
374
static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
375
401k
{
376
401k
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
377
401k
  return seqindex_decode_entry(&entry);
378
401k
}
379
380
1.93M
/* Decomposes every code point of a stored sequence into dst.
 *
 * seqindex         low 14 bits: start position in utf8proc_sequences;
 *                  top 2 bits: number of code points minus one, unless
 *                  that value is 3, in which case the first stored unit
 *                  holds the count minus one instead
 * dst, bufsize     output buffer and its capacity in code points; dst may
 *                  be NULL, and output beyond bufsize is counted but not
 *                  stored
 * options          forwarded to utf8proc_decompose_char
 * last_boundclass  forwarded to utf8proc_decompose_char
 *
 * Returns the total number of code points produced, the (negative) error
 * code of the first failing utf8proc_decompose_char call, or
 * UTF8PROC_ERROR_OVERFLOW if the running total becomes negative.
 */
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  utf8proc_ssize_t written = 0;
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF];
  int len = seqindex >> 14;
  if (len >= 3) {
    /* long sequence: the length is stored in front of the data */
    len = *entry;
    entry++;
  }
  /* len is "count - 1", hence the inclusive loop bound */
  for (; len >= 0; entry++, len--) {
    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
    utf8proc_ssize_t produced = utf8proc_decompose_char(
      entry_cp, dst ? dst+written : dst,
      (bufsize > written) ? (bufsize - written) : 0, options,
      last_boundclass);

    /* Check the result before accumulating: adding a negative error code
       to a positive running total could otherwise hide the failure. */
    if (produced < 0) return produced;
    written += produced;
    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
  }
  return written;
}
398
399
/* Returns the lowercase mapping of c, or c itself when its property
   record carries no lowercase mapping (index == UINT16_MAX). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
  const utf8proc_int32_t lower_index = utf8proc_get_property(c)->lowercase_seqindex;
  if (lower_index == UINT16_MAX) {
    return c;
  }
  return seqindex_decode_index((utf8proc_uint32_t)lower_index);
}
404
405
/* Returns the uppercase mapping of c, or c itself when its property
   record carries no uppercase mapping (index == UINT16_MAX). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
  const utf8proc_int32_t upper_index = utf8proc_get_property(c)->uppercase_seqindex;
  if (upper_index == UINT16_MAX) {
    return c;
  }
  return seqindex_decode_index((utf8proc_uint32_t)upper_index);
}
410
411
/* Returns the titlecase mapping of c, or c itself when its property
   record carries no titlecase mapping (index == UINT16_MAX). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
{
  const utf8proc_int32_t title_index = utf8proc_get_property(c)->titlecase_seqindex;
  if (title_index == UINT16_MAX) {
    return c;
  }
  return seqindex_decode_index((utf8proc_uint32_t)title_index);
}
416
417
/* Returns 1 when c has no lowercase mapping but its uppercase mapping
   index differs from the lowercase one (i.e. an uppercase mapping
   exists); 0 otherwise. */
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);

  if (prop->lowercase_seqindex != UINT16_MAX) {
    return 0;
  }
  return prop->lowercase_seqindex != prop->uppercase_seqindex;
}
422
423
/* Returns 1 when c has no uppercase mapping, its lowercase mapping index
   differs from the uppercase one (i.e. a lowercase mapping exists), and
   its category is not Lt (titlecase letter); 0 otherwise. */
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);

  if (prop->uppercase_seqindex != UINT16_MAX) {
    return 0;
  }
  if (prop->category == UTF8PROC_CATEGORY_LT) {
    return 0;
  }
  return prop->lowercase_seqindex != prop->uppercase_seqindex;
}
428
429
/* return a character width analogous to wcwidth (except portable and
430
   hopefully less buggy than most system wcwidth functions). */
431
1.07M
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
432
1.07M
  return utf8proc_get_property(c)->charwidth;
433
1.07M
}
434
435
0
/* Returns the ambiguous_width flag stored in the property record of c. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t c) {
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  return prop->ambiguous_width;
}
438
439
2.15M
/* Returns the general category stored in the property record of c. */
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  return (utf8proc_category_t) prop->category;
}
442
443
1.07M
/* Returns the two-letter abbreviation of the general category of c.
   The table is indexed directly by the value of utf8proc_category(c);
   the result points into static storage. */
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
  static const char abbreviations[][3] = {
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co"
  };
  const utf8proc_category_t category = utf8proc_category(c);
  return abbreviations[category];
}
447
448
#define utf8proc_decompose_lump(replacement_uc) \
449
154k
  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
450
154k
  (utf8proc_option_t)(options & ~(unsigned int)UTF8PROC_LUMP), last_boundclass)
451
452
47.7M
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
453
47.7M
  const utf8proc_property_t *property;
454
47.7M
  utf8proc_propval_t category;
455
47.7M
  utf8proc_int32_t hangul_sindex;
456
47.7M
  if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
457
47.7M
  property = unsafe_get_property(uc);
458
47.7M
  category = property->category;
459
47.7M
  hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
460
47.7M
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
461
43.9M
    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
462
98.3k
      utf8proc_int32_t hangul_tindex;
463
98.3k
      if (bufsize >= 1) {
464
33.0k
        dst[0] = UTF8PROC_HANGUL_LBASE +
465
33.0k
          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
466
33.0k
        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
467
33.0k
          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
468
33.0k
      }
469
98.3k
      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
470
98.3k
      if (!hangul_tindex) return 2;
471
81.6k
      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
472
81.6k
      return 3;
473
98.3k
    }
474
43.9M
  }
475
47.6M
  if (options & UTF8PROC_REJECTNA) {
476
0
    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
477
0
  }
478
47.6M
  if (options & UTF8PROC_IGNORE) {
479
11.7M
    if (property->ignorable) return 0;
480
11.7M
  }
481
47.6M
  if (options & UTF8PROC_STRIPNA) {
482
1.82M
    if (!category) return 0;
483
1.82M
  }
484
47.6M
  if (options & UTF8PROC_LUMP) {
485
1.82M
    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
486
1.82M
    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
487
1.16k
      utf8proc_decompose_lump(0x0027);
488
1.82M
    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
489
104k
      utf8proc_decompose_lump(0x002D);
490
1.71M
    if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
491
1.71M
    if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
492
1.71M
    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
493
956
      utf8proc_decompose_lump(0x003C);
494
1.71M
    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
495
913
      utf8proc_decompose_lump(0x003E);
496
1.71M
    if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
497
1.71M
    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
498
1.42k
      utf8proc_decompose_lump(0x005E);
499
1.71M
    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
500
40.9k
      utf8proc_decompose_lump(0x005F);
501
1.67M
    if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
502
1.67M
    if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
503
1.67M
    if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
504
1.67M
    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
505
1.67M
      if (category == UTF8PROC_CATEGORY_ZL ||
506
1.67M
          category == UTF8PROC_CATEGORY_ZP)
507
539
        utf8proc_decompose_lump(0x000A);
508
1.67M
    }
509
1.67M
  }
510
47.4M
  if (options & UTF8PROC_STRIPMARK) {
511
1.81M
    if (category == UTF8PROC_CATEGORY_MN ||
512
1.80M
      category == UTF8PROC_CATEGORY_MC ||
513
1.79M
      category == UTF8PROC_CATEGORY_ME) return 0;
514
1.81M
  }
515
47.4M
  if (options & UTF8PROC_CASEFOLD) {
516
11.7M
    if (property->casefold_seqindex != UINT16_MAX) {
517
274k
      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
518
274k
    }
519
11.7M
  }
520
47.1M
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
521
43.5M
    if (property->decomp_seqindex != UINT16_MAX &&
522
4.36M
        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
523
1.66M
      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
524
1.66M
    }
525
43.5M
  }
526
45.5M
  if (options & UTF8PROC_CHARBOUND) {
527
5.45M
    utf8proc_bool boundary;
528
5.45M
    boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
529
5.45M
                                       last_boundclass);
530
5.45M
    if (boundary) {
531
5.34M
      if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
532
5.34M
      if (bufsize >= 2) dst[1] = uc;
533
5.34M
      return 2;
534
5.34M
    }
535
5.45M
  }
536
40.1M
  if (bufsize >= 1) *dst = uc;
537
40.1M
  return 1;
538
45.5M
}
539
540
/* Same as utf8proc_decompose_custom with no custom mapping function. */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) {
  return utf8proc_decompose_custom(
    str, strlen, buffer, bufsize, options,
    NULL,  /* custom_func */
    NULL   /* custom_data */
  );
}
546
547
/* Decodes the UTF-8 string str and decomposes it code point by code
 * point into buffer.
 *
 * str, strlen      input; strlen is ignored when UTF8PROC_NULLTERM is set
 *                  in options, in which case decoding stops at a 0 code
 *                  point
 * buffer, bufsize  output buffer and its capacity in code points; output
 *                  beyond bufsize is counted but not stored
 * options          UTF8PROC_* option flags
 * custom_func      optional mapping applied to every decoded code point
 *                  before decomposition (NULL for none)
 * custom_data      passed through to custom_func
 *
 * Returns the number of code points the result consists of, or a negative
 * UTF8PROC_ERROR_* value. If COMPOSE or DECOMPOSE is requested and the
 * whole result fit into the buffer, adjacent code points are additionally
 * reordered by combining class.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  utf8proc_ssize_t in_pos = 0;    /* read offset into str */
  utf8proc_ssize_t out_pos = 0;   /* code points produced so far */
  int boundclass = UTF8PROC_BOUNDCLASS_START;

  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
    return UTF8PROC_ERROR_INVALIDOPTS;
  if ((options & UTF8PROC_STRIPMARK) &&
      !(options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)))
    return UTF8PROC_ERROR_INVALIDOPTS;

  for (;;) {
    utf8proc_int32_t cp;
    utf8proc_ssize_t produced;

    if (options & UTF8PROC_NULLTERM) {
      in_pos += utf8proc_iterate(str + in_pos, -1, &cp);
      /* checking of return value is not necessary,
         as 'cp' is < 0 in case of error */
      if (cp < 0) return UTF8PROC_ERROR_INVALIDUTF8;
      if (in_pos < 0) return UTF8PROC_ERROR_OVERFLOW;
      if (cp == 0) break;
    } else {
      if (in_pos >= strlen) break;
      in_pos += utf8proc_iterate(str + in_pos, strlen - in_pos, &cp);
      if (cp < 0) return UTF8PROC_ERROR_INVALIDUTF8;
    }

    if (custom_func != NULL) {
      cp = custom_func(cp, custom_data);   /* user-specified custom mapping */
    }

    produced = utf8proc_decompose_char(
      cp, buffer ? buffer+out_pos : buffer,
      (bufsize > out_pos) ? (bufsize - out_pos) : 0, options,
      &boundclass
    );
    if (produced < 0) return produced;
    out_pos += produced;

    /* prohibiting integer overflows due to too long strings: */
    if (out_pos < 0 ||
        out_pos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
      return UTF8PROC_ERROR_OVERFLOW;
  }

  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= out_pos) {
    /* Reorder adjacent pairs: a code point with a non-zero combining class
       moves in front of a neighbour with a larger combining class. */
    utf8proc_ssize_t i = 0;
    while (i < out_pos-1) {
      const utf8proc_int32_t first = buffer[i];
      utf8proc_int32_t second;
      const utf8proc_property_t *first_prop, *second_prop;

      if (first < 0) {
        /* skip grapheme break */
        i++;
        continue;
      }
      second = buffer[i+1];
      if (second < 0) {
        /* cannot recombine; skip grapheme break */
        i += 2;
        continue;
      }

      first_prop = unsafe_get_property(first);
      second_prop = unsafe_get_property(second);
      if (first_prop->combining_class > second_prop->combining_class &&
          second_prop->combining_class > 0) {
        buffer[i] = second;
        buffer[i+1] = first;
        /* step back so the swapped element is compared with its new
           predecessor (or forward when already at the start) */
        if (i > 0) i--; else i++;
      } else {
        i++;
      }
    }
  }
  return out_pos;
}
623
624
18.5k
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
625
  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
626
18.5k
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
627
2.83k
    utf8proc_ssize_t rpos;
628
2.83k
    utf8proc_ssize_t wpos = 0;
629
2.83k
    utf8proc_int32_t uc;
630
749k
    for (rpos = 0; rpos < length; rpos++) {
631
747k
      uc = buffer[rpos];
632
747k
      if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
633
747k
      if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
634
646k
          ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
635
100k
        if (options & UTF8PROC_NLF2LS) {
636
100k
          if (options & UTF8PROC_NLF2PS) {
637
100k
            buffer[wpos++] = 0x000A;
638
100k
          } else {
639
0
            buffer[wpos++] = 0x2028;
640
0
          }
641
100k
        } else {
642
0
          if (options & UTF8PROC_NLF2PS) {
643
0
            buffer[wpos++] = 0x2029;
644
0
          } else {
645
0
            buffer[wpos++] = 0x0020;
646
0
          }
647
0
        }
648
646k
      } else if ((options & UTF8PROC_STRIPCC) &&
649
0
          (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
650
0
        if (uc == 0x0009) buffer[wpos++] = 0x0020;
651
646k
      } else {
652
646k
        buffer[wpos++] = uc;
653
646k
      }
654
747k
    }
655
2.83k
    length = wpos;
656
2.83k
  }
657
18.5k
  if (options & UTF8PROC_COMPOSE) {
658
8.43k
    utf8proc_int32_t *starter = NULL;
659
8.43k
    const utf8proc_property_t *starter_property = NULL;
660
8.43k
    utf8proc_propval_t max_combining_class = -1;
661
8.43k
    utf8proc_ssize_t rpos;
662
8.43k
    utf8proc_ssize_t wpos = 0;
663
13.6M
    for (rpos = 0; rpos < length; rpos++) {
664
13.6M
      utf8proc_int32_t current_char = buffer[rpos];
665
13.6M
      if (current_char < 0) {
666
        /* skip grapheme break */
667
731k
        continue;
668
731k
      }
669
12.8M
      const utf8proc_property_t *current_property = unsafe_get_property(current_char);
670
12.8M
      if (starter && current_property->combining_class > max_combining_class) {
671
        /* combination perhaps possible */
672
12.8M
        utf8proc_int32_t hangul_lindex;
673
12.8M
        utf8proc_int32_t hangul_sindex;
674
12.8M
        hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
675
12.8M
        if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
676
41.9k
          utf8proc_int32_t hangul_vindex;
677
41.9k
          hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
678
41.9k
          if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
679
26.0k
            *starter = UTF8PROC_HANGUL_SBASE +
680
26.0k
              (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
681
26.0k
              UTF8PROC_HANGUL_TCOUNT;
682
26.0k
            starter_property = NULL;
683
26.0k
            continue;
684
26.0k
          }
685
41.9k
        }
686
12.8M
        hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
687
12.8M
        if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
688
43.9k
            (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
689
26.1k
          utf8proc_int32_t hangul_tindex;
690
26.1k
          hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
691
26.1k
          if (hangul_tindex > 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
692
18.0k
            *starter += hangul_tindex;
693
18.0k
            starter_property = NULL;
694
18.0k
            continue;
695
18.0k
          }
696
26.1k
        }
697
12.8M
        if (!starter_property) {
698
12.7M
          starter_property = unsafe_get_property(*starter);
699
12.7M
        }
700
12.8M
        int idx = starter_property->comb_index;
701
12.8M
        if (idx < 0x3FF && current_property->comb_issecond) {
702
8.96k
          int len = starter_property->comb_length;
703
8.96k
          utf8proc_int32_t max_second = utf8proc_combinations_second[idx + len - 1];
704
8.96k
          if (current_char <= max_second) {
705
7.64k
            int off;
706
            // TODO: binary search? arithmetic search?
707
31.8k
            for (off = 0; off < len; ++off) {
708
31.8k
              utf8proc_int32_t second = utf8proc_combinations_second[idx + off];
709
31.8k
              if (current_char < second) {
710
                /* not found */
711
2.87k
                break;
712
2.87k
              }
713
28.9k
              if (current_char == second) {
714
                /* found */
715
4.76k
                utf8proc_int32_t composition = utf8proc_combinations_combined[idx + off];
716
4.76k
                *starter = composition;
717
4.76k
                starter_property = NULL;
718
4.76k
                break;
719
4.76k
              }
720
28.9k
            }
721
7.64k
            if (starter_property == NULL) {
722
              /* found */
723
4.76k
              continue;
724
4.76k
            }
725
7.64k
          }
726
8.96k
        }
727
12.8M
      }
728
12.8M
      buffer[wpos] = current_char;
729
12.8M
      if (current_property->combining_class) {
730
28.4k
        if (current_property->combining_class > max_combining_class) {
731
17.9k
          max_combining_class = current_property->combining_class;
732
17.9k
        }
733
12.8M
      } else {
734
12.8M
        starter = buffer + wpos;
735
12.8M
        starter_property = NULL;
736
12.8M
        max_combining_class = -1;
737
12.8M
      }
738
12.8M
      wpos++;
739
12.8M
    }
740
8.43k
    length = wpos;
741
8.43k
  }
742
18.5k
  return length;
743
18.5k
}
744
745
16.8k
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  /*
   * Runs utf8proc_normalize_utf32() over the 'length' code points stored in
   * 'buffer', then re-encodes the surviving code points in place so that the
   * same memory ends up holding a zero-terminated byte string.
   *
   * Returns the number of bytes written (the terminating zero is not
   * counted).  A negative result from utf8proc_normalize_utf32() is passed
   * through unchanged and nothing is re-encoded.
   *
   * UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored.
   * ASSERT: 'buffer' has one spare byte of free space at the end!
   */
  utf8proc_uint8_t *bytes = (utf8proc_uint8_t *)buffer;
  utf8proc_ssize_t count = utf8proc_normalize_utf32(buffer, length, options);
  utf8proc_ssize_t in = 0;
  utf8proc_ssize_t out = 0;

  if (count < 0) return count;

  /* Each code point is fetched into a local before its bytes are stored,
     because the byte output overlays the very array being read. */
  if (options & UTF8PROC_CHARBOUND) {
    while (in < count) {
      utf8proc_int32_t cp = buffer[in++];
      out += charbound_encode_char(cp, bytes + out);
    }
  } else {
    while (in < count) {
      utf8proc_int32_t cp = buffer[in++];
      out += utf8proc_encode_char(cp, bytes + out);
    }
  }

  /* this store is what consumes the spare byte mentioned above */
  bytes[out] = 0;
  return out;
}
768
769
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
  /*
   * Shorthand for utf8proc_map_custom() with neither a custom mapping
   * function nor user data: all arguments are forwarded as given and the
   * result of that call is returned unchanged.
   */
  utf8proc_ssize_t status = utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
  return status;
}
774
775
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  /*
   * Maps 'str' into a freshly allocated, zero-terminated byte string.
   *
   * On success the allocation is stored in '*dstptr' (the caller is left
   * holding a malloc/realloc'ed pointer) and the byte count reported by
   * utf8proc_reencode() is returned.  On any failure '*dstptr' stays NULL,
   * no allocation is left behind, and a negative code is returned:
   * UTF8PROC_ERROR_NOMEM when malloc() fails, otherwise whatever negative
   * value the failing helper produced.
   */
  utf8proc_int32_t *codepoints;
  utf8proc_int32_t *shrunk;
  utf8proc_ssize_t count;
  utf8proc_ssize_t nbytes;

  /* cleared first so that every early exit leaves a defined value behind */
  *dstptr = NULL;

  /* First call gets no output buffer; its non-negative result is used
     below as the element count for the allocation. */
  count = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
  if (count < 0) return count;

  /* the extra byte is the spare one utf8proc_reencode() asks for */
  codepoints = (utf8proc_int32_t *) malloc(((utf8proc_size_t)count) * sizeof(utf8proc_int32_t) + 1);
  if (!codepoints) return UTF8PROC_ERROR_NOMEM;

  /* Second call fills the buffer; re-encoding only happens if that worked. */
  count = utf8proc_decompose_custom(str, strlen, codepoints, count, options, custom_func, custom_data);
  nbytes = (count < 0) ? count : utf8proc_reencode(codepoints, count, options);
  if (nbytes < 0) {
    free(codepoints);
    return nbytes;
  }

  /* Trim the allocation to the encoded size plus terminator.  If realloc()
     fails the original, larger block is still valid and is kept. */
  shrunk = (utf8proc_int32_t *) realloc(codepoints, (size_t)nbytes+1);
  if (shrunk) codepoints = shrunk;

  *dstptr = (utf8proc_uint8_t *)codepoints;
  return nbytes;
}
804
805
2.03k
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
  /*
   * Calls utf8proc_map() on 'str' with the NULLTERM, STABLE and DECOMPOSE
   * options and a length argument of 0, returning the output pointer.
   *
   * The status code of utf8proc_map() is discarded.  Because
   * utf8proc_map_custom() sets the output pointer to NULL before doing any
   * work, a failed mapping shows up here as a NULL return.
   */
  const utf8proc_option_t flags =
    (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}
811
812
2.03k
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
  /*
   * Calls utf8proc_map() on 'str' with the NULLTERM, STABLE and COMPOSE
   * options and a length argument of 0, returning the output pointer.
   *
   * The status code of utf8proc_map() is discarded.  Because
   * utf8proc_map_custom() sets the output pointer to NULL before doing any
   * work, a failed mapping shows up here as a NULL return.
   */
  const utf8proc_option_t flags =
    (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}
818
819
2.03k
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
  /*
   * Calls utf8proc_map() on 'str' with the NULLTERM, STABLE, DECOMPOSE and
   * COMPAT options and a length argument of 0, returning the output pointer.
   *
   * The status code of utf8proc_map() is discarded.  Because
   * utf8proc_map_custom() sets the output pointer to NULL before doing any
   * work, a failed mapping shows up here as a NULL return.
   */
  const utf8proc_option_t flags =
    (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
                        UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}
825
826
2.03k
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
  /*
   * Calls utf8proc_map() on 'str' with the NULLTERM, STABLE, COMPOSE and
   * COMPAT options and a length argument of 0, returning the output pointer.
   *
   * The status code of utf8proc_map() is discarded.  Because
   * utf8proc_map_custom() sets the output pointer to NULL before doing any
   * work, a failed mapping shows up here as a NULL return.
   */
  const utf8proc_option_t flags =
    (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
                        UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}
832
833
2.03k
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
  /*
   * Calls utf8proc_map() on 'str' with the NULLTERM, STABLE, COMPOSE,
   * COMPAT, CASEFOLD and IGNORE options and a length argument of 0,
   * returning the output pointer.
   *
   * The status code of utf8proc_map() is discarded.  Because
   * utf8proc_map_custom() sets the output pointer to NULL before doing any
   * work, a failed mapping shows up here as a NULL return.
   */
  const utf8proc_option_t flags =
    (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE |
                        UTF8PROC_COMPOSE | UTF8PROC_COMPAT |
                        UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
  utf8proc_uint8_t *normalized;

  utf8proc_map(str, 0, &normalized, flags);
  return normalized;
}