/src/netcdf-c/libdispatch/utf8proc.c
| Line | Count | Source | 
| 1 |  | /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */ | 
| 2 |  | /* | 
| 3 |  |  *  Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors. | 
| 4 |  |  *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany | 
| 5 |  |  * | 
| 6 |  |  *  Permission is hereby granted, free of charge, to any person obtaining a | 
| 7 |  |  *  copy of this software and associated documentation files (the "Software"), | 
| 8 |  |  *  to deal in the Software without restriction, including without limitation | 
| 9 |  |  *  the rights to use, copy, modify, merge, publish, distribute, sublicense, | 
| 10 |  |  *  and/or sell copies of the Software, and to permit persons to whom the | 
| 11 |  |  *  Software is furnished to do so, subject to the following conditions: | 
| 12 |  |  * | 
| 13 |  |  *  The above copyright notice and this permission notice shall be included in | 
| 14 |  |  *  all copies or substantial portions of the Software. | 
| 15 |  |  * | 
| 16 |  |  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
| 17 |  |  *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
| 18 |  |  *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | 
| 19 |  |  *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | 
| 20 |  |  *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | 
| 21 |  |  *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | 
| 22 |  |  *  DEALINGS IN THE SOFTWARE. | 
| 23 |  |  */ | 
| 24 |  |  | 
| 25 |  | /* | 
| 26 |  |  *  This library contains derived data from a modified version of the | 
| 27 |  |  *  Unicode data files. | 
| 28 |  |  * | 
| 29 |  |  *  The original data files are available at | 
| 30 |  |  *  https://www.unicode.org/Public/UNIDATA/ | 
| 31 |  |  * | 
| 32 |  |  *  Please notice the copyright statement in the file "utf8proc_data.c". | 
| 33 |  |  */ | 
| 34 |  |  | 
| 35 |  |  | 
| 36 |  | /* | 
| 37 |  |  *  File name:    utf8proc.c | 
| 38 |  |  * | 
| 39 |  |  *  Description: | 
| 40 |  |  *  Implementation of libutf8proc. | 
| 41 |  |  */ | 
| 42 |  |  | 
| 43 |  |  | 
| 44 |  | #include "utf8proc.h" | 
| 45 |  |  | 
| 46 |  | #ifndef SSIZE_MAX | 
| 47 |  | #define SSIZE_MAX ((size_t)SIZE_MAX/2) | 
| 48 |  | #endif | 
| 49 |  | #ifndef UINT16_MAX | 
| 50 |  | #  define UINT16_MAX 65535U | 
| 51 |  | #endif | 
| 52 |  |  | 
| 53 |  | #include "utf8proc_data.c" | 
| 54 |  |  | 
| 55 |  |  | 
/* Byte-class lookup table indexed by a byte value (0..255).
   As laid out below: 0x00-0x7F map to 1, 0x80-0xBF to 0, 0xC0-0xDF to 2,
   0xE0-0xEF to 3, 0xF0-0xF7 to 4 and 0xF8-0xFF to 0.
   NOTE(review): presumably the length in bytes of the UTF-8 sequence
   introduced by that lead byte, with 0 meaning "not a lead byte" --
   confirm against the declaration in utf8proc.h. */
const nc_utf8proc_int8_t nc_utf8proc_utf8class[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; /* 0xF0-0xFF */
| 73 |  |  | 
/* Constants for the arithmetic Hangul syllable decomposition performed in
   nc_utf8proc_decompose_char: a syllable index (uc - SBASE) is split into
   a leading, vowel and optional trailing part using the counts below. */
#define UTF8PROC_HANGUL_SBASE 0xAC00   /* subtracted from uc to get the syllable index */
#define UTF8PROC_HANGUL_LBASE 0x1100   /* base added to the leading-part index */
#define UTF8PROC_HANGUL_VBASE 0x1161   /* base added to the vowel-part index */
#define UTF8PROC_HANGUL_TBASE 0x11A7   /* base added to a non-zero trailing-part index */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588     /* == VCOUNT * TCOUNT */
#define UTF8PROC_HANGUL_SCOUNT 11172   /* == LCOUNT * NCOUNT; syllable indices are 0..SCOUNT-1 */
/* END is exclusive */
/* NOTE(review): the START/END/FILLER ranges below are not referenced in the
   portion of the file visible here; presumably used by composition code
   further down -- confirm. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
| 93 |  |  | 
| 94 |  | /* Should follow semantic-versioning rules (semver.org) based on API | 
| 95 |  |    compatibility.  (Note that the shared-library version number will | 
| 96 |  |    be different, being based on ABI compatibility.): */ | 
/* Two-level stringification: the outer macro lets its argument be
   macro-expanded before the inner one turns it into a string literal. */
#define STRINGIZEx(x) #x
#define STRINGIZE(x) STRINGIZEx(x)

/* Return the library version as a static "MAJOR.MINOR.PATCH" string that is
   assembled at compile time from the UTF8PROC_VERSION_* macros. */
const char *nc_utf8proc_version(void) {
  static const char version_string[] =
    STRINGIZE(UTF8PROC_VERSION_MAJOR) "."
    STRINGIZE(UTF8PROC_VERSION_MINOR) "."
    STRINGIZE(UTF8PROC_VERSION_PATCH) "";
  return version_string;
}
| 102 |  |  | 
/* Return the Unicode version string reported by this build of the library
   as a static, NUL-terminated string. */
const char *nc_utf8proc_unicode_version(void) {
  static const char unicode_version[] = "15.1.0";
  return unicode_version;
}
| 106 |  |  | 
| 107 | 0 | const char *nc_utf8proc_errmsg(nc_utf8proc_ssize_t errcode) { | 
| 108 | 0 |   switch (errcode) { | 
| 109 | 0 |     case UTF8PROC_ERROR_NOMEM: | 
| 110 | 0 |     return "Memory for processing UTF-8 data could not be allocated."; | 
| 111 | 0 |     case UTF8PROC_ERROR_OVERFLOW: | 
| 112 | 0 |     return "UTF-8 string is too long to be processed."; | 
| 113 | 0 |     case UTF8PROC_ERROR_INVALIDUTF8: | 
| 114 | 0 |     return "Invalid UTF-8 string"; | 
| 115 | 0 |     case UTF8PROC_ERROR_NOTASSIGNED: | 
| 116 | 0 |     return "Unassigned Unicode code point found in UTF-8 string."; | 
| 117 | 0 |     case UTF8PROC_ERROR_INVALIDOPTS: | 
| 118 | 0 |     return "Invalid options for UTF-8 processing chosen."; | 
| 119 | 0 |     default: | 
| 120 | 0 |     return "An unknown error occurred while processing UTF-8 data."; | 
| 121 | 0 |   } | 
| 122 | 0 | } | 
| 123 |  |  | 
| 124 | 0 | #define utf_cont(ch)  (((ch) & 0xc0) == 0x80) | 
| 125 |  | nc_utf8proc_ssize_t nc_utf8proc_iterate( | 
| 126 |  |   const nc_utf8proc_uint8_t *str, nc_utf8proc_ssize_t strlen, nc_utf8proc_int32_t *dst | 
| 127 | 0 | ) { | 
| 128 | 0 |   nc_utf8proc_int32_t uc; | 
| 129 | 0 |   const nc_utf8proc_uint8_t *end; | 
| 130 |  | 
 | 
| 131 | 0 |   *dst = -1; | 
| 132 | 0 |   if (!strlen) return 0; | 
| 133 | 0 |   end = str + ((strlen < 0) ? 4 : strlen); | 
| 134 | 0 |   uc = *str++; | 
| 135 | 0 |   if (uc < 0x80) { | 
| 136 | 0 |     *dst = uc; | 
| 137 | 0 |     return 1; | 
| 138 | 0 |   } | 
| 139 |  |   // Must be between 0xc2 and 0xf4 inclusive to be valid | 
| 140 | 0 |   if ((nc_utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; | 
| 141 | 0 |   if (uc < 0xe0) {         // 2-byte sequence | 
| 142 |  |      // Must have valid continuation character | 
| 143 | 0 |      if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; | 
| 144 | 0 |      *dst = ((uc & 0x1f)<<6) | (*str & 0x3f); | 
| 145 | 0 |      return 2; | 
| 146 | 0 |   } | 
| 147 | 0 |   if (uc < 0xf0) {        // 3-byte sequence | 
| 148 | 0 |      if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) | 
| 149 | 0 |         return UTF8PROC_ERROR_INVALIDUTF8; | 
| 150 |  |      // Check for surrogate chars | 
| 151 | 0 |      if (uc == 0xed && *str > 0x9f) | 
| 152 | 0 |          return UTF8PROC_ERROR_INVALIDUTF8; | 
| 153 | 0 |      uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); | 
| 154 | 0 |      if (uc < 0x800) | 
| 155 | 0 |          return UTF8PROC_ERROR_INVALIDUTF8; | 
| 156 | 0 |      *dst = uc; | 
| 157 | 0 |      return 3; | 
| 158 | 0 |   } | 
| 159 |  |   // 4-byte sequence | 
| 160 |  |   // Must have 3 valid continuation characters | 
| 161 | 0 |   if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) | 
| 162 | 0 |      return UTF8PROC_ERROR_INVALIDUTF8; | 
| 163 |  |   // Make sure in correct range (0x10000 - 0x10ffff) | 
| 164 | 0 |   if (uc == 0xf0) { | 
| 165 | 0 |     if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; | 
| 166 | 0 |   } else if (uc == 0xf4) { | 
| 167 | 0 |     if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; | 
| 168 | 0 |   } | 
| 169 | 0 |   *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); | 
| 170 | 0 |   return 4; | 
| 171 | 0 | } | 
| 172 |  |  | 
| 173 | 0 | nc_utf8proc_bool nc_utf8proc_codepoint_valid(nc_utf8proc_int32_t uc) { | 
| 174 | 0 |     return (((nc_utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((nc_utf8proc_uint32_t)uc < 0x110000); | 
| 175 | 0 | } | 
| 176 |  |  | 
| 177 | 0 | nc_utf8proc_ssize_t nc_utf8proc_encode_char(nc_utf8proc_int32_t uc, nc_utf8proc_uint8_t *dst) { | 
| 178 | 0 |   if (uc < 0x00) { | 
| 179 | 0 |     return 0; | 
| 180 | 0 |   } else if (uc < 0x80) { | 
| 181 | 0 |     dst[0] = (nc_utf8proc_uint8_t) uc; | 
| 182 | 0 |     return 1; | 
| 183 | 0 |   } else if (uc < 0x800) { | 
| 184 | 0 |     dst[0] = (nc_utf8proc_uint8_t)(0xC0 + (uc >> 6)); | 
| 185 | 0 |     dst[1] = (nc_utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | 
| 186 | 0 |     return 2; | 
| 187 |  |   // Note: we allow encoding 0xd800-0xdfff here, so as not to change | 
| 188 |  |   // the API, however, these are actually invalid in UTF-8 | 
| 189 | 0 |   } else if (uc < 0x10000) { | 
| 190 | 0 |     dst[0] = (nc_utf8proc_uint8_t)(0xE0 + (uc >> 12)); | 
| 191 | 0 |     dst[1] = (nc_utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | 
| 192 | 0 |     dst[2] = (nc_utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | 
| 193 | 0 |     return 3; | 
| 194 | 0 |   } else if (uc < 0x110000) { | 
| 195 | 0 |     dst[0] = (nc_utf8proc_uint8_t)(0xF0 + (uc >> 18)); | 
| 196 | 0 |     dst[1] = (nc_utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); | 
| 197 | 0 |     dst[2] = (nc_utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | 
| 198 | 0 |     dst[3] = (nc_utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | 
| 199 | 0 |     return 4; | 
| 200 | 0 |   } else return 0; | 
| 201 | 0 | } | 
| 202 |  |  | 
| 203 |  | /* internal version used for inserting 0xff bytes between graphemes */ | 
| 204 | 0 | static nc_utf8proc_ssize_t nc_charbound_encode_char(nc_utf8proc_int32_t uc, nc_utf8proc_uint8_t *dst) { | 
| 205 | 0 |    if (uc < 0x00) { | 
| 206 | 0 |       if (uc == -1) { /* internal value used for grapheme breaks */ | 
| 207 | 0 |         dst[0] = (nc_utf8proc_uint8_t)0xFF; | 
| 208 | 0 |         return 1; | 
| 209 | 0 |       } | 
| 210 | 0 |       return 0; | 
| 211 | 0 |    } else if (uc < 0x80) { | 
| 212 | 0 |       dst[0] = (nc_utf8proc_uint8_t)uc; | 
| 213 | 0 |       return 1; | 
| 214 | 0 |    } else if (uc < 0x800) { | 
| 215 | 0 |       dst[0] = (nc_utf8proc_uint8_t)(0xC0 + (uc >> 6)); | 
| 216 | 0 |       dst[1] = (nc_utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | 
| 217 | 0 |       return 2; | 
| 218 | 0 |    } else if (uc < 0x10000) { | 
| 219 | 0 |       dst[0] = (nc_utf8proc_uint8_t)(0xE0 + (uc >> 12)); | 
| 220 | 0 |       dst[1] = (nc_utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | 
| 221 | 0 |       dst[2] = (nc_utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | 
| 222 | 0 |       return 3; | 
| 223 | 0 |    } else if (uc < 0x110000) { | 
| 224 | 0 |       dst[0] = (nc_utf8proc_uint8_t)(0xF0 + (uc >> 18)); | 
| 225 | 0 |       dst[1] = (nc_utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); | 
| 226 | 0 |       dst[2] = (nc_utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | 
| 227 | 0 |       dst[3] = (nc_utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | 
| 228 | 0 |       return 4; | 
| 229 | 0 |    } else return 0; | 
| 230 | 0 | } | 
| 231 |  |  | 
| 232 |  | /* internal "unsafe" version that does not check whether uc is in range */ | 
| 233 | 0 | static const nc_utf8proc_property_t *nc_unsafe_get_property(nc_utf8proc_int32_t uc) { | 
| 234 |  |   /* ASSERT: uc >= 0 && uc < 0x110000 */ | 
| 235 | 0 |   return nc_utf8proc_properties + ( | 
| 236 | 0 |     nc_utf8proc_stage2table[ | 
| 237 | 0 |       nc_utf8proc_stage1table[uc >> 8] + (uc & 0xFF) | 
| 238 | 0 |     ] | 
| 239 | 0 |   ); | 
| 240 | 0 | } | 
| 241 |  |  | 
| 242 | 0 | const nc_utf8proc_property_t *nc_utf8proc_get_property(nc_utf8proc_int32_t uc) { | 
| 243 | 0 |   return uc < 0 || uc >= 0x110000 ? nc_utf8proc_properties : nc_unsafe_get_property(uc); | 
| 244 | 0 | } | 
| 245 |  |  | 
| 246 |  | /* return whether there is a grapheme break between boundclasses lbc and tbc | 
| 247 |  |    (according to the definition of extended grapheme clusters) | 
| 248 |  |  | 
| 249 |  |   Rule numbering refers to TR29 Version 29 (Unicode 9.0.0): | 
| 250 |  |   http://www.unicode.org/reports/tr29/tr29-29.html | 
| 251 |  |  | 
| 252 |  |   CAVEATS: | 
| 253 |  |    Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences) | 
| 254 |  |    and GB 12/13 (regional indicator code points) require knowledge of previous characters | 
| 255 |  |    and are thus not handled by this function. This may result in an incorrect break before | 
| 256 |  |    an E_Modifier class codepoint and an incorrectly missing break between two | 
| 257 |  |    REGIONAL_INDICATOR class code points if such support does not exist in the caller. | 
| 258 |  |  | 
| 259 |  |    See the special support in grapheme_break_extended, for required bookkeeping by the caller. | 
| 260 |  | */ | 
| 261 | 0 | static nc_utf8proc_bool nc_grapheme_break_simple(int lbc, int tbc) { | 
| 262 | 0 |   return | 
| 263 | 0 |     (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       // GB1 | 
| 264 | 0 |     (lbc == UTF8PROC_BOUNDCLASS_CR &&                 // GB3 | 
| 265 | 0 |      tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         // --- | 
| 266 | 0 |     (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4 | 
| 267 | 0 |     (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5 | 
| 268 | 0 |     (lbc == UTF8PROC_BOUNDCLASS_L &&                  // GB6 | 
| 269 | 0 |      (tbc == UTF8PROC_BOUNDCLASS_L ||                 // --- | 
| 270 | 0 |       tbc == UTF8PROC_BOUNDCLASS_V ||                 // --- | 
| 271 | 0 |       tbc == UTF8PROC_BOUNDCLASS_LV ||                // --- | 
| 272 | 0 |       tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      // --- | 
| 273 | 0 |     ((lbc == UTF8PROC_BOUNDCLASS_LV ||                // GB7 | 
| 274 | 0 |       lbc == UTF8PROC_BOUNDCLASS_V) &&                // --- | 
| 275 | 0 |      (tbc == UTF8PROC_BOUNDCLASS_V ||                 // --- | 
| 276 | 0 |       tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        // --- | 
| 277 | 0 |     ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               // GB8 | 
| 278 | 0 |       lbc == UTF8PROC_BOUNDCLASS_T) &&                // --- | 
| 279 | 0 |      tbc == UTF8PROC_BOUNDCLASS_T) ? false :          // --- | 
| 280 | 0 |     (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             // GB9 | 
| 281 | 0 |      tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // --- | 
| 282 | 0 |      tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a | 
| 283 | 0 |      lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b | 
| 284 | 0 |     (lbc == UTF8PROC_BOUNDCLASS_E_ZWG &&              // GB11 (requires additional handling below) | 
| 285 | 0 |      tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ---- | 
| 286 | 0 |     (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below) | 
| 287 | 0 |      tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ---- | 
| 288 | 0 |     true; // GB999 | 
| 289 | 0 | } | 
| 290 |  |  | 
| 291 |  | static nc_utf8proc_bool nc_grapheme_break_extended(int lbc, int tbc, int licb, int ticb, nc_utf8proc_int32_t *state) | 
| 292 | 0 | { | 
| 293 | 0 |   if (state) { | 
| 294 | 0 |     int state_bc, state_icb; /* boundclass and indic_conjunct_break state */ | 
| 295 | 0 |     if (*state == 0) { /* state initialization */ | 
| 296 | 0 |       state_bc = lbc; | 
| 297 | 0 |       state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE; | 
| 298 | 0 |     } | 
| 299 | 0 |     else { /* lbc and licb are already encoded in *state */ | 
| 300 | 0 |       state_bc = *state & 0xff;  // 1st byte of state is bound class | 
| 301 | 0 |       state_icb = *state >> 8;   // 2nd byte of state is indic conjunct break | 
| 302 | 0 |     } | 
| 303 |  | 
 | 
| 304 | 0 |     nc_utf8proc_bool break_permitted = nc_grapheme_break_simple(state_bc, tbc) && | 
| 305 | 0 |        !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER | 
| 306 | 0 |         && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c | 
| 307 |  |  | 
| 308 |  |     // Special support for GB9c.  Don't break between two consonants | 
| 309 |  |     // separated 1+ linker characters and 0+ extend characters in any order. | 
| 310 |  |     // After a consonant, we enter LINKER state after at least one linker. | 
| 311 | 0 |     if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT | 
| 312 | 0 |         || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT | 
| 313 | 0 |         || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) | 
| 314 | 0 |       state_icb = ticb; | 
| 315 | 0 |     else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER) | 
| 316 | 0 |       state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ? | 
| 317 | 0 |                   UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb; | 
| 318 |  |  | 
| 319 |  |     // Special support for GB 12/13 made possible by GB999. After two RI | 
| 320 |  |     // class codepoints we want to force a break. Do this by resetting the | 
| 321 |  |     // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break | 
| 322 |  |     // after that character according to GB999 (unless of course such a break is | 
| 323 |  |     // forbidden by a different rule such as GB9). | 
| 324 | 0 |     if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) | 
| 325 | 0 |       state_bc = UTF8PROC_BOUNDCLASS_OTHER; | 
| 326 |  |     // Special support for GB11 (emoji extend* zwj / emoji) | 
| 327 | 0 |     else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) { | 
| 328 | 0 |       if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji | 
| 329 | 0 |         state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC; | 
| 330 | 0 |       else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ) | 
| 331 | 0 |         state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo | 
| 332 | 0 |       else | 
| 333 | 0 |         state_bc = tbc; | 
| 334 | 0 |     } | 
| 335 | 0 |     else | 
| 336 | 0 |       state_bc = tbc; | 
| 337 |  | 
 | 
| 338 | 0 |     *state = state_bc + (state_icb << 8); | 
| 339 | 0 |     return break_permitted; | 
| 340 | 0 |   } | 
| 341 | 0 |   else | 
| 342 | 0 |     return nc_grapheme_break_simple(lbc, tbc); | 
| 343 | 0 | } | 
| 344 |  |  | 
| 345 |  | nc_utf8proc_bool nc_utf8proc_grapheme_break_stateful( | 
| 346 | 0 |     nc_utf8proc_int32_t c1, nc_utf8proc_int32_t c2, nc_utf8proc_int32_t *state) { | 
| 347 |  | 
 | 
| 348 | 0 |   const nc_utf8proc_property_t *p1 = nc_utf8proc_get_property(c1); | 
| 349 | 0 |   const nc_utf8proc_property_t *p2 = nc_utf8proc_get_property(c2); | 
| 350 | 0 |   return nc_grapheme_break_extended(p1->boundclass, | 
| 351 | 0 |                                  p2->boundclass, | 
| 352 | 0 |                                  p1->indic_conjunct_break, | 
| 353 | 0 |                                  p2->indic_conjunct_break, | 
| 354 | 0 |                                  state); | 
| 355 | 0 | } | 
| 356 |  |  | 
| 357 |  |  | 
| 358 |  | nc_utf8proc_bool nc_utf8proc_grapheme_break( | 
| 359 | 0 |     nc_utf8proc_int32_t c1, nc_utf8proc_int32_t c2) { | 
| 360 | 0 |   return nc_utf8proc_grapheme_break_stateful(c1, c2, NULL); | 
| 361 | 0 | } | 
| 362 |  |  | 
| 363 |  | static nc_utf8proc_int32_t nc_seqindex_decode_entry(const nc_utf8proc_uint16_t **entry) | 
| 364 | 0 | { | 
| 365 | 0 |   nc_utf8proc_int32_t entry_cp = **entry; | 
| 366 | 0 |   if ((entry_cp & 0xF800) == 0xD800) { | 
| 367 | 0 |     *entry = *entry + 1; | 
| 368 | 0 |     entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF); | 
| 369 | 0 |     entry_cp += 0x10000; | 
| 370 | 0 |   } | 
| 371 | 0 |   return entry_cp; | 
| 372 | 0 | } | 
| 373 |  |  | 
| 374 |  | static nc_utf8proc_int32_t nc_seqindex_decode_index(const nc_utf8proc_uint32_t seqindex) | 
| 375 | 0 | { | 
| 376 | 0 |   const nc_utf8proc_uint16_t *entry = &nc_utf8proc_sequences[seqindex]; | 
| 377 | 0 |   return nc_seqindex_decode_entry(&entry); | 
| 378 | 0 | } | 
| 379 |  |  | 
| 380 | 0 | static nc_utf8proc_ssize_t nc_seqindex_write_char_decomposed(nc_utf8proc_uint16_t seqindex, nc_utf8proc_int32_t *dst, nc_utf8proc_ssize_t bufsize, nc_utf8proc_option_t options, int *last_boundclass) { | 
| 381 | 0 |   nc_utf8proc_ssize_t written = 0; | 
| 382 | 0 |   const nc_utf8proc_uint16_t *entry = &nc_utf8proc_sequences[seqindex & 0x3FFF]; | 
| 383 | 0 |   int len = seqindex >> 14; | 
| 384 | 0 |   if (len >= 3) { | 
| 385 | 0 |     len = *entry; | 
| 386 | 0 |     entry++; | 
| 387 | 0 |   } | 
| 388 | 0 |   for (; len >= 0; entry++, len--) { | 
| 389 | 0 |     nc_utf8proc_int32_t entry_cp = nc_seqindex_decode_entry(&entry); | 
| 390 |  | 
 | 
| 391 | 0 |     written += nc_utf8proc_decompose_char(entry_cp, dst+written, | 
| 392 | 0 |       (bufsize > written) ? (bufsize - written) : 0, options, | 
| 393 | 0 |     last_boundclass); | 
| 394 | 0 |     if (written < 0) return UTF8PROC_ERROR_OVERFLOW; | 
| 395 | 0 |   } | 
| 396 | 0 |   return written; | 
| 397 | 0 | } | 
| 398 |  |  | 
| 399 |  | nc_utf8proc_int32_t nc_utf8proc_tolower(nc_utf8proc_int32_t c) | 
| 400 | 0 | { | 
| 401 | 0 |   nc_utf8proc_int32_t cl = nc_utf8proc_get_property(c)->lowercase_seqindex; | 
| 402 | 0 |   return cl != UINT16_MAX ? nc_seqindex_decode_index((nc_utf8proc_uint32_t)cl) : c; | 
| 403 | 0 | } | 
| 404 |  |  | 
| 405 |  | nc_utf8proc_int32_t nc_utf8proc_toupper(nc_utf8proc_int32_t c) | 
| 406 | 0 | { | 
| 407 | 0 |   nc_utf8proc_int32_t cu = nc_utf8proc_get_property(c)->uppercase_seqindex; | 
| 408 | 0 |   return cu != UINT16_MAX ? nc_seqindex_decode_index((nc_utf8proc_uint32_t)cu) : c; | 
| 409 | 0 | } | 
| 410 |  |  | 
| 411 |  | nc_utf8proc_int32_t nc_utf8proc_totitle(nc_utf8proc_int32_t c) | 
| 412 | 0 | { | 
| 413 | 0 |   nc_utf8proc_int32_t cu = nc_utf8proc_get_property(c)->titlecase_seqindex; | 
| 414 | 0 |   return cu != UINT16_MAX ? nc_seqindex_decode_index((nc_utf8proc_uint32_t)cu) : c; | 
| 415 | 0 | } | 
| 416 |  |  | 
| 417 |  | int nc_utf8proc_islower(nc_utf8proc_int32_t c) | 
| 418 | 0 | { | 
| 419 | 0 |   const nc_utf8proc_property_t *p = nc_utf8proc_get_property(c); | 
| 420 | 0 |   return p->lowercase_seqindex != p->uppercase_seqindex && p->lowercase_seqindex == UINT16_MAX; | 
| 421 | 0 | } | 
| 422 |  |  | 
| 423 |  | int nc_utf8proc_isupper(nc_utf8proc_int32_t c) | 
| 424 | 0 | { | 
| 425 | 0 |   const nc_utf8proc_property_t *p = nc_utf8proc_get_property(c); | 
| 426 | 0 |   return p->lowercase_seqindex != p->uppercase_seqindex && p->uppercase_seqindex == UINT16_MAX && p->category != UTF8PROC_CATEGORY_LT; | 
| 427 | 0 | } | 
| 428 |  |  | 
| 429 |  | /* return a character width analogous to wcwidth (except portable and | 
| 430 |  |    hopefully less buggy than most system wcwidth functions). */ | 
| 431 | 0 | int nc_utf8proc_charwidth(nc_utf8proc_int32_t c) { | 
| 432 | 0 |   return nc_utf8proc_get_property(c)->charwidth; | 
| 433 | 0 | } | 
| 434 |  |  | 
| 435 | 0 | nc_utf8proc_category_t nc_utf8proc_category(nc_utf8proc_int32_t c) { | 
| 436 | 0 |   return (nc_utf8proc_category_t) nc_utf8proc_get_property(c)->category; | 
| 437 | 0 | } | 
| 438 |  |  | 
| 439 | 0 | const char *nc_utf8proc_category_string(nc_utf8proc_int32_t c) { | 
| 440 | 0 |   static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; | 
| 441 | 0 |   return s[nc_utf8proc_category(c)]; | 
| 442 | 0 | } | 
| 443 |  |  | 
/* Helper for the UTF8PROC_LUMP branch of nc_utf8proc_decompose_char.
   NOTE: expands to a `return` statement, so it leaves the enclosing function.
   The replacement code point is decomposed with UTF8PROC_LUMP cleared so
   that it is not lumped a second time. */
#define nc_utf8proc_decompose_lump(replacement_uc) \
  return nc_utf8proc_decompose_char((replacement_uc), dst, bufsize, \
  options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass)

/* Decompose / map a single code point according to `options`.
 *
 *  uc               code point to process
 *  dst, bufsize     output buffer and its capacity in code points; results
 *                   that do not fit are not written, but still counted
 *  options          UTF8PROC_* option flags
 *  last_boundclass  grapheme-break state, only used with UTF8PROC_CHARBOUND
 *
 * Returns the number of code points the result requires (which may exceed
 * bufsize), 0 when the code point is dropped, or a negative error code
 * (UTF8PROC_ERROR_NOTASSIGNED for out-of-range input or, with
 * UTF8PROC_REJECTNA, for a code point whose category is 0).
 */
nc_utf8proc_ssize_t nc_utf8proc_decompose_char(nc_utf8proc_int32_t uc, nc_utf8proc_int32_t *dst, nc_utf8proc_ssize_t bufsize, nc_utf8proc_option_t options, int *last_boundclass) {
  const nc_utf8proc_property_t *property;
  nc_utf8proc_propval_t category;
  nc_utf8proc_int32_t hangul_sindex;
  if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
  /* range was checked above, so the unchecked lookup is safe here */
  property = nc_unsafe_get_property(uc);
  category = property->category;
  hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
    /* Hangul syllables are decomposed arithmetically into 2 or 3 parts */
    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
      nc_utf8proc_int32_t hangul_tindex;
      if (bufsize >= 1) {
        dst[0] = UTF8PROC_HANGUL_LBASE +
          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
      }
      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
      /* a trailing index of 0 means there is no third part */
      if (!hangul_tindex) return 2;
      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
      return 3;
    }
  }
  if (options & UTF8PROC_REJECTNA) {
    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
  }
  if (options & UTF8PROC_IGNORE) {
    if (property->ignorable) return 0;
  }
  if (options & UTF8PROC_STRIPNA) {
    if (!category) return 0;
  }
  if (options & UTF8PROC_LUMP) {
    /* each nc_utf8proc_decompose_lump() below returns from this function */
    if (category == UTF8PROC_CATEGORY_ZS) nc_utf8proc_decompose_lump(0x0020);
    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
      nc_utf8proc_decompose_lump(0x0027);
    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
      nc_utf8proc_decompose_lump(0x002D);
    if (uc == 0x2044 || uc == 0x2215) nc_utf8proc_decompose_lump(0x002F);
    if (uc == 0x2236) nc_utf8proc_decompose_lump(0x003A);
    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
      nc_utf8proc_decompose_lump(0x003C);
    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
      nc_utf8proc_decompose_lump(0x003E);
    if (uc == 0x2216) nc_utf8proc_decompose_lump(0x005C);
    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
      nc_utf8proc_decompose_lump(0x005E);
    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
      nc_utf8proc_decompose_lump(0x005F);
    if (uc == 0x02CB) nc_utf8proc_decompose_lump(0x0060);
    if (uc == 0x2223) nc_utf8proc_decompose_lump(0x007C);
    if (uc == 0x223C) nc_utf8proc_decompose_lump(0x007E);
    /* only when BOTH newline options are set are Zl/Zp lumped to LF */
    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
      if (category == UTF8PROC_CATEGORY_ZL ||
          category == UTF8PROC_CATEGORY_ZP)
        nc_utf8proc_decompose_lump(0x000A);
    }
  }
  if (options & UTF8PROC_STRIPMARK) {
    if (category == UTF8PROC_CATEGORY_MN ||
      category == UTF8PROC_CATEGORY_MC ||
      category == UTF8PROC_CATEGORY_ME) return 0;
  }
  if (options & UTF8PROC_CASEFOLD) {
    /* UINT16_MAX marks "no mapping" in the seqindex fields */
    if (property->casefold_seqindex != UINT16_MAX) {
      return nc_seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
    }
  }
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
    /* a non-zero decomp_type is only followed when UTF8PROC_COMPAT is set */
    if (property->decomp_seqindex != UINT16_MAX &&
        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
      return nc_seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
    }
  }
  if (options & UTF8PROC_CHARBOUND) {
    nc_utf8proc_bool boundary;
    /* the left-hand class arguments are 0 here: the previous character is
       carried in *last_boundclass instead */
    boundary = nc_grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
                                       last_boundclass);
    if (boundary) {
      if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
      if (bufsize >= 2) dst[1] = uc;
      return 2;
    }
  }
  /* default: the code point maps to itself */
  if (bufsize >= 1) *dst = uc;
  return 1;
}
| 535 |  |  | 
| 536 |  | nc_utf8proc_ssize_t nc_utf8proc_decompose( | 
| 537 |  |   const nc_utf8proc_uint8_t *str, nc_utf8proc_ssize_t strlen, | 
| 538 |  |   nc_utf8proc_int32_t *buffer, nc_utf8proc_ssize_t bufsize, nc_utf8proc_option_t options | 
| 539 | 0 | ) { | 
| 540 | 0 |     return nc_utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); | 
| 541 | 0 | } | 
| 542 |  |  | 
| 543 |  | nc_utf8proc_ssize_t nc_utf8proc_decompose_custom( | 
| 544 |  |   const nc_utf8proc_uint8_t *str, nc_utf8proc_ssize_t strlen, | 
| 545 |  |   nc_utf8proc_int32_t *buffer, nc_utf8proc_ssize_t bufsize, nc_utf8proc_option_t options, | 
| 546 |  |   nc_utf8proc_custom_func custom_func, void *custom_data | 
| 547 | 0 | ) { | 
| 548 |  |   /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ | 
| 549 | 0 |   nc_utf8proc_ssize_t wpos = 0; | 
| 550 | 0 |   if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) | 
| 551 | 0 |     return UTF8PROC_ERROR_INVALIDOPTS; | 
| 552 | 0 |   if ((options & UTF8PROC_STRIPMARK) && | 
| 553 | 0 |       !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) | 
| 554 | 0 |     return UTF8PROC_ERROR_INVALIDOPTS; | 
| 555 | 0 |   { | 
| 556 | 0 |     nc_utf8proc_int32_t uc; | 
| 557 | 0 |     nc_utf8proc_ssize_t rpos = 0; | 
| 558 | 0 |     nc_utf8proc_ssize_t decomp_result; | 
| 559 | 0 |     int boundclass = UTF8PROC_BOUNDCLASS_START; | 
| 560 | 0 |     while (1) { | 
| 561 | 0 |       if (options & UTF8PROC_NULLTERM) { | 
| 562 | 0 |         rpos += nc_utf8proc_iterate(str + rpos, -1, &uc); | 
| 563 |  |         /* checking of return value is not necessary, | 
| 564 |  |            as 'uc' is < 0 in case of error */ | 
| 565 | 0 |         if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; | 
| 566 | 0 |         if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; | 
| 567 | 0 |         if (uc == 0) break; | 
| 568 | 0 |       } else { | 
| 569 | 0 |         if (rpos >= strlen) break; | 
| 570 | 0 |         rpos += nc_utf8proc_iterate(str + rpos, strlen - rpos, &uc); | 
| 571 | 0 |         if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; | 
| 572 | 0 |       } | 
| 573 | 0 |       if (custom_func != NULL) { | 
| 574 | 0 |         uc = custom_func(uc, custom_data);   /* user-specified custom mapping */ | 
| 575 | 0 |       } | 
| 576 | 0 |       decomp_result = nc_utf8proc_decompose_char( | 
| 577 | 0 |         uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, | 
| 578 | 0 |         &boundclass | 
| 579 | 0 |       ); | 
| 580 | 0 |       if (decomp_result < 0) return decomp_result; | 
| 581 | 0 |       wpos += decomp_result; | 
| 582 |  |       /* prohibiting integer overflows due to too long strings: */ | 
| 583 | 0 |       if (wpos < 0 || | 
| 584 | 0 |           wpos > (nc_utf8proc_ssize_t)(SSIZE_MAX/sizeof(nc_utf8proc_int32_t)/2)) | 
| 585 | 0 |         return UTF8PROC_ERROR_OVERFLOW; | 
| 586 | 0 |     } | 
| 587 | 0 |   } | 
| 588 | 0 |   if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { | 
| 589 | 0 |     nc_utf8proc_ssize_t pos = 0; | 
| 590 | 0 |     while (pos < wpos-1) { | 
| 591 | 0 |       nc_utf8proc_int32_t uc1, uc2; | 
| 592 | 0 |       const nc_utf8proc_property_t *property1, *property2; | 
| 593 | 0 |       uc1 = buffer[pos]; | 
| 594 | 0 |       uc2 = buffer[pos+1]; | 
| 595 | 0 |       property1 = nc_unsafe_get_property(uc1); | 
| 596 | 0 |       property2 = nc_unsafe_get_property(uc2); | 
| 597 | 0 |       if (property1->combining_class > property2->combining_class && | 
| 598 | 0 |           property2->combining_class > 0) { | 
| 599 | 0 |         buffer[pos] = uc2; | 
| 600 | 0 |         buffer[pos+1] = uc1; | 
| 601 | 0 |         if (pos > 0) pos--; else pos++; | 
| 602 | 0 |       } else { | 
| 603 | 0 |         pos++; | 
| 604 | 0 |       } | 
| 605 | 0 |     } | 
| 606 | 0 |   } | 
| 607 | 0 |   return wpos; | 
| 608 | 0 | } | 
| 609 |  |  | 
| 610 | 0 | nc_utf8proc_ssize_t nc_utf8proc_normalize_utf32(nc_utf8proc_int32_t *buffer, nc_utf8proc_ssize_t length, nc_utf8proc_option_t options) { | 
| 611 |  |   /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ | 
| 612 | 0 |   if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { | 
| 613 | 0 |     nc_utf8proc_ssize_t rpos; | 
| 614 | 0 |     nc_utf8proc_ssize_t wpos = 0; | 
| 615 | 0 |     nc_utf8proc_int32_t uc; | 
| 616 | 0 |     for (rpos = 0; rpos < length; rpos++) { | 
| 617 | 0 |       uc = buffer[rpos]; | 
| 618 | 0 |       if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; | 
| 619 | 0 |       if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || | 
| 620 | 0 |           ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { | 
| 621 | 0 |         if (options & UTF8PROC_NLF2LS) { | 
| 622 | 0 |           if (options & UTF8PROC_NLF2PS) { | 
| 623 | 0 |             buffer[wpos++] = 0x000A; | 
| 624 | 0 |           } else { | 
| 625 | 0 |             buffer[wpos++] = 0x2028; | 
| 626 | 0 |           } | 
| 627 | 0 |         } else { | 
| 628 | 0 |           if (options & UTF8PROC_NLF2PS) { | 
| 629 | 0 |             buffer[wpos++] = 0x2029; | 
| 630 | 0 |           } else { | 
| 631 | 0 |             buffer[wpos++] = 0x0020; | 
| 632 | 0 |           } | 
| 633 | 0 |         } | 
| 634 | 0 |       } else if ((options & UTF8PROC_STRIPCC) && | 
| 635 | 0 |           (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { | 
| 636 | 0 |         if (uc == 0x0009) buffer[wpos++] = 0x0020; | 
| 637 | 0 |       } else { | 
| 638 | 0 |         buffer[wpos++] = uc; | 
| 639 | 0 |       } | 
| 640 | 0 |     } | 
| 641 | 0 |     length = wpos; | 
| 642 | 0 |   } | 
| 643 | 0 |   if (options & UTF8PROC_COMPOSE) { | 
| 644 | 0 |     nc_utf8proc_int32_t *starter = NULL; | 
| 645 | 0 |     nc_utf8proc_int32_t current_char; | 
| 646 | 0 |     const nc_utf8proc_property_t *starter_property = NULL, *current_property; | 
| 647 | 0 |     nc_utf8proc_propval_t max_combining_class = -1; | 
| 648 | 0 |     nc_utf8proc_ssize_t rpos; | 
| 649 | 0 |     nc_utf8proc_ssize_t wpos = 0; | 
| 650 | 0 |     nc_utf8proc_int32_t composition; | 
| 651 | 0 |     for (rpos = 0; rpos < length; rpos++) { | 
| 652 | 0 |       current_char = buffer[rpos]; | 
| 653 | 0 |       current_property = nc_unsafe_get_property(current_char); | 
| 654 | 0 |       if (starter && current_property->combining_class > max_combining_class) { | 
| 655 |  |         /* combination perhaps possible */ | 
| 656 | 0 |         nc_utf8proc_int32_t hangul_lindex; | 
| 657 | 0 |         nc_utf8proc_int32_t hangul_sindex; | 
| 658 | 0 |         hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; | 
| 659 | 0 |         if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { | 
| 660 | 0 |           nc_utf8proc_int32_t hangul_vindex; | 
| 661 | 0 |           hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; | 
| 662 | 0 |           if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { | 
| 663 | 0 |             *starter = UTF8PROC_HANGUL_SBASE + | 
| 664 | 0 |               (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * | 
| 665 | 0 |               UTF8PROC_HANGUL_TCOUNT; | 
| 666 | 0 |             starter_property = NULL; | 
| 667 | 0 |             continue; | 
| 668 | 0 |           } | 
| 669 | 0 |         } | 
| 670 | 0 |         hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; | 
| 671 | 0 |         if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && | 
| 672 | 0 |             (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { | 
| 673 | 0 |           nc_utf8proc_int32_t hangul_tindex; | 
| 674 | 0 |           hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; | 
| 675 | 0 |           if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { | 
| 676 | 0 |             *starter += hangul_tindex; | 
| 677 | 0 |             starter_property = NULL; | 
| 678 | 0 |             continue; | 
| 679 | 0 |           } | 
| 680 | 0 |         } | 
| 681 | 0 |         if (!starter_property) { | 
| 682 | 0 |           starter_property = nc_unsafe_get_property(*starter); | 
| 683 | 0 |         } | 
| 684 | 0 |         if (starter_property->comb_index < 0x8000 && | 
| 685 | 0 |             current_property->comb_index != UINT16_MAX && | 
| 686 | 0 |             current_property->comb_index >= 0x8000) { | 
| 687 | 0 |           int sidx = starter_property->comb_index; | 
| 688 | 0 |           int idx = current_property->comb_index & 0x3FFF; | 
| 689 | 0 |           if (idx >= nc_utf8proc_combinations[sidx] && idx <= nc_utf8proc_combinations[sidx + 1] ) { | 
| 690 | 0 |             idx += sidx + 2 - nc_utf8proc_combinations[sidx]; | 
| 691 | 0 |             if (current_property->comb_index & 0x4000) { | 
| 692 | 0 |               composition = (nc_utf8proc_combinations[idx] << 16) | nc_utf8proc_combinations[idx+1]; | 
| 693 | 0 |             } else | 
| 694 | 0 |               composition = nc_utf8proc_combinations[idx]; | 
| 695 |  | 
 | 
| 696 | 0 |             if (composition > 0 && (!(options & UTF8PROC_STABLE) || | 
| 697 | 0 |                 !(nc_unsafe_get_property(composition)->comp_exclusion))) { | 
| 698 | 0 |               *starter = composition; | 
| 699 | 0 |               starter_property = NULL; | 
| 700 | 0 |               continue; | 
| 701 | 0 |             } | 
| 702 | 0 |           } | 
| 703 | 0 |         } | 
| 704 | 0 |       } | 
| 705 | 0 |       buffer[wpos] = current_char; | 
| 706 | 0 |       if (current_property->combining_class) { | 
| 707 | 0 |         if (current_property->combining_class > max_combining_class) { | 
| 708 | 0 |           max_combining_class = current_property->combining_class; | 
| 709 | 0 |         } | 
| 710 | 0 |       } else { | 
| 711 | 0 |         starter = buffer + wpos; | 
| 712 | 0 |         starter_property = NULL; | 
| 713 | 0 |         max_combining_class = -1; | 
| 714 | 0 |       } | 
| 715 | 0 |       wpos++; | 
| 716 | 0 |     } | 
| 717 | 0 |     length = wpos; | 
| 718 | 0 |   } | 
| 719 | 0 |   return length; | 
| 720 | 0 | } | 
| 721 |  |  | 
| 722 | 0 | nc_utf8proc_ssize_t nc_utf8proc_reencode(nc_utf8proc_int32_t *buffer, nc_utf8proc_ssize_t length, nc_utf8proc_option_t options) { | 
| 723 |  |   /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored | 
| 724 |  |      ASSERT: 'buffer' has one spare byte of free space at the end! */ | 
| 725 | 0 |   length = nc_utf8proc_normalize_utf32(buffer, length, options); | 
| 726 | 0 |   if (length < 0) return length; | 
| 727 | 0 |   { | 
| 728 | 0 |     nc_utf8proc_ssize_t rpos, wpos = 0; | 
| 729 | 0 |     nc_utf8proc_int32_t uc; | 
| 730 | 0 |     if (options & UTF8PROC_CHARBOUND) { | 
| 731 | 0 |         for (rpos = 0; rpos < length; rpos++) { | 
| 732 | 0 |             uc = buffer[rpos]; | 
| 733 | 0 |             wpos += nc_charbound_encode_char(uc, ((nc_utf8proc_uint8_t *)buffer) + wpos); | 
| 734 | 0 |         } | 
| 735 | 0 |     } else { | 
| 736 | 0 |         for (rpos = 0; rpos < length; rpos++) { | 
| 737 | 0 |             uc = buffer[rpos]; | 
| 738 | 0 |             wpos += nc_utf8proc_encode_char(uc, ((nc_utf8proc_uint8_t *)buffer) + wpos); | 
| 739 | 0 |         } | 
| 740 | 0 |     } | 
| 741 | 0 |     ((nc_utf8proc_uint8_t *)buffer)[wpos] = 0; | 
| 742 | 0 |     return wpos; | 
| 743 | 0 |   } | 
| 744 | 0 | } | 
| 745 |  |  | 
| 746 |  | nc_utf8proc_ssize_t nc_utf8proc_map( | 
| 747 |  |   const nc_utf8proc_uint8_t *str, nc_utf8proc_ssize_t strlen, nc_utf8proc_uint8_t **dstptr, nc_utf8proc_option_t options | 
| 748 | 0 | ) { | 
| 749 | 0 |     return nc_utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); | 
| 750 | 0 | } | 
| 751 |  |  | 
| 752 |  | nc_utf8proc_ssize_t nc_utf8proc_map_custom( | 
| 753 |  |   const nc_utf8proc_uint8_t *str, nc_utf8proc_ssize_t strlen, nc_utf8proc_uint8_t **dstptr, nc_utf8proc_option_t options, | 
| 754 |  |   nc_utf8proc_custom_func custom_func, void *custom_data | 
| 755 | 0 | ) { | 
| 756 | 0 |   nc_utf8proc_int32_t *buffer; | 
| 757 | 0 |   nc_utf8proc_ssize_t result; | 
| 758 | 0 |   *dstptr = NULL; | 
| 759 | 0 |   result = nc_utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); | 
| 760 | 0 |   if (result < 0) return result; | 
| 761 | 0 |   buffer = (nc_utf8proc_int32_t *) malloc(((nc_utf8proc_size_t)result) * sizeof(nc_utf8proc_int32_t) + 1); | 
| 762 | 0 |   if (!buffer) return UTF8PROC_ERROR_NOMEM; | 
| 763 | 0 |   result = nc_utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); | 
| 764 | 0 |   if (result < 0) { | 
| 765 | 0 |     free(buffer); | 
| 766 | 0 |     return result; | 
| 767 | 0 |   } | 
| 768 | 0 |   result = nc_utf8proc_reencode(buffer, result, options); | 
| 769 | 0 |   if (result < 0) { | 
| 770 | 0 |     free(buffer); | 
| 771 | 0 |     return result; | 
| 772 | 0 |   } | 
| 773 | 0 |   { | 
| 774 | 0 |     nc_utf8proc_int32_t *newptr; | 
| 775 | 0 |     newptr = (nc_utf8proc_int32_t *) realloc(buffer, (size_t)result+1); | 
| 776 | 0 |     if (newptr) buffer = newptr; | 
| 777 | 0 |   } | 
| 778 | 0 |   *dstptr = (nc_utf8proc_uint8_t *)buffer; | 
| 779 | 0 |   return result; | 
| 780 | 0 | } | 
| 781 |  |  | 
| 782 | 0 | nc_utf8proc_uint8_t *nc_utf8proc_NFD(const nc_utf8proc_uint8_t *str) { | 
| 783 | 0 |   nc_utf8proc_uint8_t *retval; | 
| 784 | 0 |   nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | 
| 785 | 0 |     UTF8PROC_DECOMPOSE); | 
| 786 | 0 |   return retval; | 
| 787 | 0 | } | 
| 788 |  |  | 
| 789 | 0 | nc_utf8proc_uint8_t *nc_utf8proc_NFC(const nc_utf8proc_uint8_t *str) { | 
| 790 | 0 |   nc_utf8proc_uint8_t *retval; | 
| 791 | 0 |   nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | 
| 792 | 0 |     UTF8PROC_COMPOSE); | 
| 793 | 0 |   return retval; | 
| 794 | 0 | } | 
| 795 |  |  | 
| 796 | 0 | nc_utf8proc_uint8_t *nc_utf8proc_NFKD(const nc_utf8proc_uint8_t *str) { | 
| 797 | 0 |   nc_utf8proc_uint8_t *retval; | 
| 798 | 0 |   nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | 
| 799 | 0 |     UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); | 
| 800 | 0 |   return retval; | 
| 801 | 0 | } | 
| 802 |  |  | 
| 803 | 0 | nc_utf8proc_uint8_t *nc_utf8proc_NFKC(const nc_utf8proc_uint8_t *str) { | 
| 804 | 0 |   nc_utf8proc_uint8_t *retval; | 
| 805 | 0 |   nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | 
| 806 | 0 |     UTF8PROC_COMPOSE | UTF8PROC_COMPAT); | 
| 807 | 0 |   return retval; | 
| 808 | 0 | } | 
| 809 |  |  | 
| 810 | 0 | nc_utf8proc_uint8_t *nc_utf8proc_NFKC_Casefold(const nc_utf8proc_uint8_t *str) { | 
| 811 | 0 |   nc_utf8proc_uint8_t *retval; | 
| 812 | 0 |   nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | 
| 813 | 0 |     UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); | 
| 814 | 0 |   return retval; | 
| 815 | 0 | } |