/src/fluent-bit/lib/onigmo/enc/unicode.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | unicode.c - Oniguruma (regular expression library) |
3 | | **********************************************************************/ |
4 | | /*- |
5 | | * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> |
6 | | * All rights reserved. |
7 | | * |
8 | | * Redistribution and use in source and binary forms, with or without |
9 | | * modification, are permitted provided that the following conditions |
10 | | * are met: |
11 | | * 1. Redistributions of source code must retain the above copyright |
12 | | * notice, this list of conditions and the following disclaimer. |
13 | | * 2. Redistributions in binary form must reproduce the above copyright |
14 | | * notice, this list of conditions and the following disclaimer in the |
15 | | * documentation and/or other materials provided with the distribution. |
16 | | * |
17 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
18 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
19 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
20 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
21 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
22 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
23 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
24 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
25 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
26 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
27 | | * SUCH DAMAGE. |
28 | | */ |
29 | | |
30 | | #include "regint.h" |
31 | | |
/* Evaluates to 1 when the table entry for `code` has the bit
 * CTYPE_TO_BIT(ctype) set, 0 otherwise.  `code` is used directly as an index
 * into the 256-entry EncUNICODE_ISO_8859_1_CtypeTable, so callers must ensure
 * code < 256.  The *_BIT_CTYPE variant below (takes a ready-made bit mask) is
 * compiled out with #if 0. */
32 | | #define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \ |
33 | 2.19M | ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0) |
34 | | #if 0 |
35 | | #define ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(code,cbit) \ |
36 | | ((EncUNICODE_ISO_8859_1_CtypeTable[code] & (cbit)) != 0) |
37 | | #endif |
38 | | |
/* Character-type bit masks for code points 0x00..0xFF, one 16-bit entry per
 * code point, eight entries per row (row r starts at code point 8*r).
 * Entries are tested with CTYPE_TO_BIT(ctype) through
 * ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE.
 * NOTE(review): the meaning of the individual bits comes from CTYPE_TO_BIT /
 * the ONIGENC_CTYPE_* constants, which are defined outside this file. */
39 | | static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = { |
40 | | 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, |
41 | | 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008, |
42 | | 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, |
43 | | 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, |
44 | | 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, |
45 | | 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, |
46 | | 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, |
47 | | 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, |
48 | | 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2, |
49 | | 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, |
50 | | 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, |
51 | | 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0, |
52 | | 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, |
53 | | 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, |
54 | | 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, |
55 | | 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, |
56 | | 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008, |
57 | | 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, |
58 | | 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, |
59 | | 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, |
60 | | 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, |
61 | | 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0, |
62 | | 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0, |
63 | | 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, |
64 | | 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, |
65 | | 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, |
66 | | 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0, |
67 | | 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, |
68 | | 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, |
69 | | 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, |
70 | | 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, |
71 | | 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2 |
72 | | }; |
73 | | |
/* Record types for the case folding / unfolding tables pulled in from
 * "casefold.h".
 *
 * In both list types `n` is a packed field, not a plain count: code in this
 * file reads the number of valid entries in code[] with OnigCodePointCount(n),
 * case-mapping flags with OnigCaseFoldFlags(n), and an index into
 * CaseMappingSpecials with OnigSpecialIndexDecode(n). */

/* Up to three target code points. */
74 | | typedef struct { |
75 | | int n; |
76 | | OnigCodePoint code[3]; |
77 | | } CodePointList3; |
78 | | |
/* One code point -> its folded form (one to three code points). */
79 | | typedef struct { |
80 | | OnigCodePoint from; |
81 | | CodePointList3 to; |
82 | | } CaseFold_11_Type; |
83 | | |
/* One folded code point -> the code points that fold to it. */
84 | | typedef struct { |
85 | | OnigCodePoint from; |
86 | | CodePointList3 to; |
87 | | } CaseUnfold_11_Type; |
88 | | |
/* Up to two target code points. */
89 | | typedef struct { |
90 | | int n; |
91 | | OnigCodePoint code[2]; |
92 | | } CodePointList2; |
93 | | |
/* Sequence of two folded code points -> single code points folding to it. */
94 | | typedef struct { |
95 | | OnigCodePoint from[2]; |
96 | | CodePointList2 to; |
97 | | } CaseUnfold_12_Type; |
98 | | |
/* Sequence of three folded code points -> single code points folding to it. */
99 | | typedef struct { |
100 | | OnigCodePoint from[3]; |
101 | | CodePointList2 to; |
102 | | } CaseUnfold_13_Type; |
103 | | |
104 | | static inline int |
105 | | bits_of(const OnigCodePoint c, const int n) |
106 | 5.30M | { |
107 | 5.30M | return (c >> (2 - n) * 7) & 127; |
108 | 5.30M | } |
109 | | |
110 | | static inline int |
111 | | bits_at(const OnigCodePoint *c, const int n) |
112 | 2.12M | { |
113 | 2.12M | return bits_of(c[n / 3], n % 3); |
114 | 2.12M | } |
115 | | |
116 | | static int |
117 | | code1_equal(const OnigCodePoint x, const OnigCodePoint y) |
118 | 951k | { |
119 | 951k | if (x != y) return 0; |
120 | 707k | return 1; |
121 | 951k | } |
122 | | |
123 | | static int |
124 | | code2_equal(const OnigCodePoint *x, const OnigCodePoint *y) |
125 | 73.3k | { |
126 | 73.3k | if (x[0] != y[0]) return 0; |
127 | 3.72k | if (x[1] != y[1]) return 0; |
128 | 3.59k | return 1; |
129 | 3.72k | } |
130 | | |
131 | | static int |
132 | | code3_equal(const OnigCodePoint *x, const OnigCodePoint *y) |
133 | 1.04k | { |
134 | 1.04k | if (x[0] != y[0]) return 0; |
135 | 585 | if (x[1] != y[1]) return 0; |
136 | 445 | if (x[2] != y[2]) return 0; |
137 | 445 | return 1; |
138 | 445 | } |
139 | | |
140 | | /* macros related to ONIGENC_CASE flags */ |
141 | | /* defined here because not used in other files */ |
/* Union of the flag bits that select an entry in CaseMappingSpecials
 * (titlecase, is-titlecase, special upcase, special downcase). */
142 | 0 | #define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL) |
143 | | |
144 | | /* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */ |
/* A CaseMappingSpecials word packs a length in the bits at and above
 * SpecialsLengthOffset and a code point in the bits below it:
 *   SpecialsLengthExtract(n)    -> the length part
 *   SpecialsCodepointExtract(n) -> the code point part
 *   SpecialsLengthEncode(n)     -> a length shifted into position */
145 | 0 | #define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */ |
146 | 0 | #define SpecialsLengthExtract(n) ((n) >> SpecialsLengthOffset) |
147 | 0 | #define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1)) |
148 | | #define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset) |
149 | | |
/* Index into CaseMappingSpecials, stored in a bit field of the `n` member of
 * a code point list: OnigSpecialIndexWidth bits starting at bit
 * OnigSpecialIndexShift (both defined outside this file). */
150 | 0 | #define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift) |
151 | | #define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift) |
152 | 0 | #define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift) |
153 | | |
154 | | /* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */ |
155 | | #define U ONIGENC_CASE_UPCASE |
156 | | #define D ONIGENC_CASE_DOWNCASE |
157 | | #define F ONIGENC_CASE_FOLD |
158 | | #define ST ONIGENC_CASE_TITLECASE |
159 | | #define SU ONIGENC_CASE_UP_SPECIAL |
160 | | #define SL ONIGENC_CASE_DOWN_SPECIAL |
161 | | #define IT ONIGENC_CASE_IS_TITLECASE |
162 | | #define I(n) OnigSpecialIndexEncode(n) |
163 | | #define L(n) SpecialsLengthEncode(n) |
164 | | |
165 | | #include "casefold.h" |
166 | | |
167 | | #undef U |
168 | | #undef D |
169 | | #undef F |
170 | | #undef ST |
171 | | #undef SU |
172 | | #undef SL |
173 | | #undef IT |
174 | | #undef I |
175 | | #undef L |
176 | | |
177 | | #include "name2ctype.h" |
178 | | |
179 | 2.97M | #define CODE_RANGES_NUM numberof(CodeRanges) |
180 | | |
181 | | extern int |
182 | | onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED) |
183 | 2.20M | { |
184 | 2.20M | if ( |
185 | 2.20M | #ifdef USE_UNICODE_PROPERTIES |
186 | 2.20M | ctype <= ONIGENC_MAX_STD_CTYPE && |
187 | 2.20M | #endif |
188 | 2.20M | code < 256) { |
189 | 2.19M | return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype); |
190 | 2.19M | } |
191 | | |
192 | 4.64k | if (ctype >= CODE_RANGES_NUM) { |
193 | 0 | return ONIGERR_TYPE_BUG; |
194 | 0 | } |
195 | | |
196 | 4.64k | return onig_is_in_code_range((UChar* )CodeRanges[ctype], code); |
197 | 4.64k | } |
198 | | |
199 | | |
200 | | extern int |
201 | | onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint* ranges[]) |
202 | 2.96M | { |
203 | 2.96M | if (ctype >= CODE_RANGES_NUM) { |
204 | 0 | return ONIGERR_TYPE_BUG; |
205 | 0 | } |
206 | | |
207 | 2.96M | *ranges = CodeRanges[ctype]; |
208 | | |
209 | 2.96M | return 0; |
210 | 2.96M | } |
211 | | |
212 | | extern int |
213 | | onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, |
214 | | const OnigCodePoint* ranges[], |
215 | | OnigEncoding enc ARG_UNUSED) |
216 | 0 | { |
217 | 0 | *sb_out = 0x00; |
218 | 0 | return onigenc_unicode_ctype_code_range(ctype, ranges); |
219 | 0 | } |
220 | | |
221 | 63.7M | #define PROPERTY_NAME_MAX_SIZE (MAX_WORD_LENGTH + 1) |
222 | | |
223 | | extern int |
224 | | onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* name, const UChar* end) |
225 | 2.64M | { |
226 | 2.64M | int len; |
227 | 2.64M | int ctype; |
228 | 2.64M | UChar buf[PROPERTY_NAME_MAX_SIZE]; |
229 | 2.64M | const UChar *p; |
230 | 2.64M | OnigCodePoint code; |
231 | | |
232 | 2.64M | len = 0; |
233 | 71.2M | for (p = name; p < end; p += enclen(enc, p, end)) { |
234 | 68.5M | code = ONIGENC_MBC_TO_CODE(enc, p, end); |
235 | 68.5M | if (code == ' ' || code == '-' || code == '_') |
236 | 4.85M | continue; |
237 | 63.7M | if (code >= 0x80) |
238 | 0 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME; |
239 | | |
240 | 63.7M | buf[len++] = ONIGENC_ASCII_CODE_TO_LOWER_CASE(code); |
241 | 63.7M | if (len >= PROPERTY_NAME_MAX_SIZE) |
242 | 0 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME; |
243 | 63.7M | } |
244 | | |
245 | 2.64M | buf[len] = 0; |
246 | | |
247 | 2.64M | if ((ctype = uniname2ctype(buf, len)) < 0) { |
248 | 0 | return ONIGERR_INVALID_CHAR_PROPERTY_NAME; |
249 | 0 | } |
250 | | |
251 | 2.64M | return ctype; |
252 | 2.64M | } |
253 | | |
/* Short aliases for the table lookup functions used below.  The lookup
 * functions themselves are not defined in the visible part of this file
 * (presumably they come from the included "casefold.h" -- TODO confirm).
 *   fold    : one code point           -> CodePointList3 (its folded form)
 *   unfold1 : one folded code point    -> CodePointList3
 *   unfold2 : two folded code points   -> CodePointList2
 *   unfold3 : three folded code points -> CodePointList2
 * Each returns 0 when there is no table entry. */
254 | 1.04M | #define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup |
255 | 374k | #define onigenc_unicode_unfold1_lookup onigenc_unicode_CaseUnfold_11_lookup |
256 | 255k | #define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup |
257 | 233k | #define onigenc_unicode_unfold3_lookup onigenc_unicode_CaseUnfold_13_lookup |
258 | | |
/* Code points that get dedicated handling in the Turkish/Azeri code paths
 * and in onigenc_unicode_case_map(). */
259 | | enum { |
260 | | I_WITH_DOT_ABOVE = 0x0130, |
261 | | DOTLESS_i = 0x0131, |
262 | | DOT_ABOVE = 0x0307 |
263 | | }; |
264 | | |
265 | | extern int |
266 | | onigenc_unicode_mbc_case_fold(OnigEncoding enc, |
267 | | OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, |
268 | | UChar* fold) |
269 | 190k | { |
270 | 190k | const CodePointList3 *to; |
271 | 190k | OnigCodePoint code; |
272 | 190k | int i, len, rlen; |
273 | 190k | const UChar *p = *pp; |
274 | | |
275 | 190k | code = ONIGENC_MBC_TO_CODE(enc, p, end); |
276 | 190k | len = enclen(enc, p, end); |
277 | 190k | *pp += len; |
278 | | |
279 | | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI |
280 | | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { |
281 | | if (code == 'I') { |
282 | | return ONIGENC_CODE_TO_MBC(enc, DOTLESS_i, fold); |
283 | | } |
284 | | else if (code == I_WITH_DOT_ABOVE) { |
285 | | return ONIGENC_CODE_TO_MBC(enc, 'i', fold); |
286 | | } |
287 | | } |
288 | | #endif |
289 | | |
290 | 190k | if ((to = onigenc_unicode_fold_lookup(code)) != 0) { |
291 | 110k | if (OnigCodePointCount(to->n) == 1) { |
292 | 110k | return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold); |
293 | 110k | } |
294 | | #if 0 |
295 | | /* NO NEEDS TO CHECK */ |
296 | | else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) |
297 | | #else |
298 | 255 | else |
299 | 255 | #endif |
300 | 255 | { |
301 | 255 | rlen = 0; |
302 | 882 | for (i = 0; i < OnigCodePointCount(to->n); i++) { |
303 | 627 | len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold); |
304 | 627 | fold += len; |
305 | 627 | rlen += len; |
306 | 627 | } |
307 | 255 | return rlen; |
308 | 255 | } |
309 | 110k | } |
310 | | |
311 | 166k | for (i = 0; i < len; i++) { |
312 | 86.1k | *fold++ = *p++; |
313 | 86.1k | } |
314 | 79.9k | return len; |
315 | 190k | } |
316 | | |
317 | | extern int |
318 | | onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, |
319 | | OnigApplyAllCaseFoldFunc f, void* arg, |
320 | | OnigEncoding enc ARG_UNUSED) |
321 | 3.48k | { |
322 | 3.48k | const CaseUnfold_11_Type* p11; |
323 | 3.48k | OnigCodePoint code; |
324 | 3.48k | int i, j, k, r; |
325 | | |
326 | 4.87M | for (i = 0; i < numberof(CaseUnfold_11); i++) { |
327 | 4.86M | p11 = &CaseUnfold_11[i]; |
328 | 9.83M | for (j = 0; j < OnigCodePointCount(p11->to.n); j++) { |
329 | 4.97M | code = p11->from; |
330 | 4.97M | r = (*f)(p11->to.code[j], &code, 1, arg); |
331 | 4.97M | if (r != 0) return r; |
332 | | |
333 | 4.97M | code = p11->to.code[j]; |
334 | 4.97M | r = (*f)(p11->from, &code, 1, arg); |
335 | 4.97M | if (r != 0) return r; |
336 | | |
337 | 5.08M | for (k = 0; k < j; k++) { |
338 | 115k | r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), 1, arg); |
339 | 115k | if (r != 0) return r; |
340 | | |
341 | 115k | r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), 1, arg); |
342 | 115k | if (r != 0) return r; |
343 | 115k | } |
344 | 4.97M | } |
345 | 4.86M | } |
346 | | |
347 | | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI |
348 | | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { |
349 | | code = DOTLESS_i; |
350 | | r = (*f)('I', &code, 1, arg); |
351 | | if (r != 0) return r; |
352 | | code = 'I'; |
353 | | r = (*f)(DOTLESS_i, &code, 1, arg); |
354 | | if (r != 0) return r; |
355 | | |
356 | | code = I_WITH_DOT_ABOVE; |
357 | | r = (*f)('i', &code, 1, arg); |
358 | | if (r != 0) return r; |
359 | | code = 'i'; |
360 | | r = (*f)(I_WITH_DOT_ABOVE, &code, 1, arg); |
361 | | if (r != 0) return r; |
362 | | } |
363 | | else { |
364 | | #endif |
365 | 6.97k | for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) { |
366 | 3.48k | p11 = &CaseUnfold_11_Locale[i]; |
367 | 6.97k | for (j = 0; j < OnigCodePointCount(p11->to.n); j++) { |
368 | 3.48k | code = p11->from; |
369 | 3.48k | r = (*f)(p11->to.code[j], &code, 1, arg); |
370 | 3.48k | if (r != 0) return r; |
371 | | |
372 | 3.48k | code = p11->to.code[j]; |
373 | 3.48k | r = (*f)(p11->from, &code, 1, arg); |
374 | 3.48k | if (r != 0) return r; |
375 | | |
376 | 3.48k | for (k = 0; k < j; k++) { |
377 | 0 | r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), |
378 | 0 | 1, arg); |
379 | 0 | if (r != 0) return r; |
380 | | |
381 | 0 | r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), |
382 | 0 | 1, arg); |
383 | 0 | if (r != 0) return r; |
384 | 0 | } |
385 | 3.48k | } |
386 | 3.48k | } |
387 | | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI |
388 | | } |
389 | | #endif |
390 | | |
391 | 3.48k | if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { |
392 | 205k | for (i = 0; i < numberof(CaseUnfold_12); i++) { |
393 | 505k | for (j = 0; j < OnigCodePointCount(CaseUnfold_12[i].to.n); j++) { |
394 | 303k | r = (*f)(CaseUnfold_12[i].to.code[j], |
395 | 303k | (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg); |
396 | 303k | if (r != 0) return r; |
397 | | |
398 | 809k | for (k = 0; k < OnigCodePointCount(CaseUnfold_12[i].to.n); k++) { |
399 | 505k | if (k == j) continue; |
400 | | |
401 | 202k | r = (*f)(CaseUnfold_12[i].to.code[j], |
402 | 202k | (OnigCodePoint* )(&CaseUnfold_12[i].to.code[k]), 1, arg); |
403 | 202k | if (r != 0) return r; |
404 | 202k | } |
405 | 303k | } |
406 | 202k | } |
407 | | |
408 | | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI |
409 | | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) { |
410 | | #endif |
411 | 6.97k | for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) { |
412 | 6.97k | for (j = 0; j < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); j++) { |
413 | 3.48k | r = (*f)(CaseUnfold_12_Locale[i].to.code[j], |
414 | 3.48k | (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg); |
415 | 3.48k | if (r != 0) return r; |
416 | | |
417 | 6.97k | for (k = 0; k < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); k++) { |
418 | 3.48k | if (k == j) continue; |
419 | | |
420 | 0 | r = (*f)(CaseUnfold_12_Locale[i].to.code[j], |
421 | 0 | (OnigCodePoint* )(&CaseUnfold_12_Locale[i].to.code[k]), |
422 | 0 | 1, arg); |
423 | 0 | if (r != 0) return r; |
424 | 0 | } |
425 | 3.48k | } |
426 | 3.48k | } |
427 | | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI |
428 | | } |
429 | | #endif |
430 | | |
431 | 52.3k | for (i = 0; i < numberof(CaseUnfold_13); i++) { |
432 | 104k | for (j = 0; j < OnigCodePointCount(CaseUnfold_13[i].to.n); j++) { |
433 | 55.8k | r = (*f)(CaseUnfold_13[i].to.code[j], |
434 | 55.8k | (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg); |
435 | 55.8k | if (r != 0) return r; |
436 | | |
437 | 125k | for (k = 0; k < OnigCodePointCount(CaseUnfold_13[i].to.n); k++) { |
438 | 69.7k | if (k == j) continue; |
439 | | |
440 | 13.9k | r = (*f)(CaseUnfold_13[i].to.code[j], |
441 | 13.9k | (OnigCodePoint* )(&CaseUnfold_13[i].to.code[k]), 1, arg); |
442 | 13.9k | if (r != 0) return r; |
443 | 13.9k | } |
444 | 55.8k | } |
445 | 48.8k | } |
446 | 3.48k | } |
447 | | |
448 | 3.48k | return 0; |
449 | 3.48k | } |
450 | | |
451 | 233k | #define CodePointListValidP(x) (OnigCodePointCount((x)->n) <= numberof((x)->code)) |
452 | | |
453 | | extern int |
454 | | onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, |
455 | | OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, |
456 | | OnigCaseFoldCodeItem items[]) |
457 | 372k | { |
458 | 372k | int n, i, j, k, len; |
459 | 372k | OnigCodePoint code, codes[3]; |
460 | 372k | const CodePointList3 *to, *z3; |
461 | 372k | const CodePointList2 *z2; |
462 | | |
463 | 372k | n = 0; |
464 | | |
465 | 372k | code = ONIGENC_MBC_TO_CODE(enc, p, end); |
466 | 372k | len = enclen(enc, p, end); |
467 | | |
468 | | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI |
469 | | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { |
470 | | switch (code) { |
471 | | case 'I': |
472 | | items[0].byte_len = len; |
473 | | items[0].code_len = 1; |
474 | | items[0].code[0] = DOTLESS_i; |
475 | | return 1; |
476 | | case I_WITH_DOT_ABOVE: |
477 | | items[0].byte_len = len; |
478 | | items[0].code_len = 1; |
479 | | items[0].code[0] = 'i'; |
480 | | return 1; |
481 | | case DOTLESS_i: |
482 | | items[0].byte_len = len; |
483 | | items[0].code_len = 1; |
484 | | items[0].code[0] = 'I'; |
485 | | return 1; |
486 | | case 'i': |
487 | | items[0].byte_len = len; |
488 | | items[0].code_len = 1; |
489 | | items[0].code[0] = I_WITH_DOT_ABOVE; |
490 | | return 1; |
491 | | } |
492 | | } |
493 | | #endif |
494 | | |
495 | 372k | if ((to = onigenc_unicode_fold_lookup(code)) != 0) { |
496 | 127k | if (OnigCodePointCount(to->n) == 1) { |
497 | 125k | OnigCodePoint orig_code = code; |
498 | | |
499 | 125k | items[0].byte_len = len; |
500 | 125k | items[0].code_len = 1; |
501 | 125k | items[0].code[0] = to->code[0]; |
502 | 125k | n++; |
503 | | |
504 | 125k | code = to->code[0]; |
505 | 125k | if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 && |
506 | 125k | CodePointListValidP(to)) { |
507 | 311k | for (i = 0; i < OnigCodePointCount(to->n); i++) { |
508 | 185k | if (to->code[i] != orig_code) { |
509 | 59.9k | items[n].byte_len = len; |
510 | 59.9k | items[n].code_len = 1; |
511 | 59.9k | items[n].code[0] = to->code[i]; |
512 | 59.9k | n++; |
513 | 59.9k | } |
514 | 185k | } |
515 | 125k | } |
516 | 125k | } |
517 | 1.47k | else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { |
518 | 1.47k | OnigCodePoint cs[3][4]; |
519 | 1.47k | int fn, ncs[3]; |
520 | | |
521 | 4.87k | for (fn = 0; fn < OnigCodePointCount(to->n); fn++) { |
522 | 3.39k | cs[fn][0] = to->code[fn]; |
523 | 3.39k | if ((z3 = onigenc_unicode_unfold1_lookup(cs[fn][0])) != 0) { |
524 | 7.19k | for (i = 0; i < OnigCodePointCount(z3->n); i++) { |
525 | 4.62k | cs[fn][i+1] = z3->code[i]; |
526 | 4.62k | } |
527 | 2.56k | ncs[fn] = OnigCodePointCount(z3->n) + 1; |
528 | 2.56k | } |
529 | 830 | else |
530 | 830 | ncs[fn] = 1; |
531 | 3.39k | } |
532 | | |
533 | 1.47k | if (fn == 2) { |
534 | 4.09k | for (i = 0; i < ncs[0]; i++) { |
535 | 9.47k | for (j = 0; j < ncs[1]; j++) { |
536 | 6.41k | items[n].byte_len = len; |
537 | 6.41k | items[n].code_len = 2; |
538 | 6.41k | items[n].code[0] = cs[0][i]; |
539 | 6.41k | items[n].code[1] = cs[1][j]; |
540 | 6.41k | n++; |
541 | 6.41k | } |
542 | 3.05k | } |
543 | | |
544 | 1.03k | if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0 && |
545 | 1.03k | CodePointListValidP(z2)) { |
546 | 3.05k | for (i = 0; i < OnigCodePointCount(z2->n); i++) { |
547 | 2.02k | if (z2->code[i] == code) continue; |
548 | | |
549 | 994 | items[n].byte_len = len; |
550 | 994 | items[n].code_len = 1; |
551 | 994 | items[n].code[0] = z2->code[i]; |
552 | 994 | n++; |
553 | 994 | } |
554 | 1.03k | } |
555 | 1.03k | } |
556 | 445 | else { |
557 | 2.07k | for (i = 0; i < ncs[0]; i++) { |
558 | 3.26k | for (j = 0; j < ncs[1]; j++) { |
559 | 4.03k | for (k = 0; k < ncs[2]; k++) { |
560 | 2.40k | items[n].byte_len = len; |
561 | 2.40k | items[n].code_len = 3; |
562 | 2.40k | items[n].code[0] = cs[0][i]; |
563 | 2.40k | items[n].code[1] = cs[1][j]; |
564 | 2.40k | items[n].code[2] = cs[2][k]; |
565 | 2.40k | n++; |
566 | 2.40k | } |
567 | 1.63k | } |
568 | 1.63k | } |
569 | | |
570 | 445 | if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0 && |
571 | 445 | CodePointListValidP(z2)) { |
572 | 1.23k | for (i = 0; i < OnigCodePointCount(z2->n); i++) { |
573 | 788 | if (z2->code[i] == code) continue; |
574 | | |
575 | 343 | items[n].byte_len = len; |
576 | 343 | items[n].code_len = 1; |
577 | 343 | items[n].code[0] = z2->code[i]; |
578 | 343 | n++; |
579 | 343 | } |
580 | 445 | } |
581 | 445 | } |
582 | | |
583 | | /* multi char folded code is not head of another folded multi char */ |
584 | 1.47k | flag = 0; /* DISABLE_CASE_FOLD_MULTI_CHAR(flag); */ |
585 | 1.47k | } |
586 | 127k | } |
587 | 244k | else { |
588 | 244k | if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 && |
589 | 244k | CodePointListValidP(to)) { |
590 | 220k | for (i = 0; i < OnigCodePointCount(to->n); i++) { |
591 | 116k | items[n].byte_len = len; |
592 | 116k | items[n].code_len = 1; |
593 | 116k | items[n].code[0] = to->code[i]; |
594 | 116k | n++; |
595 | 116k | } |
596 | 103k | } |
597 | 244k | } |
598 | | |
599 | | |
600 | 372k | if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { |
601 | 287k | p += len; |
602 | 287k | if (p < end) { |
603 | 253k | int clen; |
604 | | |
605 | 253k | codes[0] = code; |
606 | 253k | code = ONIGENC_MBC_TO_CODE(enc, p, end); |
607 | 253k | if ((to = onigenc_unicode_fold_lookup(code)) != 0 |
608 | 253k | && OnigCodePointCount(to->n) == 1) { |
609 | 121k | codes[1] = to->code[0]; |
610 | 121k | } |
611 | 132k | else |
612 | 132k | codes[1] = code; |
613 | | |
614 | 253k | clen = enclen(enc, p, end); |
615 | 253k | len += clen; |
616 | 253k | if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0 && |
617 | 253k | CodePointListValidP(z2)) { |
618 | 6.84k | for (i = 0; i < OnigCodePointCount(z2->n); i++) { |
619 | 4.28k | items[n].byte_len = len; |
620 | 4.28k | items[n].code_len = 1; |
621 | 4.28k | items[n].code[0] = z2->code[i]; |
622 | 4.28k | n++; |
623 | 4.28k | } |
624 | 2.56k | } |
625 | | |
626 | 253k | p += clen; |
627 | 253k | if (p < end) { |
628 | 233k | code = ONIGENC_MBC_TO_CODE(enc, p, end); |
629 | 233k | if ((to = onigenc_unicode_fold_lookup(code)) != 0 |
630 | 233k | && OnigCodePointCount(to->n) == 1) { |
631 | 114k | codes[2] = to->code[0]; |
632 | 114k | } |
633 | 118k | else |
634 | 118k | codes[2] = code; |
635 | | |
636 | 233k | clen = enclen(enc, p, end); |
637 | 233k | len += clen; |
638 | 233k | if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0 && |
639 | 233k | CodePointListValidP(z2)) { |
640 | 0 | for (i = 0; i < OnigCodePointCount(z2->n); i++) { |
641 | 0 | items[n].byte_len = len; |
642 | 0 | items[n].code_len = 1; |
643 | 0 | items[n].code[0] = z2->code[i]; |
644 | 0 | n++; |
645 | 0 | } |
646 | 0 | } |
647 | 233k | } |
648 | 253k | } |
649 | 287k | } |
650 | | |
651 | 372k | return n; |
652 | 372k | } |
653 | | |
654 | | #ifdef USE_CASE_MAP_API |
655 | | /* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */ |
656 | 0 | #define CASE_MAPPING_SLACK 12 |
/* Records in the local `flags` that at least one character was changed. */
657 | 0 | #define MODIFIED (flags |= ONIGENC_CASE_MODIFIED) |
/*
 * Case-maps the characters in [*pp, end) according to the flag bits in
 * *flagP and writes the encoded result starting at `to`.
 *
 * The loop consumes one code point per iteration and stops when *pp reaches
 * end or when `to` has passed to_end - CASE_MAPPING_SLACK; *pp is left
 * pointing at the first unconsumed character.
 *
 * Per character:
 *  - code <= 'z': only 'a'..'z' (when ONIGENC_CASE_UPCASE is set) and
 *    'A'..'Z' (when ONIGENC_CASE_DOWNCASE or ONIGENC_CASE_FOLD is set) are
 *    changed, with the Turkish/Azeri i/I exceptions;
 *  - code >= 0x00B5 and ONIGENC_CASE_ASCII_ONLY not set: I_WITH_DOT_ABOVE
 *    and DOTLESS_i are handled by hand, everything else through the fold
 *    table and then the unfold table; a result may expand to up to three
 *    code points;
 *  - anything else is copied through unchanged.
 *
 * Returns the number of bytes written (to - to_start), in which case *flagP
 * receives the working flags (including ONIGENC_CASE_MODIFIED if anything
 * changed).  If ONIGENC_PRECISE_MBC_ENC_LEN reports a negative length, that
 * value is returned immediately and *flagP is not updated.
 */
658 | | extern int |
659 | | onigenc_unicode_case_map(OnigCaseFoldType* flagP, |
660 | | const OnigUChar** pp, const OnigUChar* end, |
661 | | OnigUChar* to, OnigUChar* to_end, |
662 | | const struct OnigEncodingTypeST* enc) |
663 | 0 | { |
664 | 0 | OnigCodePoint code; |
665 | 0 | OnigUChar *to_start = to; |
666 | 0 | OnigCaseFoldType flags = *flagP; |
667 | 0 | int codepoint_length; |
668 |

/* reserve room so that one iteration (up to three code points) always fits */
669 | 0 | to_end -= CASE_MAPPING_SLACK; |
670 | | /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to |
671 | | * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ |
672 | 0 | flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET; |
673 |

674 | 0 | while (*pp < end && to <= to_end) { |
675 | 0 | codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end); |
676 | 0 | if (codepoint_length < 0) |
677 | 0 | return codepoint_length; /* encoding invalid */ |
678 | 0 | code = ONIGENC_MBC_TO_CODE(enc, *pp, end); |
679 | 0 | *pp += codepoint_length; |
680 |

681 | 0 | if (code <= 'z') { /* ASCII comes first */ |
682 | 0 | if (code >= 'a' /*&& code <= 'z'*/) { |
683 | 0 | if (flags & ONIGENC_CASE_UPCASE) { |
684 | 0 | MODIFIED; |
685 | 0 | if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i') |
686 | 0 | code = I_WITH_DOT_ABOVE; |
687 | 0 | else |
688 | 0 | code -= 'a' - 'A'; |
689 | 0 | } |
690 | 0 | } |
691 | 0 | else if (code >= 'A' && code <= 'Z') { |
692 | 0 | if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { |
693 | 0 | MODIFIED; |
694 | 0 | if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I') |
695 | 0 | code = DOTLESS_i; |
696 | 0 | else |
697 | 0 | code += 'a' - 'A'; |
698 | 0 | } |
699 | 0 | } |
700 | 0 | } |
701 | 0 | else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ |
702 | 0 | const CodePointList3 *folded; |
703 |

704 | 0 | if (code == I_WITH_DOT_ABOVE) { |
705 | 0 | if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { |
706 | 0 | MODIFIED; |
707 | 0 | code = 'i'; |
708 | 0 | if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ |
709 | 0 | to += ONIGENC_CODE_TO_MBC(enc, code, to); |
710 | 0 | code = DOT_ABOVE; |
711 | 0 | } |
712 | 0 | } |
713 | 0 | } |
714 | 0 | else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ |
715 | 0 | if (flags & ONIGENC_CASE_UPCASE) { |
716 | 0 | MODIFIED; |
717 | 0 | code = 'I'; |
718 | 0 | } |
719 | 0 | } |
720 | 0 | else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */ |
721 | 0 | if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */ |
722 | 0 | MODIFIED; |
723 | 0 | code += 0x10D0 - 0x1C90; |
724 | 0 | } |
725 | 0 | else if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ |
726 | 0 | && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ |
727 | | /* already Titlecase, no changes needed */ |
728 | 0 | } |
729 | 0 | else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ |
730 | 0 | const OnigCodePoint *next; |
731 | 0 | int count; |
732 |

733 | 0 | MODIFIED; |
734 | 0 | if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */ |
/* The specials for one character are stored back to back; each step below
 * either selects the current sequence (goto SpecialsCopy) or skips over it
 * by the length stored in its first word. */
735 | 0 | const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); |
736 |

737 | 0 | if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ |
738 | 0 | if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) |
739 | 0 | == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ |
740 | 0 | goto SpecialsCopy; |
741 | 0 | else /* swapCASE not needed */ |
742 | 0 | SpecialsStart += SpecialsLengthExtract(*SpecialsStart); |
743 | 0 | } |
744 | 0 | if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */ |
745 | 0 | if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */ |
746 | 0 | goto SpecialsCopy; |
747 | 0 | else /* Titlecase not needed */ |
748 | 0 | SpecialsStart += SpecialsLengthExtract(*SpecialsStart); |
749 | 0 | } |
750 | 0 | if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) { |
751 | 0 | if (!(flags & ONIGENC_CASE_DOWN_SPECIAL)) |
752 | 0 | SpecialsStart += SpecialsLengthExtract(*SpecialsStart); |
753 | 0 | } |
754 | | /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ |
755 | 0 | SpecialsCopy: |
756 | 0 | count = SpecialsLengthExtract(*SpecialsStart); |
757 | 0 | next = SpecialsStart; |
758 | 0 | code = SpecialsCodepointExtract(*next++); |
759 | 0 | } |
760 | 0 | else { /* no specials */ |
761 | 0 | count = OnigCodePointCount(folded->n); |
762 | 0 | next = folded->code; |
763 | 0 | code = *next++; |
764 | 0 | } |
/* emit all but the last code point here; the last one is left in `code`
 * and written by the common ONIGENC_CODE_TO_MBC call at the loop bottom */
765 | 0 | if (count == 1) |
766 | 0 | ; |
767 | 0 | else if (count == 2) { |
768 | 0 | to += ONIGENC_CODE_TO_MBC(enc, code, to); |
769 | 0 | code = *next; |
770 | 0 | } |
771 | 0 | else { /* count == 3 */ |
772 | 0 | to += ONIGENC_CODE_TO_MBC(enc, code, to); |
773 | 0 | to += ONIGENC_CODE_TO_MBC(enc, *next++, to); |
774 | 0 | code = *next; |
775 | 0 | } |
776 | 0 | } |
777 | 0 | } |
778 | 0 | else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */ |
779 | 0 | if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ |
780 | 0 | && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ |
781 | | /* already Titlecase, no changes needed */ |
782 | 0 | } |
783 | 0 | else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ |
784 | 0 | MODIFIED; |
785 | 0 | code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0]; |
786 | 0 | } |
787 | 0 | } |
788 | 0 | } |
789 | 0 | to += ONIGENC_CODE_TO_MBC(enc, code, to); |
790 | | /* switch from titlecase to lowercase for capitalize */ |
791 | 0 | if (flags & ONIGENC_CASE_TITLECASE) |
792 | 0 | flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE | |
793 | 0 | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL); |
794 | 0 | } |
795 | 0 | *flagP = flags; |
796 | 0 | return (int )(to - to_start); |
797 | 0 | } |
798 | | #endif |
799 | | |
800 | | #if 0 |
801 | | const char onigenc_unicode_version_string[] = |
802 | | #ifdef ONIG_UNICODE_VERSION_STRING |
803 | | ONIG_UNICODE_VERSION_STRING |
804 | | #endif |
805 | | ""; |
806 | | |
807 | | const int onigenc_unicode_version_number[3] = { |
808 | | #ifdef ONIG_UNICODE_VERSION_MAJOR |
809 | | ONIG_UNICODE_VERSION_MAJOR, |
810 | | ONIG_UNICODE_VERSION_MINOR, |
811 | | ONIG_UNICODE_VERSION_TEENY, |
812 | | #else |
813 | | 0 |
814 | | #endif |
815 | | }; |
816 | | #endif |