Coverage Report

Created: 2025-11-07 08:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/fluent-bit/lib/onigmo/enc/unicode.c
Line
Count
Source
1
/**********************************************************************
2
  unicode.c -  Oniguruma (regular expression library)
3
**********************************************************************/
4
/*-
5
 * Copyright (c) 2002-2013  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6
 * All rights reserved.
7
 *
8
 * Redistribution and use in source and binary forms, with or without
9
 * modification, are permitted provided that the following conditions
10
 * are met:
11
 * 1. Redistributions of source code must retain the above copyright
12
 *    notice, this list of conditions and the following disclaimer.
13
 * 2. Redistributions in binary form must reproduce the above copyright
14
 *    notice, this list of conditions and the following disclaimer in the
15
 *    documentation and/or other materials provided with the distribution.
16
 *
17
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27
 * SUCH DAMAGE.
28
 */
29
30
#include "regint.h"
31
32
/* Nonzero when the table entry for `code` has the bit CTYPE_TO_BIT(ctype) set.
 * `code` indexes EncUNICODE_ISO_8859_1_CtypeTable (256 entries) directly, so
 * the caller must ensure code < 256 before using this macro. */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
33
5.05M
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
34
#if 0
35
#define ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(code,cbit) \
36
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & (cbit)) != 0)
37
#endif
38
39
/* Ctype bit masks for code points 0x00-0xFF, one 16-bit mask per code point
 * (8 entries per row). An entry is tested against CTYPE_TO_BIT(ctype) by
 * ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE. */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
40
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
41
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
42
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
43
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
44
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
45
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
46
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
47
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
48
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
49
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
50
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
51
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
52
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
53
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
54
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
55
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
56
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
57
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
58
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
59
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
60
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
61
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
62
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
63
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
64
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
65
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
66
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
67
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
68
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
69
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
70
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
71
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
72
};
73
74
/* List of up to three code points.
 * `n` is a packed field: this file reads the number of valid entries in
 * `code` with OnigCodePointCount(n) and case-mapping flags with
 * OnigCaseFoldFlags(n). */
typedef struct {
75
  int n;
76
  OnigCodePoint code[3];
77
} CodePointList3;
78
79
/* Record pairing one source code point with a CodePointList3.
 * NOTE(review): presumably the element type of the fold table in
 * casefold.h -- confirm there. */
typedef struct {
80
  OnigCodePoint  from;
81
  CodePointList3 to;
82
} CaseFold_11_Type;
83
84
/* Element type of the one-code-point unfold tables walked in this file
 * (CaseUnfold_11, CaseUnfold_11_Locale): `from` paired with the list `to`. */
typedef struct {
85
  OnigCodePoint  from;
86
  CodePointList3 to;
87
} CaseUnfold_11_Type;
88
89
/* List of up to two code points; `n` is read through OnigCodePointCount(n)
 * in this file. */
typedef struct {
90
  int n;
91
  OnigCodePoint code[2];
92
} CodePointList2;
93
94
/* Record pairing a two-code-point sequence `from` with the list `to`.
 * NOTE(review): presumably the element type of CaseUnfold_12 -- confirm in
 * casefold.h. */
typedef struct {
95
  OnigCodePoint  from[2];
96
  CodePointList2 to;
97
} CaseUnfold_12_Type;
98
99
/* Record pairing a three-code-point sequence `from` with the list `to`.
 * NOTE(review): presumably the element type of CaseUnfold_13 -- confirm in
 * casefold.h. */
typedef struct {
100
  OnigCodePoint  from[3];
101
  CodePointList2 to;
102
} CaseUnfold_13_Type;
103
104
static inline int
105
bits_of(const OnigCodePoint c, const int n)
106
3.30M
{
107
3.30M
  return (c >> (2 - n) * 7) & 127;
108
3.30M
}
109
110
static inline int
111
bits_at(const OnigCodePoint *c, const int n)
112
1.23M
{
113
1.23M
  return bits_of(c[n / 3], n % 3);
114
1.23M
}
115
116
static int
117
code1_equal(const OnigCodePoint x, const OnigCodePoint y)
118
616k
{
119
616k
  if (x != y) return 0;
120
400k
  return 1;
121
616k
}
122
123
static int
124
code2_equal(const OnigCodePoint *x, const OnigCodePoint *y)
125
43.6k
{
126
43.6k
  if (x[0] != y[0]) return 0;
127
3.05k
  if (x[1] != y[1]) return 0;
128
2.92k
  return 1;
129
3.05k
}
130
131
static int
132
code3_equal(const OnigCodePoint *x, const OnigCodePoint *y)
133
4.24k
{
134
4.24k
  if (x[0] != y[0]) return 0;
135
2.97k
  if (x[1] != y[1]) return 0;
136
2.84k
  if (x[2] != y[2]) return 0;
137
2.84k
  return 1;
138
2.84k
}
139
140
/* Macros related to the ONIGENC_CASE flags. */
141
/* Defined here because they are not used in any other file. */
142
0
/* Union of the four flags that this file treats as "special" case mappings. */
#define ONIGENC_CASE_SPECIALS       (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL)
143
144
/* Macros for the length stored in entries of the CaseMappingSpecials array in
 * enc/unicode/casefold.h: the length occupies the bits at and above
 * SpecialsLengthOffset, the code point the bits below it. */
145
0
#define SpecialsLengthOffset 25  /* needs to be higher than the 22 bits used for Unicode codepoints */
146
0
/* Length part of a CaseMappingSpecials entry. */
#define SpecialsLengthExtract(n)    ((n) >> SpecialsLengthOffset)
147
0
/* Code point part of a CaseMappingSpecials entry. */
#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1))
148
/* Move a length into the position read back by SpecialsLengthExtract. */
#define SpecialsLengthEncode(n)     ((n) << SpecialsLengthOffset)
149
150
0
/* Mask selecting the OnigSpecialIndexWidth-bit index field that starts at
 * bit OnigSpecialIndexShift. */
#define OnigSpecialIndexMask        (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift)
151
/* Move an index into the index field. */
#define OnigSpecialIndexEncode(n)   ((n) << OnigSpecialIndexShift)
152
0
/* Read the index back out of the index field. */
#define OnigSpecialIndexDecode(n)   (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift)
153
154
/* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */
155
#define U ONIGENC_CASE_UPCASE
156
#define D ONIGENC_CASE_DOWNCASE
157
#define F ONIGENC_CASE_FOLD
158
#define ST ONIGENC_CASE_TITLECASE
159
#define SU ONIGENC_CASE_UP_SPECIAL
160
#define SL ONIGENC_CASE_DOWN_SPECIAL
161
#define IT ONIGENC_CASE_IS_TITLECASE
162
#define I(n) OnigSpecialIndexEncode(n)
163
#define L(n) SpecialsLengthEncode(n)
164
165
#include "casefold.h"
166
167
#undef U
168
#undef D
169
#undef F
170
#undef ST
171
#undef SU
172
#undef SL
173
#undef IT
174
#undef I
175
#undef L
176
177
#include "name2ctype.h"
178
179
3.26M
#define CODE_RANGES_NUM numberof(CodeRanges)
180
181
extern int
182
onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
183
5.06M
{
184
5.06M
  if (
185
5.06M
#ifdef USE_UNICODE_PROPERTIES
186
5.06M
      ctype <= ONIGENC_MAX_STD_CTYPE &&
187
5.06M
#endif
188
5.06M
      code < 256) {
189
5.05M
    return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
190
5.05M
  }
191
192
4.18k
  if (ctype >= CODE_RANGES_NUM) {
193
0
    return ONIGERR_TYPE_BUG;
194
0
  }
195
196
4.18k
  return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
197
4.18k
}
198
199
200
extern int
201
onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint* ranges[])
202
3.25M
{
203
3.25M
  if (ctype >= CODE_RANGES_NUM) {
204
0
    return ONIGERR_TYPE_BUG;
205
0
  }
206
207
3.25M
  *ranges = CodeRanges[ctype];
208
209
3.25M
  return 0;
210
3.25M
}
211
212
extern int
213
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
214
                                      const OnigCodePoint* ranges[],
215
              OnigEncoding enc ARG_UNUSED)
216
0
{
217
0
  *sb_out = 0x00;
218
0
  return onigenc_unicode_ctype_code_range(ctype, ranges);
219
0
}
220
221
51.5M
#define PROPERTY_NAME_MAX_SIZE    (MAX_WORD_LENGTH + 1)
222
223
extern int
224
onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* name, const UChar* end)
225
2.14M
{
226
2.14M
  int len;
227
2.14M
  int ctype;
228
2.14M
  UChar buf[PROPERTY_NAME_MAX_SIZE];
229
2.14M
  const UChar *p;
230
2.14M
  OnigCodePoint code;
231
232
2.14M
  len = 0;
233
57.6M
  for (p = name; p < end; p += enclen(enc, p, end)) {
234
55.4M
    code = ONIGENC_MBC_TO_CODE(enc, p, end);
235
55.4M
    if (code == ' ' || code == '-' || code == '_')
236
3.92M
      continue;
237
51.5M
    if (code >= 0x80)
238
1
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
239
240
51.5M
    buf[len++] = ONIGENC_ASCII_CODE_TO_LOWER_CASE(code);
241
51.5M
    if (len >= PROPERTY_NAME_MAX_SIZE)
242
0
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
243
51.5M
  }
244
245
2.14M
  buf[len] = 0;
246
247
2.14M
  if ((ctype = uniname2ctype(buf, len)) < 0) {
248
4
    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
249
4
  }
250
251
2.14M
  return ctype;
252
2.14M
}
253
254
762k
/* Short aliases for the lookup functions used throughout this file: the
 * fold lookup and the unfold lookups keyed by one, two and three code
 * points respectively. */
#define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup
255
272k
#define onigenc_unicode_unfold1_lookup onigenc_unicode_CaseUnfold_11_lookup
256
181k
#define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup
257
163k
#define onigenc_unicode_unfold3_lookup onigenc_unicode_CaseUnfold_13_lookup
258
259
/* Code points that this file handles explicitly for the dotted/dotless i
 * (ONIGENC_CASE_FOLD_TURKISH_AZERI) rules. */
enum {
260
  I_WITH_DOT_ABOVE = 0x0130,
261
  DOTLESS_i = 0x0131,
262
  DOT_ABOVE = 0x0307
263
};
264
265
extern int
266
onigenc_unicode_mbc_case_fold(OnigEncoding enc,
267
    OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
268
    UChar* fold)
269
156k
{
270
156k
  const CodePointList3 *to;
271
156k
  OnigCodePoint code;
272
156k
  int i, len, rlen;
273
156k
  const UChar *p = *pp;
274
275
156k
  code = ONIGENC_MBC_TO_CODE(enc, p, end);
276
156k
  len = enclen(enc, p, end);
277
156k
  *pp += len;
278
279
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
280
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
281
    if (code == 'I') {
282
      return ONIGENC_CODE_TO_MBC(enc, DOTLESS_i, fold);
283
    }
284
    else if (code == I_WITH_DOT_ABOVE) {
285
      return ONIGENC_CODE_TO_MBC(enc, 'i', fold);
286
    }
287
  }
288
#endif
289
290
156k
  if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
291
72.2k
    if (OnigCodePointCount(to->n) == 1) {
292
71.6k
      return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold);
293
71.6k
    }
294
#if 0
295
    /* NO NEEDS TO CHECK */
296
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0)
297
#else
298
679
    else
299
679
#endif
300
679
    {
301
679
      rlen = 0;
302
2.63k
      for (i = 0; i < OnigCodePointCount(to->n); i++) {
303
1.95k
  len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold);
304
1.95k
  fold += len;
305
1.95k
  rlen += len;
306
1.95k
      }
307
679
      return rlen;
308
679
    }
309
72.2k
  }
310
311
174k
  for (i = 0; i < len; i++) {
312
89.8k
    *fold++ = *p++;
313
89.8k
  }
314
84.6k
  return len;
315
156k
}
316
317
extern int
318
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
319
            OnigApplyAllCaseFoldFunc f, void* arg,
320
            OnigEncoding enc ARG_UNUSED)
321
3.54k
{
322
3.54k
  const CaseUnfold_11_Type* p11;
323
3.54k
  OnigCodePoint code;
324
3.54k
  int i, j, k, r;
325
326
4.95M
  for (i = 0; i < numberof(CaseUnfold_11); i++) {
327
4.94M
    p11 = &CaseUnfold_11[i];
328
10.0M
    for (j = 0; j < OnigCodePointCount(p11->to.n); j++) {
329
5.05M
      code = p11->from;
330
5.05M
      r = (*f)(p11->to.code[j], &code, 1, arg);
331
5.05M
      if (r != 0) return r;
332
333
5.05M
      code = p11->to.code[j];
334
5.05M
      r = (*f)(p11->from, &code, 1, arg);
335
5.05M
      if (r != 0) return r;
336
337
5.17M
      for (k = 0; k < j; k++) {
338
117k
  r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), 1, arg);
339
117k
  if (r != 0) return r;
340
341
117k
  r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), 1, arg);
342
117k
  if (r != 0) return r;
343
117k
      }
344
5.05M
    }
345
4.94M
  }
346
347
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
348
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
349
    code = DOTLESS_i;
350
    r = (*f)('I', &code, 1, arg);
351
    if (r != 0) return r;
352
    code = 'I';
353
    r = (*f)(DOTLESS_i, &code, 1, arg);
354
    if (r != 0) return r;
355
356
    code = I_WITH_DOT_ABOVE;
357
    r = (*f)('i', &code, 1, arg);
358
    if (r != 0) return r;
359
    code = 'i';
360
    r = (*f)(I_WITH_DOT_ABOVE, &code, 1, arg);
361
    if (r != 0) return r;
362
  }
363
  else {
364
#endif
365
7.09k
    for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) {
366
3.54k
      p11 = &CaseUnfold_11_Locale[i];
367
7.09k
      for (j = 0; j < OnigCodePointCount(p11->to.n); j++) {
368
3.54k
  code = p11->from;
369
3.54k
  r = (*f)(p11->to.code[j], &code, 1, arg);
370
3.54k
  if (r != 0) return r;
371
372
3.54k
  code = p11->to.code[j];
373
3.54k
  r = (*f)(p11->from, &code, 1, arg);
374
3.54k
  if (r != 0) return r;
375
376
3.54k
  for (k = 0; k < j; k++) {
377
0
    r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]),
378
0
       1, arg);
379
0
    if (r != 0) return r;
380
381
0
    r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]),
382
0
       1, arg);
383
0
    if (r != 0) return r;
384
0
  }
385
3.54k
      }
386
3.54k
    }
387
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
388
  }
389
#endif
390
391
3.54k
  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
392
209k
    for (i = 0; i < numberof(CaseUnfold_12); i++) {
393
514k
      for (j = 0; j < OnigCodePointCount(CaseUnfold_12[i].to.n); j++) {
394
308k
  r = (*f)(CaseUnfold_12[i].to.code[j],
395
308k
     (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg);
396
308k
  if (r != 0) return r;
397
398
823k
  for (k = 0; k < OnigCodePointCount(CaseUnfold_12[i].to.n); k++) {
399
514k
    if (k == j) continue;
400
401
205k
    r = (*f)(CaseUnfold_12[i].to.code[j],
402
205k
       (OnigCodePoint* )(&CaseUnfold_12[i].to.code[k]), 1, arg);
403
205k
    if (r != 0) return r;
404
205k
  }
405
308k
      }
406
205k
    }
407
408
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
409
    if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
410
#endif
411
7.09k
      for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) {
412
7.09k
  for (j = 0; j < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); j++) {
413
3.54k
    r = (*f)(CaseUnfold_12_Locale[i].to.code[j],
414
3.54k
       (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg);
415
3.54k
    if (r != 0) return r;
416
417
7.09k
    for (k = 0; k < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); k++) {
418
3.54k
      if (k == j) continue;
419
420
0
      r = (*f)(CaseUnfold_12_Locale[i].to.code[j],
421
0
         (OnigCodePoint* )(&CaseUnfold_12_Locale[i].to.code[k]),
422
0
         1, arg);
423
0
      if (r != 0) return r;
424
0
    }
425
3.54k
  }
426
3.54k
      }
427
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
428
    }
429
#endif
430
431
53.2k
    for (i = 0; i < numberof(CaseUnfold_13); i++) {
432
106k
      for (j = 0; j < OnigCodePointCount(CaseUnfold_13[i].to.n); j++) {
433
56.7k
  r = (*f)(CaseUnfold_13[i].to.code[j],
434
56.7k
     (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg);
435
56.7k
  if (r != 0) return r;
436
437
127k
  for (k = 0; k < OnigCodePointCount(CaseUnfold_13[i].to.n); k++) {
438
70.9k
    if (k == j) continue;
439
440
14.1k
    r = (*f)(CaseUnfold_13[i].to.code[j],
441
14.1k
       (OnigCodePoint* )(&CaseUnfold_13[i].to.code[k]), 1, arg);
442
14.1k
    if (r != 0) return r;
443
14.1k
  }
444
56.7k
      }
445
49.6k
    }
446
3.54k
  }
447
448
3.54k
  return 0;
449
3.54k
}
450
451
141k
#define CodePointListValidP(x) (OnigCodePointCount((x)->n) <= numberof((x)->code))
452
453
extern int
454
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
455
    OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
456
    OnigCaseFoldCodeItem items[])
457
265k
{
458
265k
  int n, i, j, k, len;
459
265k
  OnigCodePoint code, codes[3];
460
265k
  const CodePointList3 *to, *z3;
461
265k
  const CodePointList2 *z2;
462
463
265k
  n = 0;
464
465
265k
  code = ONIGENC_MBC_TO_CODE(enc, p, end);
466
265k
  len = enclen(enc, p, end);
467
468
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
469
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
470
    switch (code) {
471
    case 'I':
472
      items[0].byte_len = len;
473
      items[0].code_len = 1;
474
      items[0].code[0]  = DOTLESS_i;
475
      return 1;
476
    case I_WITH_DOT_ABOVE:
477
      items[0].byte_len = len;
478
      items[0].code_len = 1;
479
      items[0].code[0]  = 'i';
480
      return 1;
481
    case DOTLESS_i:
482
      items[0].byte_len = len;
483
      items[0].code_len = 1;
484
      items[0].code[0]  = 'I';
485
      return 1;
486
    case 'i':
487
      items[0].byte_len = len;
488
      items[0].code_len = 1;
489
      items[0].code[0]  = I_WITH_DOT_ABOVE;
490
      return 1;
491
    }
492
  }
493
#endif
494
495
265k
  if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
496
69.4k
    if (OnigCodePointCount(to->n) == 1) {
497
64.9k
      OnigCodePoint orig_code = code;
498
499
64.9k
      items[0].byte_len = len;
500
64.9k
      items[0].code_len = 1;
501
64.9k
      items[0].code[0]  = to->code[0];
502
64.9k
      n++;
503
504
64.9k
      code = to->code[0];
505
64.9k
      if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 &&
506
64.9k
    CodePointListValidP(to)) {
507
159k
  for (i = 0; i < OnigCodePointCount(to->n); i++) {
508
94.1k
    if (to->code[i] != orig_code) {
509
29.1k
      items[n].byte_len = len;
510
29.1k
      items[n].code_len = 1;
511
29.1k
      items[n].code[0]  = to->code[i];
512
29.1k
      n++;
513
29.1k
    }
514
94.1k
  }
515
64.9k
      }
516
64.9k
    }
517
4.45k
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
518
4.45k
      OnigCodePoint cs[3][4];
519
4.45k
      int fn, ncs[3];
520
521
16.2k
      for (fn = 0; fn < OnigCodePointCount(to->n); fn++) {
522
11.7k
  cs[fn][0] = to->code[fn];
523
11.7k
  if ((z3 = onigenc_unicode_unfold1_lookup(cs[fn][0])) != 0) {
524
20.2k
    for (i = 0; i < OnigCodePointCount(z3->n); i++) {
525
13.8k
      cs[fn][i+1] = z3->code[i];
526
13.8k
    }
527
6.37k
    ncs[fn] = OnigCodePointCount(z3->n) + 1;
528
6.37k
  }
529
5.39k
  else
530
5.39k
    ncs[fn] = 1;
531
11.7k
      }
532
533
4.45k
      if (fn == 2) {
534
6.43k
  for (i = 0; i < ncs[0]; i++) {
535
14.6k
    for (j = 0; j < ncs[1]; j++) {
536
9.81k
      items[n].byte_len = len;
537
9.81k
      items[n].code_len = 2;
538
9.81k
      items[n].code[0]  = cs[0][i];
539
9.81k
      items[n].code[1]  = cs[1][j];
540
9.81k
      n++;
541
9.81k
    }
542
4.81k
  }
543
544
1.61k
  if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0 &&
545
1.61k
      CodePointListValidP(z2)) {
546
4.81k
    for (i = 0; i < OnigCodePointCount(z2->n); i++) {
547
3.20k
      if (z2->code[i] == code) continue;
548
549
1.59k
      items[n].byte_len = len;
550
1.59k
      items[n].code_len = 1;
551
1.59k
      items[n].code[0]  = z2->code[i];
552
1.59k
      n++;
553
1.59k
    }
554
1.61k
  }
555
1.61k
      }
556
2.84k
      else {
557
13.7k
  for (i = 0; i < ncs[0]; i++) {
558
21.8k
    for (j = 0; j < ncs[1]; j++) {
559
24.2k
      for (k = 0; k < ncs[2]; k++) {
560
13.3k
        items[n].byte_len = len;
561
13.3k
        items[n].code_len = 3;
562
13.3k
        items[n].code[0]  = cs[0][i];
563
13.3k
        items[n].code[1]  = cs[1][j];
564
13.3k
        items[n].code[2]  = cs[2][k];
565
13.3k
        n++;
566
13.3k
      }
567
10.9k
    }
568
10.9k
  }
569
570
2.84k
  if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0 &&
571
2.84k
      CodePointListValidP(z2)) {
572
8.21k
    for (i = 0; i < OnigCodePointCount(z2->n); i++) {
573
5.36k
      if (z2->code[i] == code) continue;
574
575
2.52k
      items[n].byte_len = len;
576
2.52k
      items[n].code_len = 1;
577
2.52k
      items[n].code[0]  = z2->code[i];
578
2.52k
      n++;
579
2.52k
    }
580
2.84k
  }
581
2.84k
      }
582
583
      /* multi char folded code is not head of another folded multi char */
584
4.45k
      flag = 0; /* DISABLE_CASE_FOLD_MULTI_CHAR(flag); */
585
4.45k
    }
586
69.4k
  }
587
195k
  else {
588
195k
    if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 &&
589
70.6k
  CodePointListValidP(to)) {
590
145k
      for (i = 0; i < OnigCodePointCount(to->n); i++) {
591
75.1k
  items[n].byte_len = len;
592
75.1k
  items[n].code_len = 1;
593
75.1k
  items[n].code[0]  = to->code[i];
594
75.1k
  n++;
595
75.1k
      }
596
70.6k
    }
597
195k
  }
598
599
600
265k
  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
601
211k
    p += len;
602
211k
    if (p < end) {
603
179k
      int clen;
604
605
179k
      codes[0] = code;
606
179k
      code = ONIGENC_MBC_TO_CODE(enc, p, end);
607
179k
      if ((to = onigenc_unicode_fold_lookup(code)) != 0
608
61.4k
    && OnigCodePointCount(to->n) == 1) {
609
60.4k
  codes[1] = to->code[0];
610
60.4k
      }
611
119k
      else
612
119k
  codes[1] = code;
613
614
179k
      clen = enclen(enc, p, end);
615
179k
      len += clen;
616
179k
      if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0 &&
617
1.31k
    CodePointListValidP(z2)) {
618
2.86k
  for (i = 0; i < OnigCodePointCount(z2->n); i++) {
619
1.55k
    items[n].byte_len = len;
620
1.55k
    items[n].code_len = 1;
621
1.55k
    items[n].code[0]  = z2->code[i];
622
1.55k
    n++;
623
1.55k
  }
624
1.31k
      }
625
626
179k
      p += clen;
627
179k
      if (p < end) {
628
160k
  code = ONIGENC_MBC_TO_CODE(enc, p, end);
629
160k
  if ((to = onigenc_unicode_fold_lookup(code)) != 0
630
55.5k
      && OnigCodePointCount(to->n) == 1) {
631
54.7k
    codes[2] = to->code[0];
632
54.7k
  }
633
105k
  else
634
105k
    codes[2] = code;
635
636
160k
  clen = enclen(enc, p, end);
637
160k
  len += clen;
638
160k
  if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0 &&
639
0
      CodePointListValidP(z2)) {
640
0
    for (i = 0; i < OnigCodePointCount(z2->n); i++) {
641
0
      items[n].byte_len = len;
642
0
      items[n].code_len = 1;
643
0
      items[n].code[0]  = z2->code[i];
644
0
      n++;
645
0
    }
646
0
  }
647
160k
      }
648
179k
    }
649
211k
  }
650
651
265k
  return n;
652
265k
}
653
654
#ifdef USE_CASE_MAP_API
655
/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
656
0
#define CASE_MAPPING_SLACK 12
657
0
#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED)
658
extern int
659
onigenc_unicode_case_map(OnigCaseFoldType* flagP,
660
    const OnigUChar** pp, const OnigUChar* end,
661
    OnigUChar* to, OnigUChar* to_end,
662
    const struct OnigEncodingTypeST* enc)
663
0
{
664
0
  OnigCodePoint code;
665
0
  OnigUChar *to_start = to;
666
0
  OnigCaseFoldType flags = *flagP;
667
0
  int codepoint_length;
668
669
0
  to_end -= CASE_MAPPING_SLACK;
670
  /* copy flags ONIGENC_CASE_UPCASE     and ONIGENC_CASE_DOWNCASE over to
671
   *            ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */
672
0
  flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET;
673
674
0
  while (*pp < end && to <= to_end) {
675
0
    codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
676
0
    if (codepoint_length < 0)
677
0
      return codepoint_length; /* encoding invalid */
678
0
    code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
679
0
    *pp += codepoint_length;
680
681
0
    if (code <= 'z') { /* ASCII comes first */
682
0
      if (code >= 'a' /*&& code <= 'z'*/) {
683
0
  if (flags & ONIGENC_CASE_UPCASE) {
684
0
    MODIFIED;
685
0
    if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i')
686
0
      code = I_WITH_DOT_ABOVE;
687
0
          else
688
0
            code -= 'a' - 'A';
689
0
  }
690
0
      }
691
0
      else if (code >= 'A' && code <= 'Z') {
692
0
  if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
693
0
    MODIFIED;
694
0
    if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I')
695
0
      code = DOTLESS_i;
696
0
    else
697
0
      code += 'a' - 'A';
698
0
  }
699
0
      }
700
0
    }
701
0
    else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */
702
0
      const CodePointList3 *folded;
703
704
0
      if (code == I_WITH_DOT_ABOVE) {
705
0
  if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
706
0
    MODIFIED;
707
0
    code = 'i';
708
0
    if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */
709
0
      to += ONIGENC_CODE_TO_MBC(enc, code, to);
710
0
      code = DOT_ABOVE;
711
0
    }
712
0
  }
713
0
      }
714
0
      else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */
715
0
  if (flags & ONIGENC_CASE_UPCASE) {
716
0
    MODIFIED;
717
0
    code = 'I';
718
0
  }
719
0
      }
720
0
      else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */
721
0
  if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */
722
0
          MODIFIED;
723
0
    code += 0x10D0 - 0x1C90;
724
0
        }
725
0
        else if ((flags & ONIGENC_CASE_TITLECASE)                            /* Titlecase needed, */
726
0
      && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase  */
727
    /* already Titlecase, no changes needed */
728
0
  }
729
0
  else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
730
0
    const OnigCodePoint *next;
731
0
    int count;
732
733
0
    MODIFIED;
734
0
    if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */
735
0
      const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n);
736
737
0
      if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */
738
0
        if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE))
739
0
      == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */
740
0
    goto SpecialsCopy;
741
0
        else /* swapCASE not needed */
742
0
    SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
743
0
      }
744
0
      if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */
745
0
        if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */
746
0
    goto SpecialsCopy;
747
0
        else /* Titlecase not needed */
748
0
    SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
749
0
      }
750
0
      if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) {
751
0
        if (!(flags & ONIGENC_CASE_DOWN_SPECIAL))
752
0
    SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
753
0
      }
754
      /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */
755
0
SpecialsCopy:
756
0
      count = SpecialsLengthExtract(*SpecialsStart);
757
0
      next = SpecialsStart;
758
0
      code = SpecialsCodepointExtract(*next++);
759
0
    }
760
0
    else { /* no specials */
761
0
      count = OnigCodePointCount(folded->n);
762
0
      next = folded->code;
763
0
      code = *next++;
764
0
    }
765
0
    if (count == 1)
766
0
      ;
767
0
    else if (count == 2) {
768
0
      to += ONIGENC_CODE_TO_MBC(enc, code, to);
769
0
      code = *next;
770
0
    }
771
0
    else { /* count == 3 */
772
0
      to += ONIGENC_CODE_TO_MBC(enc, code, to);
773
0
      to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
774
0
      code = *next;
775
0
    }
776
0
  }
777
0
      }
778
0
      else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */
779
0
  if ((flags & ONIGENC_CASE_TITLECASE)                                 /* Titlecase needed, */
780
0
      && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
781
    /* already Titlecase, no changes needed */
782
0
  }
783
0
  else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
784
0
    MODIFIED;
785
0
    code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0];
786
0
  }
787
0
      }
788
0
    }
789
0
    to += ONIGENC_CODE_TO_MBC(enc, code, to);
790
    /* switch from titlecase to lowercase for capitalize */
791
0
    if (flags & ONIGENC_CASE_TITLECASE)
792
0
      flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE |
793
0
    ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL);
794
0
  }
795
0
  *flagP = flags;
796
0
  return (int )(to - to_start);
797
0
}
798
#endif
799
800
#if 0
801
const char onigenc_unicode_version_string[] =
802
#ifdef ONIG_UNICODE_VERSION_STRING
803
    ONIG_UNICODE_VERSION_STRING
804
#endif
805
    "";
806
807
const int onigenc_unicode_version_number[3] = {
808
#ifdef ONIG_UNICODE_VERSION_MAJOR
809
    ONIG_UNICODE_VERSION_MAJOR,
810
    ONIG_UNICODE_VERSION_MINOR,
811
    ONIG_UNICODE_VERSION_TEENY,
812
#else
813
    0
814
#endif
815
};
816
#endif