Coverage Report

Created: 2025-09-04 07:51

/src/fluent-bit/lib/onigmo/enc/unicode.c
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
  unicode.c -  Oniguruma (regular expression library)
3
**********************************************************************/
4
/*-
5
 * Copyright (c) 2002-2013  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6
 * All rights reserved.
7
 *
8
 * Redistribution and use in source and binary forms, with or without
9
 * modification, are permitted provided that the following conditions
10
 * are met:
11
 * 1. Redistributions of source code must retain the above copyright
12
 *    notice, this list of conditions and the following disclaimer.
13
 * 2. Redistributions in binary form must reproduce the above copyright
14
 *    notice, this list of conditions and the following disclaimer in the
15
 *    documentation and/or other materials provided with the distribution.
16
 *
17
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27
 * SUCH DAMAGE.
28
 */
29
30
#include "regint.h"
31
32
/* Evaluates to nonzero when the bit selected by CTYPE_TO_BIT(ctype) is set in
 * the table entry for `code`.  `code` is used directly as an index into the
 * 256-entry EncUNICODE_ISO_8859_1_CtypeTable, so callers must guarantee
 * code < 256 before using this macro. */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
33
2.19M
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
34
#if 0
35
/* Disabled variant that tests an already-computed bit mask instead of a ctype. */
#define ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(code,cbit) \
36
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & (cbit)) != 0)
37
#endif
38
39
/* Character-type bit masks for code points 0x00..0xFF, one 16-bit entry per
 * code point, eight entries per row.  Entries are tested with
 * CTYPE_TO_BIT(ctype) through ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE. */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
40
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
41
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
42
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
43
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
44
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
45
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
46
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
47
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
48
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
49
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
50
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
51
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
52
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
53
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
54
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
55
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
56
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
57
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
58
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
59
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
60
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
61
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
62
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
63
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
64
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
65
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
66
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
67
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
68
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
69
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
70
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
71
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
72
};
73
74
/* List of up to three code points.  Elsewhere in this file `n` is read through
 * OnigCodePointCount() and OnigCaseFoldFlags(), i.e. it carries the number of
 * valid entries in `code` together with case-mapping flag bits. */
typedef struct {
75
  int n;
76
  OnigCodePoint code[3];
77
} CodePointList3;
78
79
/* Table entry relating one source code point (`from`) to a list of up to
 * three code points (`to`). */
typedef struct {
80
  OnigCodePoint  from;
81
  CodePointList3 to;
82
} CaseFold_11_Type;
83
84
/* Same layout as CaseFold_11_Type; used for the CaseUnfold_11 tables. */
typedef struct {
85
  OnigCodePoint  from;
86
  CodePointList3 to;
87
} CaseUnfold_11_Type;
88
89
/* List of up to two code points; `n` is interpreted the same way as in
 * CodePointList3. */
typedef struct {
90
  int n;
91
  OnigCodePoint code[2];
92
} CodePointList2;
93
94
/* Table entry relating a sequence of two code points (`from`) to a list of
 * up to two single code points (`to`). */
typedef struct {
95
  OnigCodePoint  from[2];
96
  CodePointList2 to;
97
} CaseUnfold_12_Type;
98
99
/* Table entry relating a sequence of three code points (`from`) to a list of
 * up to two single code points (`to`). */
typedef struct {
100
  OnigCodePoint  from[3];
101
  CodePointList2 to;
102
} CaseUnfold_13_Type;
103
104
static inline int
105
bits_of(const OnigCodePoint c, const int n)
106
5.30M
{
107
5.30M
  return (c >> (2 - n) * 7) & 127;
108
5.30M
}
109
110
static inline int
111
bits_at(const OnigCodePoint *c, const int n)
112
2.12M
{
113
2.12M
  return bits_of(c[n / 3], n % 3);
114
2.12M
}
115
116
static int
117
code1_equal(const OnigCodePoint x, const OnigCodePoint y)
118
951k
{
119
951k
  if (x != y) return 0;
120
707k
  return 1;
121
951k
}
122
123
static int
124
code2_equal(const OnigCodePoint *x, const OnigCodePoint *y)
125
73.3k
{
126
73.3k
  if (x[0] != y[0]) return 0;
127
3.72k
  if (x[1] != y[1]) return 0;
128
3.59k
  return 1;
129
3.72k
}
130
131
static int
132
code3_equal(const OnigCodePoint *x, const OnigCodePoint *y)
133
1.04k
{
134
1.04k
  if (x[0] != y[0]) return 0;
135
585
  if (x[1] != y[1]) return 0;
136
445
  if (x[2] != y[2]) return 0;
137
445
  return 1;
138
445
}
139
140
/* macros related to ONIGENC_CASE flags */
141
/* defined here because not used in other files */
142
0
/* Union of the four flag bits that mark an entry as needing the
 * CaseMappingSpecials array rather than the plain folded code points. */
#define ONIGENC_CASE_SPECIALS       (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL)
143
144
/* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */
145
0
#define SpecialsLengthOffset 25  /* needs to be higher than the 22 bits used for Unicode codepoints */
146
0
/* Length field: the bits of `n` above SpecialsLengthOffset. */
#define SpecialsLengthExtract(n)    ((n) >> SpecialsLengthOffset)
147
0
/* Code point field: the low SpecialsLengthOffset bits of `n`. */
#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1))
148
/* Places a length value into the length field (inverse of SpecialsLengthExtract). */
#define SpecialsLengthEncode(n)     ((n) << SpecialsLengthOffset)
149
150
0
/* Mask covering the OnigSpecialIndexWidth-bit index field located at
 * OnigSpecialIndexShift (both constants are defined outside this file section). */
#define OnigSpecialIndexMask        (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift)
151
/* Places an index into the index field. */
#define OnigSpecialIndexEncode(n)   ((n) << OnigSpecialIndexShift)
152
0
/* Recovers the index stored by OnigSpecialIndexEncode. */
#define OnigSpecialIndexDecode(n)   (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift)
153
154
/* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */
155
#define U ONIGENC_CASE_UPCASE
156
#define D ONIGENC_CASE_DOWNCASE
157
#define F ONIGENC_CASE_FOLD
158
#define ST ONIGENC_CASE_TITLECASE
159
#define SU ONIGENC_CASE_UP_SPECIAL
160
#define SL ONIGENC_CASE_DOWN_SPECIAL
161
#define IT ONIGENC_CASE_IS_TITLECASE
162
#define I(n) OnigSpecialIndexEncode(n)
163
#define L(n) SpecialsLengthEncode(n)
164
165
#include "casefold.h"
166
167
#undef U
168
#undef D
169
#undef F
170
#undef ST
171
#undef SU
172
#undef SL
173
#undef IT
174
#undef I
175
#undef L
176
177
#include "name2ctype.h"
178
179
2.97M
/* Number of entries in the CodeRanges table; used as the exclusive upper
 * bound when a ctype value is used to index CodeRanges. */
#define CODE_RANGES_NUM numberof(CodeRanges)
180
181
extern int
182
onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
183
2.20M
{
184
2.20M
  if (
185
2.20M
#ifdef USE_UNICODE_PROPERTIES
186
2.20M
      ctype <= ONIGENC_MAX_STD_CTYPE &&
187
2.20M
#endif
188
2.20M
      code < 256) {
189
2.19M
    return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
190
2.19M
  }
191
192
4.64k
  if (ctype >= CODE_RANGES_NUM) {
193
0
    return ONIGERR_TYPE_BUG;
194
0
  }
195
196
4.64k
  return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
197
4.64k
}
198
199
200
extern int
201
onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint* ranges[])
202
2.96M
{
203
2.96M
  if (ctype >= CODE_RANGES_NUM) {
204
0
    return ONIGERR_TYPE_BUG;
205
0
  }
206
207
2.96M
  *ranges = CodeRanges[ctype];
208
209
2.96M
  return 0;
210
2.96M
}
211
212
extern int
213
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
214
                                      const OnigCodePoint* ranges[],
215
              OnigEncoding enc ARG_UNUSED)
216
0
{
217
0
  *sb_out = 0x00;
218
0
  return onigenc_unicode_ctype_code_range(ctype, ranges);
219
0
}
220
221
63.7M
/* Size of the buffer holding a normalized property name: MAX_WORD_LENGTH
 * characters plus one byte for the terminating 0. */
#define PROPERTY_NAME_MAX_SIZE    (MAX_WORD_LENGTH + 1)
222
223
extern int
224
onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* name, const UChar* end)
225
2.64M
{
226
2.64M
  int len;
227
2.64M
  int ctype;
228
2.64M
  UChar buf[PROPERTY_NAME_MAX_SIZE];
229
2.64M
  const UChar *p;
230
2.64M
  OnigCodePoint code;
231
232
2.64M
  len = 0;
233
71.2M
  for (p = name; p < end; p += enclen(enc, p, end)) {
234
68.5M
    code = ONIGENC_MBC_TO_CODE(enc, p, end);
235
68.5M
    if (code == ' ' || code == '-' || code == '_')
236
4.85M
      continue;
237
63.7M
    if (code >= 0x80)
238
0
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
239
240
63.7M
    buf[len++] = ONIGENC_ASCII_CODE_TO_LOWER_CASE(code);
241
63.7M
    if (len >= PROPERTY_NAME_MAX_SIZE)
242
0
      return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
243
63.7M
  }
244
245
2.64M
  buf[len] = 0;
246
247
2.64M
  if ((ctype = uniname2ctype(buf, len)) < 0) {
248
0
    return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
249
0
  }
250
251
2.64M
  return ctype;
252
2.64M
}
253
254
1.04M
/* Short aliases for the table lookup functions used below.  The functions
 * themselves are not defined in this part of the file (presumably they come
 * from casefold.h -- confirm).  As used here, the fold and unfold1 lookups
 * yield a `const CodePointList3 *`, the unfold2/unfold3 lookups a
 * `const CodePointList2 *`, and all of them yield 0 when nothing is found. */
#define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup
255
374k
#define onigenc_unicode_unfold1_lookup onigenc_unicode_CaseUnfold_11_lookup
256
255k
#define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup
257
233k
#define onigenc_unicode_unfold3_lookup onigenc_unicode_CaseUnfold_13_lookup
258
259
/* Code points that receive special treatment in the Turkish/Azeri handling
 * and in onigenc_unicode_case_map(). */
enum {
260
  I_WITH_DOT_ABOVE = 0x0130,  /* U+0130, capital I with dot above */
261
  DOTLESS_i = 0x0131,         /* U+0131, small dotless i */
262
  DOT_ABOVE = 0x0307          /* U+0307, emitted after 'i' to make the dot explicit */
263
};
264
265
extern int
266
onigenc_unicode_mbc_case_fold(OnigEncoding enc,
267
    OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
268
    UChar* fold)
269
190k
{
270
190k
  const CodePointList3 *to;
271
190k
  OnigCodePoint code;
272
190k
  int i, len, rlen;
273
190k
  const UChar *p = *pp;
274
275
190k
  code = ONIGENC_MBC_TO_CODE(enc, p, end);
276
190k
  len = enclen(enc, p, end);
277
190k
  *pp += len;
278
279
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
280
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
281
    if (code == 'I') {
282
      return ONIGENC_CODE_TO_MBC(enc, DOTLESS_i, fold);
283
    }
284
    else if (code == I_WITH_DOT_ABOVE) {
285
      return ONIGENC_CODE_TO_MBC(enc, 'i', fold);
286
    }
287
  }
288
#endif
289
290
190k
  if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
291
110k
    if (OnigCodePointCount(to->n) == 1) {
292
110k
      return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold);
293
110k
    }
294
#if 0
295
    /* NO NEEDS TO CHECK */
296
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0)
297
#else
298
255
    else
299
255
#endif
300
255
    {
301
255
      rlen = 0;
302
882
      for (i = 0; i < OnigCodePointCount(to->n); i++) {
303
627
  len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold);
304
627
  fold += len;
305
627
  rlen += len;
306
627
      }
307
255
      return rlen;
308
255
    }
309
110k
  }
310
311
166k
  for (i = 0; i < len; i++) {
312
86.1k
    *fold++ = *p++;
313
86.1k
  }
314
79.9k
  return len;
315
190k
}
316
317
extern int
318
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
319
            OnigApplyAllCaseFoldFunc f, void* arg,
320
            OnigEncoding enc ARG_UNUSED)
321
3.48k
{
322
3.48k
  const CaseUnfold_11_Type* p11;
323
3.48k
  OnigCodePoint code;
324
3.48k
  int i, j, k, r;
325
326
4.87M
  for (i = 0; i < numberof(CaseUnfold_11); i++) {
327
4.86M
    p11 = &CaseUnfold_11[i];
328
9.83M
    for (j = 0; j < OnigCodePointCount(p11->to.n); j++) {
329
4.97M
      code = p11->from;
330
4.97M
      r = (*f)(p11->to.code[j], &code, 1, arg);
331
4.97M
      if (r != 0) return r;
332
333
4.97M
      code = p11->to.code[j];
334
4.97M
      r = (*f)(p11->from, &code, 1, arg);
335
4.97M
      if (r != 0) return r;
336
337
5.08M
      for (k = 0; k < j; k++) {
338
115k
  r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), 1, arg);
339
115k
  if (r != 0) return r;
340
341
115k
  r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), 1, arg);
342
115k
  if (r != 0) return r;
343
115k
      }
344
4.97M
    }
345
4.86M
  }
346
347
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
348
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
349
    code = DOTLESS_i;
350
    r = (*f)('I', &code, 1, arg);
351
    if (r != 0) return r;
352
    code = 'I';
353
    r = (*f)(DOTLESS_i, &code, 1, arg);
354
    if (r != 0) return r;
355
356
    code = I_WITH_DOT_ABOVE;
357
    r = (*f)('i', &code, 1, arg);
358
    if (r != 0) return r;
359
    code = 'i';
360
    r = (*f)(I_WITH_DOT_ABOVE, &code, 1, arg);
361
    if (r != 0) return r;
362
  }
363
  else {
364
#endif
365
6.97k
    for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) {
366
3.48k
      p11 = &CaseUnfold_11_Locale[i];
367
6.97k
      for (j = 0; j < OnigCodePointCount(p11->to.n); j++) {
368
3.48k
  code = p11->from;
369
3.48k
  r = (*f)(p11->to.code[j], &code, 1, arg);
370
3.48k
  if (r != 0) return r;
371
372
3.48k
  code = p11->to.code[j];
373
3.48k
  r = (*f)(p11->from, &code, 1, arg);
374
3.48k
  if (r != 0) return r;
375
376
3.48k
  for (k = 0; k < j; k++) {
377
0
    r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]),
378
0
       1, arg);
379
0
    if (r != 0) return r;
380
381
0
    r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]),
382
0
       1, arg);
383
0
    if (r != 0) return r;
384
0
  }
385
3.48k
      }
386
3.48k
    }
387
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
388
  }
389
#endif
390
391
3.48k
  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
392
205k
    for (i = 0; i < numberof(CaseUnfold_12); i++) {
393
505k
      for (j = 0; j < OnigCodePointCount(CaseUnfold_12[i].to.n); j++) {
394
303k
  r = (*f)(CaseUnfold_12[i].to.code[j],
395
303k
     (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg);
396
303k
  if (r != 0) return r;
397
398
809k
  for (k = 0; k < OnigCodePointCount(CaseUnfold_12[i].to.n); k++) {
399
505k
    if (k == j) continue;
400
401
202k
    r = (*f)(CaseUnfold_12[i].to.code[j],
402
202k
       (OnigCodePoint* )(&CaseUnfold_12[i].to.code[k]), 1, arg);
403
202k
    if (r != 0) return r;
404
202k
  }
405
303k
      }
406
202k
    }
407
408
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
409
    if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
410
#endif
411
6.97k
      for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) {
412
6.97k
  for (j = 0; j < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); j++) {
413
3.48k
    r = (*f)(CaseUnfold_12_Locale[i].to.code[j],
414
3.48k
       (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg);
415
3.48k
    if (r != 0) return r;
416
417
6.97k
    for (k = 0; k < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); k++) {
418
3.48k
      if (k == j) continue;
419
420
0
      r = (*f)(CaseUnfold_12_Locale[i].to.code[j],
421
0
         (OnigCodePoint* )(&CaseUnfold_12_Locale[i].to.code[k]),
422
0
         1, arg);
423
0
      if (r != 0) return r;
424
0
    }
425
3.48k
  }
426
3.48k
      }
427
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
428
    }
429
#endif
430
431
52.3k
    for (i = 0; i < numberof(CaseUnfold_13); i++) {
432
104k
      for (j = 0; j < OnigCodePointCount(CaseUnfold_13[i].to.n); j++) {
433
55.8k
  r = (*f)(CaseUnfold_13[i].to.code[j],
434
55.8k
     (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg);
435
55.8k
  if (r != 0) return r;
436
437
125k
  for (k = 0; k < OnigCodePointCount(CaseUnfold_13[i].to.n); k++) {
438
69.7k
    if (k == j) continue;
439
440
13.9k
    r = (*f)(CaseUnfold_13[i].to.code[j],
441
13.9k
       (OnigCodePoint* )(&CaseUnfold_13[i].to.code[k]), 1, arg);
442
13.9k
    if (r != 0) return r;
443
13.9k
  }
444
55.8k
      }
445
48.8k
    }
446
3.48k
  }
447
448
3.48k
  return 0;
449
3.48k
}
450
451
233k
/* True when the element count encoded in x->n does not exceed the capacity
 * of x->code, i.e. when iterating x->code up to that count stays in bounds.
 * `x` is a pointer to a CodePointList2 or CodePointList3. */
#define CodePointListValidP(x) (OnigCodePointCount((x)->n) <= numberof((x)->code))
452
453
extern int
454
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
455
    OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
456
    OnigCaseFoldCodeItem items[])
457
372k
{
458
372k
  int n, i, j, k, len;
459
372k
  OnigCodePoint code, codes[3];
460
372k
  const CodePointList3 *to, *z3;
461
372k
  const CodePointList2 *z2;
462
463
372k
  n = 0;
464
465
372k
  code = ONIGENC_MBC_TO_CODE(enc, p, end);
466
372k
  len = enclen(enc, p, end);
467
468
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
469
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
470
    switch (code) {
471
    case 'I':
472
      items[0].byte_len = len;
473
      items[0].code_len = 1;
474
      items[0].code[0]  = DOTLESS_i;
475
      return 1;
476
    case I_WITH_DOT_ABOVE:
477
      items[0].byte_len = len;
478
      items[0].code_len = 1;
479
      items[0].code[0]  = 'i';
480
      return 1;
481
    case DOTLESS_i:
482
      items[0].byte_len = len;
483
      items[0].code_len = 1;
484
      items[0].code[0]  = 'I';
485
      return 1;
486
    case 'i':
487
      items[0].byte_len = len;
488
      items[0].code_len = 1;
489
      items[0].code[0]  = I_WITH_DOT_ABOVE;
490
      return 1;
491
    }
492
  }
493
#endif
494
495
372k
  if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
496
127k
    if (OnigCodePointCount(to->n) == 1) {
497
125k
      OnigCodePoint orig_code = code;
498
499
125k
      items[0].byte_len = len;
500
125k
      items[0].code_len = 1;
501
125k
      items[0].code[0]  = to->code[0];
502
125k
      n++;
503
504
125k
      code = to->code[0];
505
125k
      if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 &&
506
125k
    CodePointListValidP(to)) {
507
311k
  for (i = 0; i < OnigCodePointCount(to->n); i++) {
508
185k
    if (to->code[i] != orig_code) {
509
59.9k
      items[n].byte_len = len;
510
59.9k
      items[n].code_len = 1;
511
59.9k
      items[n].code[0]  = to->code[i];
512
59.9k
      n++;
513
59.9k
    }
514
185k
  }
515
125k
      }
516
125k
    }
517
1.47k
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
518
1.47k
      OnigCodePoint cs[3][4];
519
1.47k
      int fn, ncs[3];
520
521
4.87k
      for (fn = 0; fn < OnigCodePointCount(to->n); fn++) {
522
3.39k
  cs[fn][0] = to->code[fn];
523
3.39k
  if ((z3 = onigenc_unicode_unfold1_lookup(cs[fn][0])) != 0) {
524
7.19k
    for (i = 0; i < OnigCodePointCount(z3->n); i++) {
525
4.62k
      cs[fn][i+1] = z3->code[i];
526
4.62k
    }
527
2.56k
    ncs[fn] = OnigCodePointCount(z3->n) + 1;
528
2.56k
  }
529
830
  else
530
830
    ncs[fn] = 1;
531
3.39k
      }
532
533
1.47k
      if (fn == 2) {
534
4.09k
  for (i = 0; i < ncs[0]; i++) {
535
9.47k
    for (j = 0; j < ncs[1]; j++) {
536
6.41k
      items[n].byte_len = len;
537
6.41k
      items[n].code_len = 2;
538
6.41k
      items[n].code[0]  = cs[0][i];
539
6.41k
      items[n].code[1]  = cs[1][j];
540
6.41k
      n++;
541
6.41k
    }
542
3.05k
  }
543
544
1.03k
  if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0 &&
545
1.03k
      CodePointListValidP(z2)) {
546
3.05k
    for (i = 0; i < OnigCodePointCount(z2->n); i++) {
547
2.02k
      if (z2->code[i] == code) continue;
548
549
994
      items[n].byte_len = len;
550
994
      items[n].code_len = 1;
551
994
      items[n].code[0]  = z2->code[i];
552
994
      n++;
553
994
    }
554
1.03k
  }
555
1.03k
      }
556
445
      else {
557
2.07k
  for (i = 0; i < ncs[0]; i++) {
558
3.26k
    for (j = 0; j < ncs[1]; j++) {
559
4.03k
      for (k = 0; k < ncs[2]; k++) {
560
2.40k
        items[n].byte_len = len;
561
2.40k
        items[n].code_len = 3;
562
2.40k
        items[n].code[0]  = cs[0][i];
563
2.40k
        items[n].code[1]  = cs[1][j];
564
2.40k
        items[n].code[2]  = cs[2][k];
565
2.40k
        n++;
566
2.40k
      }
567
1.63k
    }
568
1.63k
  }
569
570
445
  if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0 &&
571
445
      CodePointListValidP(z2)) {
572
1.23k
    for (i = 0; i < OnigCodePointCount(z2->n); i++) {
573
788
      if (z2->code[i] == code) continue;
574
575
343
      items[n].byte_len = len;
576
343
      items[n].code_len = 1;
577
343
      items[n].code[0]  = z2->code[i];
578
343
      n++;
579
343
    }
580
445
  }
581
445
      }
582
583
      /* multi char folded code is not head of another folded multi char */
584
1.47k
      flag = 0; /* DISABLE_CASE_FOLD_MULTI_CHAR(flag); */
585
1.47k
    }
586
127k
  }
587
244k
  else {
588
244k
    if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 &&
589
244k
  CodePointListValidP(to)) {
590
220k
      for (i = 0; i < OnigCodePointCount(to->n); i++) {
591
116k
  items[n].byte_len = len;
592
116k
  items[n].code_len = 1;
593
116k
  items[n].code[0]  = to->code[i];
594
116k
  n++;
595
116k
      }
596
103k
    }
597
244k
  }
598
599
600
372k
  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
601
287k
    p += len;
602
287k
    if (p < end) {
603
253k
      int clen;
604
605
253k
      codes[0] = code;
606
253k
      code = ONIGENC_MBC_TO_CODE(enc, p, end);
607
253k
      if ((to = onigenc_unicode_fold_lookup(code)) != 0
608
253k
    && OnigCodePointCount(to->n) == 1) {
609
121k
  codes[1] = to->code[0];
610
121k
      }
611
132k
      else
612
132k
  codes[1] = code;
613
614
253k
      clen = enclen(enc, p, end);
615
253k
      len += clen;
616
253k
      if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0 &&
617
253k
    CodePointListValidP(z2)) {
618
6.84k
  for (i = 0; i < OnigCodePointCount(z2->n); i++) {
619
4.28k
    items[n].byte_len = len;
620
4.28k
    items[n].code_len = 1;
621
4.28k
    items[n].code[0]  = z2->code[i];
622
4.28k
    n++;
623
4.28k
  }
624
2.56k
      }
625
626
253k
      p += clen;
627
253k
      if (p < end) {
628
233k
  code = ONIGENC_MBC_TO_CODE(enc, p, end);
629
233k
  if ((to = onigenc_unicode_fold_lookup(code)) != 0
630
233k
      && OnigCodePointCount(to->n) == 1) {
631
114k
    codes[2] = to->code[0];
632
114k
  }
633
118k
  else
634
118k
    codes[2] = code;
635
636
233k
  clen = enclen(enc, p, end);
637
233k
  len += clen;
638
233k
  if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0 &&
639
233k
      CodePointListValidP(z2)) {
640
0
    for (i = 0; i < OnigCodePointCount(z2->n); i++) {
641
0
      items[n].byte_len = len;
642
0
      items[n].code_len = 1;
643
0
      items[n].code[0]  = z2->code[i];
644
0
      n++;
645
0
    }
646
0
  }
647
233k
      }
648
253k
    }
649
287k
  }
650
651
372k
  return n;
652
372k
}
653
654
#ifdef USE_CASE_MAP_API
655
/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
/* onigenc_unicode_case_map() lowers its output limit by this amount so that
 * one fully expanded character always fits once the loop condition passed. */
656
0
#define CASE_MAPPING_SLACK 12
657
0
/* Records that the output differs from the input.  Expands to an assignment
 * to a variable named `flags`, which must exist in the expanding scope. */
#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED)
658
extern int
659
onigenc_unicode_case_map(OnigCaseFoldType* flagP,
660
    const OnigUChar** pp, const OnigUChar* end,
661
    OnigUChar* to, OnigUChar* to_end,
662
    const struct OnigEncodingTypeST* enc)
663
0
{
664
0
  OnigCodePoint code;
665
0
  OnigUChar *to_start = to;
666
0
  OnigCaseFoldType flags = *flagP;
667
0
  int codepoint_length;
668
669
0
  to_end -= CASE_MAPPING_SLACK;
670
  /* copy flags ONIGENC_CASE_UPCASE     and ONIGENC_CASE_DOWNCASE over to
671
   *            ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */
672
0
  flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET;
673
674
0
  while (*pp < end && to <= to_end) {
675
0
    codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
676
0
    if (codepoint_length < 0)
677
0
      return codepoint_length; /* encoding invalid */
678
0
    code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
679
0
    *pp += codepoint_length;
680
681
0
    if (code <= 'z') { /* ASCII comes first */
682
0
      if (code >= 'a' /*&& code <= 'z'*/) {
683
0
  if (flags & ONIGENC_CASE_UPCASE) {
684
0
    MODIFIED;
685
0
    if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i')
686
0
      code = I_WITH_DOT_ABOVE;
687
0
          else
688
0
            code -= 'a' - 'A';
689
0
  }
690
0
      }
691
0
      else if (code >= 'A' && code <= 'Z') {
692
0
  if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
693
0
    MODIFIED;
694
0
    if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I')
695
0
      code = DOTLESS_i;
696
0
    else
697
0
      code += 'a' - 'A';
698
0
  }
699
0
      }
700
0
    }
701
0
    else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */
702
0
      const CodePointList3 *folded;
703
704
0
      if (code == I_WITH_DOT_ABOVE) {
705
0
  if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
706
0
    MODIFIED;
707
0
    code = 'i';
708
0
    if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */
709
0
      to += ONIGENC_CODE_TO_MBC(enc, code, to);
710
0
      code = DOT_ABOVE;
711
0
    }
712
0
  }
713
0
      }
714
0
      else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */
715
0
  if (flags & ONIGENC_CASE_UPCASE) {
716
0
    MODIFIED;
717
0
    code = 'I';
718
0
  }
719
0
      }
720
0
      else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */
721
0
  if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */
722
0
          MODIFIED;
723
0
    code += 0x10D0 - 0x1C90;
724
0
        }
725
0
        else if ((flags & ONIGENC_CASE_TITLECASE)                            /* Titlecase needed, */
726
0
      && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase  */
727
    /* already Titlecase, no changes needed */
728
0
  }
729
0
  else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
730
0
    const OnigCodePoint *next;
731
0
    int count;
732
733
0
    MODIFIED;
734
0
    if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */
735
0
      const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n);
736
737
0
      if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */
738
0
        if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE))
739
0
      == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */
740
0
    goto SpecialsCopy;
741
0
        else /* swapCASE not needed */
742
0
    SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
743
0
      }
744
0
      if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */
745
0
        if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */
746
0
    goto SpecialsCopy;
747
0
        else /* Titlecase not needed */
748
0
    SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
749
0
      }
750
0
      if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) {
751
0
        if (!(flags & ONIGENC_CASE_DOWN_SPECIAL))
752
0
    SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
753
0
      }
754
      /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */
755
0
SpecialsCopy:
756
0
      count = SpecialsLengthExtract(*SpecialsStart);
757
0
      next = SpecialsStart;
758
0
      code = SpecialsCodepointExtract(*next++);
759
0
    }
760
0
    else { /* no specials */
761
0
      count = OnigCodePointCount(folded->n);
762
0
      next = folded->code;
763
0
      code = *next++;
764
0
    }
765
0
    if (count == 1)
766
0
      ;
767
0
    else if (count == 2) {
768
0
      to += ONIGENC_CODE_TO_MBC(enc, code, to);
769
0
      code = *next;
770
0
    }
771
0
    else { /* count == 3 */
772
0
      to += ONIGENC_CODE_TO_MBC(enc, code, to);
773
0
      to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
774
0
      code = *next;
775
0
    }
776
0
  }
777
0
      }
778
0
      else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */
779
0
  if ((flags & ONIGENC_CASE_TITLECASE)                                 /* Titlecase needed, */
780
0
      && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
781
    /* already Titlecase, no changes needed */
782
0
  }
783
0
  else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
784
0
    MODIFIED;
785
0
    code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0];
786
0
  }
787
0
      }
788
0
    }
789
0
    to += ONIGENC_CODE_TO_MBC(enc, code, to);
790
    /* switch from titlecase to lowercase for capitalize */
791
0
    if (flags & ONIGENC_CASE_TITLECASE)
792
0
      flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE |
793
0
    ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL);
794
0
  }
795
0
  *flagP = flags;
796
0
  return (int )(to - to_start);
797
0
}
798
#endif
799
800
#if 0
801
const char onigenc_unicode_version_string[] =
802
#ifdef ONIG_UNICODE_VERSION_STRING
803
    ONIG_UNICODE_VERSION_STRING
804
#endif
805
    "";
806
807
const int onigenc_unicode_version_number[3] = {
808
#ifdef ONIG_UNICODE_VERSION_MAJOR
809
    ONIG_UNICODE_VERSION_MAJOR,
810
    ONIG_UNICODE_VERSION_MINOR,
811
    ONIG_UNICODE_VERSION_TEENY,
812
#else
813
    0
814
#endif
815
};
816
#endif