Coverage Report

Created: 2025-07-11 06:57

/src/quickjs/libunicode.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Unicode utilities
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <string.h>
28
#include <assert.h>
29
30
#include "cutils.h"
31
#include "libunicode.h"
32
#include "libunicode-table.h"
33
34
enum {
35
    RUN_TYPE_U,
36
    RUN_TYPE_L,
37
    RUN_TYPE_UF,
38
    RUN_TYPE_LF,
39
    RUN_TYPE_UL,
40
    RUN_TYPE_LSU,
41
    RUN_TYPE_U2L_399_EXT2,
42
    RUN_TYPE_UF_D20,
43
    RUN_TYPE_UF_D1_EXT,
44
    RUN_TYPE_U_EXT,
45
    RUN_TYPE_LF_EXT,
46
    RUN_TYPE_UF_EXT2,
47
    RUN_TYPE_LF_EXT2,
48
    RUN_TYPE_UF_EXT3,
49
};
50
51
static int lre_case_conv1(uint32_t c, int conv_type)
52
0
{
53
0
    uint32_t res[LRE_CC_RES_LEN_MAX];
54
0
    lre_case_conv(res, c, conv_type);
55
0
    return res[0];
56
0
}
57
58
/* case conversion using the table entry 'idx' with value 'v' */
59
static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
60
896k
{
61
896k
    uint32_t code, data, type, a, is_lower;
62
896k
    is_lower = (conv_type != 0);
63
896k
    type = (v >> (32 - 17 - 7 - 4)) & 0xf;
64
896k
    data = ((v & 0xf) << 8) | case_conv_table2[idx];
65
896k
    code = v >> (32 - 17);
66
896k
    switch(type) {
67
490k
    case RUN_TYPE_U:
68
491k
    case RUN_TYPE_L:
69
510k
    case RUN_TYPE_UF:
70
578k
    case RUN_TYPE_LF:
71
578k
        if (conv_type == (type & 1) ||
72
578k
            (type >= RUN_TYPE_UF && conv_type == 2)) {
73
509k
            c = c - code + (case_conv_table1[data] >> (32 - 17));
74
509k
        }
75
578k
        break;
76
261k
    case RUN_TYPE_UL:
77
261k
        a = c - code;
78
261k
        if ((a & 1) != (1 - is_lower))
79
278
            break;
80
261k
        c = (a ^ 1) + code;
81
261k
        break;
82
9.44k
    case RUN_TYPE_LSU:
83
9.44k
        a = c - code;
84
9.44k
        if (a == 1) {
85
4.59k
            c += 2 * is_lower - 1;
86
4.85k
        } else if (a == (1 - is_lower) * 2) {
87
4.57k
            c += (2 * is_lower - 1) * 2;
88
4.57k
        }
89
9.44k
        break;
90
6.51k
    case RUN_TYPE_U2L_399_EXT2:
91
6.51k
        if (!is_lower) {
92
6.51k
            res[0] = c - code + case_conv_ext[data >> 6];
93
6.51k
            res[1] = 0x399;
94
6.51k
            return 2;
95
6.51k
        } else {
96
0
            c = c - code + case_conv_ext[data & 0x3f];
97
0
        }
98
0
        break;
99
12.7k
    case RUN_TYPE_UF_D20:
100
12.7k
        if (conv_type == 1)
101
0
            break;
102
12.7k
        c = data + (conv_type == 2) * 0x20;
103
12.7k
        break;
104
738
    case RUN_TYPE_UF_D1_EXT:
105
738
        if (conv_type == 1)
106
0
            break;
107
738
        c = case_conv_ext[data] + (conv_type == 2);
108
738
        break;
109
1.18k
    case RUN_TYPE_U_EXT:
110
1.25k
    case RUN_TYPE_LF_EXT:
111
1.25k
        if (is_lower != (type - RUN_TYPE_U_EXT))
112
73
            break;
113
1.18k
        c = case_conv_ext[data];
114
1.18k
        break;
115
212
    case RUN_TYPE_LF_EXT2:
116
212
        if (!is_lower)
117
212
            break;
118
0
        res[0] = c - code + case_conv_ext[data >> 6];
119
0
        res[1] = case_conv_ext[data & 0x3f];
120
0
        return 2;
121
20.3k
    case RUN_TYPE_UF_EXT2:
122
20.3k
        if (conv_type == 1)
123
0
            break;
124
20.3k
        res[0] = c - code + case_conv_ext[data >> 6];
125
20.3k
        res[1] = case_conv_ext[data & 0x3f];
126
20.3k
        if (conv_type == 2) {
127
            /* convert to lower */
128
0
            res[0] = lre_case_conv1(res[0], 1);
129
0
            res[1] = lre_case_conv1(res[1], 1);
130
0
        }
131
20.3k
        return 2;
132
0
    default:
133
5.15k
    case RUN_TYPE_UF_EXT3:
134
5.15k
        if (conv_type == 1)
135
0
            break;
136
5.15k
        res[0] = case_conv_ext[data >> 8];
137
5.15k
        res[1] = case_conv_ext[(data >> 4) & 0xf];
138
5.15k
        res[2] = case_conv_ext[data & 0xf];
139
5.15k
        if (conv_type == 2) {
140
            /* convert to lower */
141
0
            res[0] = lre_case_conv1(res[0], 1);
142
0
            res[1] = lre_case_conv1(res[1], 1);
143
0
            res[2] = lre_case_conv1(res[2], 1);
144
0
        }
145
5.15k
        return 3;
146
896k
    }
147
864k
    res[0] = c;
148
864k
    return 1;
149
896k
}
150
151
/* conv_type:
152
   0 = to upper
153
   1 = to lower
154
   2 = case folding (= to lower with modifications)
155
*/
156
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
157
0
{
158
0
    if (c < 128) {
159
0
        if (conv_type) {
160
0
            if (c >= 'A' && c <= 'Z') {
161
0
                c = c - 'A' + 'a';
162
0
            }
163
0
        } else {
164
0
            if (c >= 'a' && c <= 'z') {
165
0
                c = c - 'a' + 'A';
166
0
            }
167
0
        }
168
0
    } else {
169
0
        uint32_t v, code, len;
170
0
        int idx, idx_min, idx_max;
171
172
0
        idx_min = 0;
173
0
        idx_max = countof(case_conv_table1) - 1;
174
0
        while (idx_min <= idx_max) {
175
0
            idx = (unsigned)(idx_max + idx_min) / 2;
176
0
            v = case_conv_table1[idx];
177
0
            code = v >> (32 - 17);
178
0
            len = (v >> (32 - 17 - 7)) & 0x7f;
179
0
            if (c < code) {
180
0
                idx_max = idx - 1;
181
0
            } else if (c >= code + len) {
182
0
                idx_min = idx + 1;
183
0
            } else {
184
0
                return lre_case_conv_entry(res, c, conv_type, idx, v);
185
0
            }
186
0
        }
187
0
    }
188
0
    res[0] = c;
189
0
    return 1;
190
0
}
191
192
static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
193
932k
{
194
932k
    uint32_t res[LRE_CC_RES_LEN_MAX];
195
932k
    int len;
196
197
932k
    if (is_unicode) {
198
0
        len = lre_case_conv_entry(res, c, 2, idx, v);
199
0
        if (len == 1) {
200
0
            c = res[0];
201
0
        } else {
202
            /* handle the few specific multi-character cases (see
203
               unicode_gen.c:dump_case_folding_special_cases()) */
204
0
            if (c == 0xfb06) {
205
0
                c = 0xfb05;
206
0
            } else if (c == 0x01fd3) {
207
0
                c = 0x390;
208
0
            } else if (c == 0x01fe3) {
209
0
                c = 0x3b0;
210
0
            }
211
0
        }
212
932k
    } else {
213
932k
        if (likely(c < 128)) {
214
36.6k
            if (c >= 'a' && c <= 'z')
215
36.6k
                c = c - 'a' + 'A';
216
896k
        } else {
217
            /* legacy regexp: to upper case if single char >= 128 */
218
896k
            len = lre_case_conv_entry(res, c, FALSE, idx, v);
219
896k
            if (len == 1 && res[0] >= 128)
220
861k
                c = res[0];
221
896k
        }
222
932k
    }
223
932k
    return c;
224
932k
}
225
226
/* JS regexp specific rules for case folding */
227
int lre_canonicalize(uint32_t c, BOOL is_unicode)
228
10.8M
{
229
10.8M
    if (c < 128) {
230
        /* fast case */
231
10.1M
        if (is_unicode) {
232
0
            if (c >= 'A' && c <= 'Z') {
233
0
                c = c - 'A' + 'a';
234
0
            }
235
10.1M
        } else {
236
10.1M
            if (c >= 'a' && c <= 'z') {
237
2.55M
                c = c - 'a' + 'A';
238
2.55M
            }
239
10.1M
        }
240
10.1M
    } else {
241
667k
        uint32_t v, code, len;
242
667k
        int idx, idx_min, idx_max;
243
244
667k
        idx_min = 0;
245
667k
        idx_max = countof(case_conv_table1) - 1;
246
5.74M
        while (idx_min <= idx_max) {
247
5.42M
            idx = (unsigned)(idx_max + idx_min) / 2;
248
5.42M
            v = case_conv_table1[idx];
249
5.42M
            code = v >> (32 - 17);
250
5.42M
            len = (v >> (32 - 17 - 7)) & 0x7f;
251
5.42M
            if (c < code) {
252
4.03M
                idx_max = idx - 1;
253
4.03M
            } else if (c >= code + len) {
254
1.03M
                idx_min = idx + 1;
255
1.03M
            } else {
256
351k
                return lre_case_folding_entry(c, idx, v, is_unicode);
257
351k
            }
258
5.42M
        }
259
667k
    }
260
10.4M
    return c;
261
10.8M
}
262
263
static uint32_t get_le24(const uint8_t *ptr)
264
92.2k
{
265
92.2k
    return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
266
92.2k
}
267
268
10.4k
#define UNICODE_INDEX_BLOCK_LEN 32
269
270
/* return -1 if not in table, otherwise the offset in the block */
271
static int get_index_pos(uint32_t *pcode, uint32_t c,
272
                         const uint8_t *index_table, int index_table_len)
273
12.2k
{
274
12.2k
    uint32_t code, v;
275
12.2k
    int idx_min, idx_max, idx;
276
277
12.2k
    idx_min = 0;
278
12.2k
    v = get_le24(index_table);
279
12.2k
    code = v & ((1 << 21) - 1);
280
12.2k
    if (c < code) {
281
1.51k
        *pcode = 0;
282
1.51k
        return 0;
283
1.51k
    }
284
10.7k
    idx_max = index_table_len - 1;
285
10.7k
    code = get_le24(index_table + idx_max * 3);
286
10.7k
    if (c >= code)
287
312
        return -1;
288
    /* invariant: tab[idx_min] <= c < tab2[idx_max] */
289
69.2k
    while ((idx_max - idx_min) > 1) {
290
58.7k
        idx = (idx_max + idx_min) / 2;
291
58.7k
        v = get_le24(index_table + idx * 3);
292
58.7k
        code = v & ((1 << 21) - 1);
293
58.7k
        if (c < code) {
294
19.7k
            idx_max = idx;
295
39.0k
        } else {
296
39.0k
            idx_min = idx;
297
39.0k
        }
298
58.7k
    }
299
10.4k
    v = get_le24(index_table + idx_min * 3);
300
10.4k
    *pcode = v & ((1 << 21) - 1);
301
10.4k
    return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
302
10.7k
}
303
304
static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305
                            const uint8_t *index_table, int index_table_len)
306
12.2k
{
307
12.2k
    uint32_t code, b, bit;
308
12.2k
    int pos;
309
12.2k
    const uint8_t *p;
310
311
12.2k
    pos = get_index_pos(&code, c, index_table, index_table_len);
312
12.2k
    if (pos < 0)
313
312
        return FALSE; /* outside the table */
314
11.9k
    p = table + pos;
315
11.9k
    bit = 0;
316
    /* Compressed run length encoding:
317
       00..3F: 2 packed lengths: 3-bit + 3-bit
318
       40..5F: 5-bits plus extra byte for length
319
       60..7F: 5-bits plus 2 extra bytes for length
320
       80..FF: 7-bit length
321
       lengths must be incremented to get character count
322
       Ranges alternate between false and true return value.
323
     */
324
124k
    for(;;) {
325
124k
        b = *p++;
326
124k
        if (b < 64) {
327
31.6k
            code += (b >> 3) + 1;
328
31.6k
            if (c < code)
329
632
                return bit;
330
31.0k
            bit ^= 1;
331
31.0k
            code += (b & 7) + 1;
332
92.8k
        } else if (b >= 0x80) {
333
76.6k
            code += b - 0x80 + 1;
334
76.6k
        } else if (b < 0x60) {
335
8.77k
            code += (((b - 0x40) << 8) | p[0]) + 1;
336
8.77k
            p++;
337
8.77k
        } else {
338
7.38k
            code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
339
7.38k
            p += 2;
340
7.38k
        }
341
123k
        if (c < code)
342
11.3k
            return bit;
343
112k
        bit ^= 1;
344
112k
    }
345
11.9k
}
346
347
BOOL lre_is_cased(uint32_t c)
348
0
{
349
0
    uint32_t v, code, len;
350
0
    int idx, idx_min, idx_max;
351
352
0
    idx_min = 0;
353
0
    idx_max = countof(case_conv_table1) - 1;
354
0
    while (idx_min <= idx_max) {
355
0
        idx = (unsigned)(idx_max + idx_min) / 2;
356
0
        v = case_conv_table1[idx];
357
0
        code = v >> (32 - 17);
358
0
        len = (v >> (32 - 17 - 7)) & 0x7f;
359
0
        if (c < code) {
360
0
            idx_max = idx - 1;
361
0
        } else if (c >= code + len) {
362
0
            idx_min = idx + 1;
363
0
        } else {
364
0
            return TRUE;
365
0
        }
366
0
    }
367
0
    return lre_is_in_table(c, unicode_prop_Cased1_table,
368
0
                           unicode_prop_Cased1_index,
369
0
                           sizeof(unicode_prop_Cased1_index) / 3);
370
0
}
371
372
BOOL lre_is_case_ignorable(uint32_t c)
373
0
{
374
0
    return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
375
0
                           unicode_prop_Case_Ignorable_index,
376
0
                           sizeof(unicode_prop_Case_Ignorable_index) / 3);
377
0
}
378
379
/* character range */
380
381
static __maybe_unused void cr_dump(CharRange *cr)
382
0
{
383
0
    int i;
384
0
    for(i = 0; i < cr->len; i++)
385
0
        printf("%d: 0x%04x\n", i, cr->points[i]);
386
0
}
387
388
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
389
0
{
390
0
    return realloc(ptr, size);
391
0
}
392
393
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
394
94.0k
{
395
94.0k
    cr->len = cr->size = 0;
396
94.0k
    cr->points = NULL;
397
94.0k
    cr->mem_opaque = mem_opaque;
398
94.0k
    cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
399
94.0k
}
400
401
void cr_free(CharRange *cr)
402
146k
{
403
146k
    cr->realloc_func(cr->mem_opaque, cr->points, 0);
404
146k
}
405
406
int cr_realloc(CharRange *cr, int size)
407
1.25M
{
408
1.25M
    int new_size;
409
1.25M
    uint32_t *new_buf;
410
411
1.25M
    if (size > cr->size) {
412
1.18M
        new_size = max_int(size, cr->size * 3 / 2);
413
1.18M
        new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
414
1.18M
                                   new_size * sizeof(cr->points[0]));
415
1.18M
        if (!new_buf)
416
0
            return -1;
417
1.18M
        cr->points = new_buf;
418
1.18M
        cr->size = new_size;
419
1.18M
    }
420
1.25M
    return 0;
421
1.25M
}
422
423
int cr_copy(CharRange *cr, const CharRange *cr1)
424
0
{
425
0
    if (cr_realloc(cr, cr1->len))
426
0
        return -1;
427
0
    memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
428
0
    cr->len = cr1->len;
429
0
    return 0;
430
0
}
431
432
/* merge consecutive intervals and remove empty intervals */
433
static void cr_compress(CharRange *cr)
434
133k
{
435
133k
    int i, j, k, len;
436
133k
    uint32_t *pt;
437
438
133k
    pt = cr->points;
439
133k
    len = cr->len;
440
133k
    i = 0;
441
133k
    j = 0;
442
133k
    k = 0;
443
8.67M
    while ((i + 1) < len) {
444
8.53M
        if (pt[i] == pt[i + 1]) {
445
            /* empty interval */
446
670k
            i += 2;
447
7.86M
        } else {
448
7.86M
            j = i;
449
7.92M
            while ((j + 3) < len && pt[j + 1] == pt[j + 2])
450
61.0k
                j += 2;
451
            /* just copy */
452
7.86M
            pt[k] = pt[i];
453
7.86M
            pt[k + 1] = pt[j + 1];
454
7.86M
            k += 2;
455
7.86M
            i = j + 2;
456
7.86M
        }
457
8.53M
    }
458
133k
    cr->len = k;
459
133k
}
460
461
/* union or intersection */
462
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
463
          const uint32_t *b_pt, int b_len, int op)
464
69.4k
{
465
69.4k
    int a_idx, b_idx, is_in;
466
69.4k
    uint32_t v;
467
468
69.4k
    a_idx = 0;
469
69.4k
    b_idx = 0;
470
22.2M
    for(;;) {
471
        /* get one more point from a or b in increasing order */
472
22.2M
        if (a_idx < a_len && b_idx < b_len) {
473
3.40M
            if (a_pt[a_idx] < b_pt[b_idx]) {
474
2.14M
                goto a_add;
475
2.14M
            } else if (a_pt[a_idx] == b_pt[b_idx]) {
476
977k
                v = a_pt[a_idx];
477
977k
                a_idx++;
478
977k
                b_idx++;
479
977k
            } else {
480
281k
                goto b_add;
481
281k
            }
482
18.8M
        } else if (a_idx < a_len) {
483
20.7M
        a_add:
484
20.7M
            v = a_pt[a_idx++];
485
20.7M
        } else if (b_idx < b_len) {
486
414k
        b_add:
487
414k
            v = b_pt[b_idx++];
488
414k
        } else {
489
69.4k
            break;
490
69.4k
        }
491
        /* add the point if the in/out status changes */
492
22.1M
        switch(op) {
493
6.09M
        case CR_OP_UNION:
494
6.09M
            is_in = (a_idx & 1) | (b_idx & 1);
495
6.09M
            break;
496
16.0M
        case CR_OP_INTER:
497
16.0M
            is_in = (a_idx & 1) & (b_idx & 1);
498
16.0M
            break;
499
0
        case CR_OP_XOR:
500
0
            is_in = (a_idx & 1) ^ (b_idx & 1);
501
0
            break;
502
0
        case CR_OP_SUB:
503
0
            is_in = (a_idx & 1) & ((b_idx & 1) ^ 1);
504
0
            break;
505
0
        default:
506
0
            abort();
507
22.1M
        }
508
22.1M
        if (is_in != (cr->len & 1)) {
509
7.22M
            if (cr_add_point(cr, v))
510
0
                return -1;
511
7.22M
        }
512
22.1M
    }
513
69.4k
    cr_compress(cr);
514
69.4k
    return 0;
515
69.4k
}
516
517
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op)
518
51.9k
{
519
51.9k
    CharRange a = *cr;
520
51.9k
    int ret;
521
51.9k
    cr->len = 0;
522
51.9k
    cr->size = 0;
523
51.9k
    cr->points = NULL;
524
51.9k
    ret = cr_op(cr, a.points, a.len, b_pt, b_len, op);
525
51.9k
    cr_free(&a);
526
51.9k
    return ret;
527
51.9k
}
528
529
int cr_invert(CharRange *cr)
530
64.3k
{
531
64.3k
    int len;
532
64.3k
    len = cr->len;
533
64.3k
    if (cr_realloc(cr, len + 2))
534
0
        return -1;
535
64.3k
    memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
536
64.3k
    cr->points[0] = 0;
537
64.3k
    cr->points[len + 1] = UINT32_MAX;
538
64.3k
    cr->len = len + 2;
539
64.3k
    cr_compress(cr);
540
64.3k
    return 0;
541
64.3k
}
542
543
786k
#define CASE_U (1 << 0)
544
378k
#define CASE_L (1 << 1)
545
378k
#define CASE_F (1 << 2)
546
547
/* use the case conversion table to generate range of characters.
548
   CASE_U: set char if modified by uppercasing,
549
   CASE_L: set char if modified by lowercasing,
550
   CASE_F: set char if modified by case folding,
551
 */
552
static int unicode_case1(CharRange *cr, int case_mask)
553
5.82k
{
554
169k
#define MR(x) (1 << RUN_TYPE_ ## x)
555
5.82k
    const uint32_t tab_run_mask[3] = {
556
5.82k
        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
557
5.82k
        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
558
559
5.82k
        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
560
561
5.82k
        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
562
5.82k
    };
563
5.82k
#undef MR
564
5.82k
    uint32_t mask, v, code, type, len, i, idx;
565
566
5.82k
    if (case_mask == 0)
567
0
        return 0;
568
5.82k
    mask = 0;
569
23.3k
    for(i = 0; i < 3; i++) {
570
17.4k
        if ((case_mask >> i) & 1)
571
5.82k
            mask |= tab_run_mask[i];
572
17.4k
    }
573
2.20M
    for(idx = 0; idx < countof(case_conv_table1); idx++) {
574
2.20M
        v = case_conv_table1[idx];
575
2.20M
        type = (v >> (32 - 17 - 7 - 4)) & 0xf;
576
2.20M
        code = v >> (32 - 17);
577
2.20M
        len = (v >> (32 - 17 - 7)) & 0x7f;
578
2.20M
        if ((mask >> type) & 1) {
579
            //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
580
1.51M
            switch(type) {
581
355k
            case RUN_TYPE_UL:
582
355k
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
583
0
                    goto def_case;
584
355k
                code += ((case_mask & CASE_U) != 0);
585
3.51M
                for(i = 0; i < len; i += 2) {
586
3.15M
                    if (cr_add_interval(cr, code + i, code + i + 1))
587
0
                        return -1;
588
3.15M
                }
589
355k
                break;
590
355k
            case RUN_TYPE_LSU:
591
23.3k
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
592
0
                    goto def_case;
593
23.3k
                if (!(case_mask & CASE_U)) {
594
0
                    if (cr_add_interval(cr, code, code + 1))
595
0
                        return -1;
596
0
                }
597
23.3k
                if (cr_add_interval(cr, code + 1, code + 2))
598
0
                    return -1;
599
23.3k
                if (case_mask & CASE_U) {
600
23.3k
                    if (cr_add_interval(cr, code + 2, code + 3))
601
0
                        return -1;
602
23.3k
                }
603
23.3k
                break;
604
1.13M
            default:
605
1.13M
            def_case:
606
1.13M
                if (cr_add_interval(cr, code, code + len))
607
0
                    return -1;
608
1.13M
                break;
609
1.51M
            }
610
1.51M
        }
611
2.20M
    }
612
5.82k
    return 0;
613
5.82k
}
614
615
static int point_cmp(const void *p1, const void *p2, void *arg)
616
2.65M
{
617
2.65M
    uint32_t v1 = *(uint32_t *)p1;
618
2.65M
    uint32_t v2 = *(uint32_t *)p2;
619
2.65M
    return (v1 > v2) - (v1 < v2);
620
2.65M
}
621
622
static void cr_sort_and_remove_overlap(CharRange *cr)
623
5.82k
{
624
5.82k
    uint32_t start, end, start1, end1, i, j;
625
626
    /* the resulting ranges are not necessarily sorted and may overlap */
627
5.82k
    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
628
5.82k
    j = 0;
629
310k
    for(i = 0; i < cr->len; ) {
630
304k
        start = cr->points[i];
631
304k
        end = cr->points[i + 1];
632
304k
        i += 2;
633
356k
        while (i < cr->len) {
634
354k
            start1 = cr->points[i];
635
354k
            end1 = cr->points[i + 1];
636
354k
            if (start1 > end) {
637
                /* |------|
638
                 *           |-------| */
639
301k
                break;
640
301k
            } else if (end1 <= end) {
641
                /* |------|
642
                 *    |--| */
643
14.7k
                i += 2;
644
37.3k
            } else {
645
                /* |------|
646
                 *     |-------| */
647
37.3k
                end = end1;
648
37.3k
                i += 2;
649
37.3k
            }
650
354k
        }
651
304k
        cr->points[j] = start;
652
304k
        cr->points[j + 1] = end;
653
304k
        j += 2;
654
304k
    }
655
5.82k
    cr->len = j;
656
5.82k
}
657
658
/* canonicalize a character set using the JS regex case folding rules
659
   (see lre_canonicalize()) */
660
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
661
5.82k
{
662
5.82k
    CharRange cr_inter, cr_mask, cr_result, cr_sub;
663
5.82k
    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
664
665
5.82k
    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
666
5.82k
    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
667
5.82k
    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
668
5.82k
    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
669
670
5.82k
    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
671
0
        goto fail;
672
5.82k
    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
673
0
        goto fail;
674
675
5.82k
    if (cr_invert(&cr_mask))
676
0
        goto fail;
677
5.82k
    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
678
0
        goto fail;
679
680
    /* cr_inter = cr & cr_mask */
681
    /* cr_sub = cr & ~cr_mask */
682
683
    /* use the case conversion table to compute the result */
684
5.82k
    d_start = -1;
685
5.82k
    d_end = -1;
686
5.82k
    idx = 0;
687
5.82k
    v = case_conv_table1[idx];
688
5.82k
    code = v >> (32 - 17);
689
5.82k
    len = (v >> (32 - 17 - 7)) & 0x7f;
690
312k
    for(i = 0; i < cr_inter.len; i += 2) {
691
306k
        start = cr_inter.points[i];
692
306k
        end = cr_inter.points[i + 1];
693
694
887k
        for(c = start; c < end; c++) {
695
834k
            for(;;) {
696
834k
                if (c >= code && c < code + len)
697
581k
                    break;
698
252k
                idx++;
699
252k
                assert(idx < countof(case_conv_table1));
700
252k
                v = case_conv_table1[idx];
701
252k
                code = v >> (32 - 17);
702
252k
                len = (v >> (32 - 17 - 7)) & 0x7f;
703
252k
            }
704
581k
            d = lre_case_folding_entry(c, idx, v, is_unicode);
705
            /* try to merge with the current interval */
706
581k
            if (d_start == -1) {
707
2.39k
                d_start = d;
708
2.39k
                d_end = d + 1;
709
578k
            } else if (d_end == d) {
710
224k
                d_end++;
711
354k
            } else {
712
354k
                cr_add_interval(&cr_result, d_start, d_end);
713
354k
                d_start = d;
714
354k
                d_end = d + 1;
715
354k
            }
716
581k
        }
717
306k
    }
718
5.82k
    if (d_start != -1) {
719
2.39k
        if (cr_add_interval(&cr_result, d_start, d_end))
720
0
            goto fail;
721
2.39k
    }
722
723
    /* the resulting ranges are not necessarily sorted and may overlap */
724
5.82k
    cr_sort_and_remove_overlap(&cr_result);
725
726
    /* or with the character not affected by the case folding */
727
5.82k
    cr->len = 0;
728
5.82k
    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
729
0
        goto fail;
730
731
5.82k
    cr_free(&cr_inter);
732
5.82k
    cr_free(&cr_mask);
733
5.82k
    cr_free(&cr_result);
734
5.82k
    cr_free(&cr_sub);
735
5.82k
    return 0;
736
0
 fail:
737
0
    cr_free(&cr_inter);
738
0
    cr_free(&cr_mask);
739
0
    cr_free(&cr_result);
740
0
    cr_free(&cr_sub);
741
0
    return -1;
742
5.82k
}
743
744
#ifdef CONFIG_ALL_UNICODE
745
746
BOOL lre_is_id_start(uint32_t c)
747
10.7k
{
748
10.7k
    return lre_is_in_table(c, unicode_prop_ID_Start_table,
749
10.7k
                           unicode_prop_ID_Start_index,
750
10.7k
                           sizeof(unicode_prop_ID_Start_index) / 3);
751
10.7k
}
752
753
BOOL lre_is_id_continue(uint32_t c)
754
3.96k
{
755
3.96k
    return lre_is_id_start(c) ||
756
3.96k
        lre_is_in_table(c, unicode_prop_ID_Continue1_table,
757
1.50k
                        unicode_prop_ID_Continue1_index,
758
1.50k
                        sizeof(unicode_prop_ID_Continue1_index) / 3);
759
3.96k
}
760
761
#define UNICODE_DECOMP_LEN_MAX 18
762
763
typedef enum {
764
    DECOMP_TYPE_C1, /* 16 bit char */
765
    DECOMP_TYPE_L1, /* 16 bit char table */
766
    DECOMP_TYPE_L2,
767
    DECOMP_TYPE_L3,
768
    DECOMP_TYPE_L4,
769
    DECOMP_TYPE_L5, /* XXX: not used */
770
    DECOMP_TYPE_L6, /* XXX: could remove */
771
    DECOMP_TYPE_L7, /* XXX: could remove */
772
    DECOMP_TYPE_LL1, /* 18 bit char table */
773
    DECOMP_TYPE_LL2,
774
    DECOMP_TYPE_S1, /* 8 bit char table */
775
    DECOMP_TYPE_S2,
776
    DECOMP_TYPE_S3,
777
    DECOMP_TYPE_S4,
778
    DECOMP_TYPE_S5,
779
    DECOMP_TYPE_I1, /* increment 16 bit char value */
780
    DECOMP_TYPE_I2_0,
781
    DECOMP_TYPE_I2_1,
782
    DECOMP_TYPE_I3_1,
783
    DECOMP_TYPE_I3_2,
784
    DECOMP_TYPE_I4_1,
785
    DECOMP_TYPE_I4_2,
786
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
787
    DECOMP_TYPE_B2,
788
    DECOMP_TYPE_B3,
789
    DECOMP_TYPE_B4,
790
    DECOMP_TYPE_B5,
791
    DECOMP_TYPE_B6,
792
    DECOMP_TYPE_B7,
793
    DECOMP_TYPE_B8,
794
    DECOMP_TYPE_B18,
795
    DECOMP_TYPE_LS2,
796
    DECOMP_TYPE_PAT3,
797
    DECOMP_TYPE_S2_UL,
798
    DECOMP_TYPE_LS2_UL,
799
} DecompTypeEnum;
800
801
static uint32_t unicode_get_short_code(uint32_t c)
802
0
{
803
0
    static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
804
805
0
    if (c < 0x80)
806
0
        return c;
807
0
    else if (c < 0x80 + 0x50)
808
0
        return c - 0x80 + 0x300;
809
0
    else
810
0
        return unicode_short_table[c - 0x80 - 0x50];
811
0
}
812
813
static uint32_t unicode_get_lower_simple(uint32_t c)
814
0
{
815
0
    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
816
0
        c += 0x20;
817
0
    else
818
0
        c++;
819
0
    return c;
820
0
}
821
822
static uint16_t unicode_get16(const uint8_t *p)
823
0
{
824
0
    return p[0] | (p[1] << 8);
825
0
}
826
827
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
828
                                int idx, uint32_t code, uint32_t len,
829
                                uint32_t type)
830
0
{
831
0
    uint32_t c1;
832
0
    int l, i, p;
833
0
    const uint8_t *d;
834
835
0
    if (type == DECOMP_TYPE_C1) {
836
0
        res[0] = unicode_decomp_table2[idx];
837
0
        return 1;
838
0
    } else {
839
0
        d = unicode_decomp_data + unicode_decomp_table2[idx];
840
0
        switch(type) {
841
0
        case DECOMP_TYPE_L1:
842
0
        case DECOMP_TYPE_L2:
843
0
        case DECOMP_TYPE_L3:
844
0
        case DECOMP_TYPE_L4:
845
0
        case DECOMP_TYPE_L5:
846
0
        case DECOMP_TYPE_L6:
847
0
        case DECOMP_TYPE_L7:
848
0
            l = type - DECOMP_TYPE_L1 + 1;
849
0
            d += (c - code) * l * 2;
850
0
            for(i = 0; i < l; i++) {
851
0
                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
852
0
                    return 0;
853
0
            }
854
0
            return l;
855
0
        case DECOMP_TYPE_LL1:
856
0
        case DECOMP_TYPE_LL2:
857
0
            {
858
0
                uint32_t k, p;
859
0
                l = type - DECOMP_TYPE_LL1 + 1;
860
0
                k = (c - code) * l;
861
0
                p = len * l * 2;
862
0
                for(i = 0; i < l; i++) {
863
0
                    c1 = unicode_get16(d + 2 * k) |
864
0
                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
865
0
                    if (!c1)
866
0
                        return 0;
867
0
                    res[i] = c1;
868
0
                    k++;
869
0
                }
870
0
            }
871
0
            return l;
872
0
        case DECOMP_TYPE_S1:
873
0
        case DECOMP_TYPE_S2:
874
0
        case DECOMP_TYPE_S3:
875
0
        case DECOMP_TYPE_S4:
876
0
        case DECOMP_TYPE_S5:
877
0
            l = type - DECOMP_TYPE_S1 + 1;
878
0
            d += (c - code) * l;
879
0
            for(i = 0; i < l; i++) {
880
0
                if ((res[i] = unicode_get_short_code(d[i])) == 0)
881
0
                    return 0;
882
0
            }
883
0
            return l;
884
0
        case DECOMP_TYPE_I1:
885
0
            l = 1;
886
0
            p = 0;
887
0
            goto decomp_type_i;
888
0
        case DECOMP_TYPE_I2_0:
889
0
        case DECOMP_TYPE_I2_1:
890
0
        case DECOMP_TYPE_I3_1:
891
0
        case DECOMP_TYPE_I3_2:
892
0
        case DECOMP_TYPE_I4_1:
893
0
        case DECOMP_TYPE_I4_2:
894
0
            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
895
0
            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
896
0
        decomp_type_i:
897
0
            for(i = 0; i < l; i++) {
898
0
                c1 = unicode_get16(d + 2 * i);
899
0
                if (i == p)
900
0
                    c1 += c - code;
901
0
                res[i] = c1;
902
0
            }
903
0
            return l;
904
0
        case DECOMP_TYPE_B18:
905
0
            l = 18;
906
0
            goto decomp_type_b;
907
0
        case DECOMP_TYPE_B1:
908
0
        case DECOMP_TYPE_B2:
909
0
        case DECOMP_TYPE_B3:
910
0
        case DECOMP_TYPE_B4:
911
0
        case DECOMP_TYPE_B5:
912
0
        case DECOMP_TYPE_B6:
913
0
        case DECOMP_TYPE_B7:
914
0
        case DECOMP_TYPE_B8:
915
0
            l = type - DECOMP_TYPE_B1 + 1;
916
0
        decomp_type_b:
917
0
            {
918
0
                uint32_t c_min;
919
0
                c_min = unicode_get16(d);
920
0
                d += 2 + (c - code) * l;
921
0
                for(i = 0; i < l; i++) {
922
0
                    c1 = d[i];
923
0
                    if (c1 == 0xff)
924
0
                        c1 = 0x20;
925
0
                    else
926
0
                        c1 += c_min;
927
0
                    res[i] = c1;
928
0
                }
929
0
            }
930
0
            return l;
931
0
        case DECOMP_TYPE_LS2:
932
0
            d += (c - code) * 3;
933
0
            if (!(res[0] = unicode_get16(d)))
934
0
                return 0;
935
0
            res[1] = unicode_get_short_code(d[2]);
936
0
            return 2;
937
0
        case DECOMP_TYPE_PAT3:
938
0
            res[0] = unicode_get16(d);
939
0
            res[2] = unicode_get16(d + 2);
940
0
            d += 4 + (c - code) * 2;
941
0
            res[1] = unicode_get16(d);
942
0
            return 3;
943
0
        case DECOMP_TYPE_S2_UL:
944
0
        case DECOMP_TYPE_LS2_UL:
945
0
            c1 = c - code;
946
0
            if (type == DECOMP_TYPE_S2_UL) {
947
0
                d += c1 & ~1;
948
0
                c = unicode_get_short_code(*d);
949
0
                d++;
950
0
            } else {
951
0
                d += (c1 >> 1) * 3;
952
0
                c = unicode_get16(d);
953
0
                d += 2;
954
0
            }
955
0
            if (c1 & 1)
956
0
                c = unicode_get_lower_simple(c);
957
0
            res[0] = c;
958
0
            res[1] = unicode_get_short_code(*d);
959
0
            return 2;
960
0
        }
961
0
    }
962
0
    return 0;
963
0
}
964
965
966
/* return the length of the decomposition (length <=
967
   UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
968
static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
969
0
{
970
0
    uint32_t v, type, is_compat, code, len;
971
0
    int idx_min, idx_max, idx;
972
973
0
    idx_min = 0;
974
0
    idx_max = countof(unicode_decomp_table1) - 1;
975
0
    while (idx_min <= idx_max) {
976
0
        idx = (idx_max + idx_min) / 2;
977
0
        v = unicode_decomp_table1[idx];
978
0
        code = v >> (32 - 18);
979
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
980
        //        printf("idx=%d code=%05x len=%d\n", idx, code, len);
981
0
        if (c < code) {
982
0
            idx_max = idx - 1;
983
0
        } else if (c >= code + len) {
984
0
            idx_min = idx + 1;
985
0
        } else {
986
0
            is_compat = v & 1;
987
0
            if (is_compat1 < is_compat)
988
0
                break;
989
0
            type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
990
0
            return unicode_decomp_entry(res, c, idx, code, len, type);
991
0
        }
992
0
    }
993
0
    return 0;
994
0
}
995
996
/* return 0 if no pair found */
997
static int unicode_compose_pair(uint32_t c0, uint32_t c1)
998
0
{
999
0
    uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
1000
0
    int idx_min, idx_max, idx, d;
1001
0
    uint32_t pair[2];
1002
1003
0
    idx_min = 0;
1004
0
    idx_max = countof(unicode_comp_table) - 1;
1005
0
    while (idx_min <= idx_max) {
1006
0
        idx = (idx_max + idx_min) / 2;
1007
0
        idx1 = unicode_comp_table[idx];
1008
1009
        /* idx1 represent an entry of the decomposition table */
1010
0
        d_idx = idx1 >> 6;
1011
0
        d_offset = idx1 & 0x3f;
1012
0
        v = unicode_decomp_table1[d_idx];
1013
0
        code = v >> (32 - 18);
1014
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
1015
0
        type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
1016
0
        ch = code + d_offset;
1017
0
        unicode_decomp_entry(pair, ch, d_idx, code, len, type);
1018
0
        d = c0 - pair[0];
1019
0
        if (d == 0)
1020
0
            d = c1 - pair[1];
1021
0
        if (d < 0) {
1022
0
            idx_max = idx - 1;
1023
0
        } else if (d > 0) {
1024
0
            idx_min = idx + 1;
1025
0
        } else {
1026
0
            return ch;
1027
0
        }
1028
0
    }
1029
0
    return 0;
1030
0
}
1031
1032
/* return the combining class of character c (between 0 and 255) */
1033
static int unicode_get_cc(uint32_t c)
1034
0
{
1035
0
    uint32_t code, n, type, cc, c1, b;
1036
0
    int pos;
1037
0
    const uint8_t *p;
1038
1039
0
    pos = get_index_pos(&code, c,
1040
0
                        unicode_cc_index, sizeof(unicode_cc_index) / 3);
1041
0
    if (pos < 0)
1042
0
        return 0;
1043
0
    p = unicode_cc_table + pos;
1044
    /* Compressed run length encoding:
1045
       - 2 high order bits are combining class type
1046
       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
1047
       - 00..2F: range length (add 1)
1048
       - 30..37: 3-bit range-length + 1 extra byte
1049
       - 38..3F: 3-bit range-length + 2 extra byte
1050
     */
1051
0
    for(;;) {
1052
0
        b = *p++;
1053
0
        type = b >> 6;
1054
0
        n = b & 0x3f;
1055
0
        if (n < 48) {
1056
0
        } else if (n < 56) {
1057
0
            n = (n - 48) << 8;
1058
0
            n |= *p++;
1059
0
            n += 48;
1060
0
        } else {
1061
0
            n = (n - 56) << 8;
1062
0
            n |= *p++ << 8;
1063
0
            n |= *p++;
1064
0
            n += 48 + (1 << 11);
1065
0
        }
1066
0
        if (type <= 1)
1067
0
            p++;
1068
0
        c1 = code + n + 1;
1069
0
        if (c < c1) {
1070
0
            switch(type) {
1071
0
            case 0:
1072
0
                cc = p[-1];
1073
0
                break;
1074
0
            case 1:
1075
0
                cc = p[-1] + c - code;
1076
0
                break;
1077
0
            case 2:
1078
0
                cc = 0;
1079
0
                break;
1080
0
            default:
1081
0
            case 3:
1082
0
                cc = 230;
1083
0
                break;
1084
0
            }
1085
0
            return cc;
1086
0
        }
1087
0
        code = c1;
1088
0
    }
1089
0
}
1090
1091
static void sort_cc(int *buf, int len)
1092
0
{
1093
0
    int i, j, k, cc, cc1, start, ch1;
1094
1095
0
    for(i = 0; i < len; i++) {
1096
0
        cc = unicode_get_cc(buf[i]);
1097
0
        if (cc != 0) {
1098
0
            start = i;
1099
0
            j = i + 1;
1100
0
            while (j < len) {
1101
0
                ch1 = buf[j];
1102
0
                cc1 = unicode_get_cc(ch1);
1103
0
                if (cc1 == 0)
1104
0
                    break;
1105
0
                k = j - 1;
1106
0
                while (k >= start) {
1107
0
                    if (unicode_get_cc(buf[k]) <= cc1)
1108
0
                        break;
1109
0
                    buf[k + 1] = buf[k];
1110
0
                    k--;
1111
0
                }
1112
0
                buf[k + 1] = ch1;
1113
0
                j++;
1114
0
            }
1115
#if 0
1116
            printf("cc:");
1117
            for(k = start; k < j; k++) {
1118
                printf(" %3d", unicode_get_cc(buf[k]));
1119
            }
1120
            printf("\n");
1121
#endif
1122
0
            i = j;
1123
0
        }
1124
0
    }
1125
0
}
1126
1127
static void to_nfd_rec(DynBuf *dbuf,
1128
                       const int *src, int src_len, int is_compat)
1129
0
{
1130
0
    uint32_t c, v;
1131
0
    int i, l;
1132
0
    uint32_t res[UNICODE_DECOMP_LEN_MAX];
1133
1134
0
    for(i = 0; i < src_len; i++) {
1135
0
        c = src[i];
1136
0
        if (c >= 0xac00 && c < 0xd7a4) {
1137
            /* Hangul decomposition */
1138
0
            c -= 0xac00;
1139
0
            dbuf_put_u32(dbuf, 0x1100 + c / 588);
1140
0
            dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
1141
0
            v = c % 28;
1142
0
            if (v != 0)
1143
0
                dbuf_put_u32(dbuf, 0x11a7 + v);
1144
0
        } else {
1145
0
            l = unicode_decomp_char(res, c, is_compat);
1146
0
            if (l) {
1147
0
                to_nfd_rec(dbuf, (int *)res, l, is_compat);
1148
0
            } else {
1149
0
                dbuf_put_u32(dbuf, c);
1150
0
            }
1151
0
        }
1152
0
    }
1153
0
}
1154
1155
/* return 0 if not found */
1156
static int compose_pair(uint32_t c0, uint32_t c1)
1157
0
{
1158
    /* Hangul composition */
1159
0
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
1160
0
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
1161
0
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
1162
0
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
1163
0
               (c0 - 0xac00) % 28 == 0 &&
1164
0
               c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
1165
0
        return c0 + c1 - 0x11a7;
1166
0
    } else {
1167
0
        return unicode_compose_pair(c0, c1);
1168
0
    }
1169
0
}
1170
1171
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
1172
                      UnicodeNormalizationEnum n_type,
1173
                      void *opaque, DynBufReallocFunc *realloc_func)
1174
0
{
1175
0
    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
1176
0
    BOOL is_compat;
1177
0
    DynBuf dbuf_s, *dbuf = &dbuf_s;
1178
1179
0
    is_compat = n_type >> 1;
1180
1181
0
    dbuf_init2(dbuf, opaque, realloc_func);
1182
0
    if (dbuf_realloc(dbuf, sizeof(int) * src_len))
1183
0
        goto fail;
1184
1185
    /* common case: latin1 is unaffected by NFC */
1186
0
    if (n_type == UNICODE_NFC) {
1187
0
        for(i = 0; i < src_len; i++) {
1188
0
            if (src[i] >= 0x100)
1189
0
                goto not_latin1;
1190
0
        }
1191
0
        buf = (int *)dbuf->buf;
1192
0
        memcpy(buf, src, src_len * sizeof(int));
1193
0
        *pdst = (uint32_t *)buf;
1194
0
        return src_len;
1195
0
    not_latin1: ;
1196
0
    }
1197
1198
0
    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
1199
0
    if (dbuf_error(dbuf)) {
1200
0
    fail:
1201
0
        *pdst = NULL;
1202
0
        return -1;
1203
0
    }
1204
0
    buf = (int *)dbuf->buf;
1205
0
    buf_len = dbuf->size / sizeof(int);
1206
1207
0
    sort_cc(buf, buf_len);
1208
1209
0
    if (buf_len <= 1 || (n_type & 1) != 0) {
1210
        /* NFD / NFKD */
1211
0
        *pdst = (uint32_t *)buf;
1212
0
        return buf_len;
1213
0
    }
1214
1215
0
    i = 1;
1216
0
    out_len = 1;
1217
0
    while (i < buf_len) {
1218
        /* find the starter character and test if it is blocked from
1219
           the character at 'i' */
1220
0
        last_cc = unicode_get_cc(buf[i]);
1221
0
        starter_pos = out_len - 1;
1222
0
        while (starter_pos >= 0) {
1223
0
            cc = unicode_get_cc(buf[starter_pos]);
1224
0
            if (cc == 0)
1225
0
                break;
1226
0
            if (cc >= last_cc)
1227
0
                goto next;
1228
0
            last_cc = 256;
1229
0
            starter_pos--;
1230
0
        }
1231
0
        if (starter_pos >= 0 &&
1232
0
            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
1233
0
            buf[starter_pos] = p;
1234
0
            i++;
1235
0
        } else {
1236
0
        next:
1237
0
            buf[out_len++] = buf[i++];
1238
0
        }
1239
0
    }
1240
0
    *pdst = (uint32_t *)buf;
1241
0
    return out_len;
1242
0
}
1243
1244
/* char ranges for various unicode properties */
1245
1246
static int unicode_find_name(const char *name_table, const char *name)
1247
0
{
1248
0
    const char *p, *r;
1249
0
    int pos;
1250
0
    size_t name_len, len;
1251
1252
0
    p = name_table;
1253
0
    pos = 0;
1254
0
    name_len = strlen(name);
1255
0
    while (*p) {
1256
0
        for(;;) {
1257
0
            r = strchr(p, ',');
1258
0
            if (!r)
1259
0
                len = strlen(p);
1260
0
            else
1261
0
                len = r - p;
1262
0
            if (len == name_len && !memcmp(p, name, name_len))
1263
0
                return pos;
1264
0
            p += len + 1;
1265
0
            if (!r)
1266
0
                break;
1267
0
        }
1268
0
        pos++;
1269
0
    }
1270
0
    return -1;
1271
0
}
1272
1273
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1274
   if not found */
1275
int unicode_script(CharRange *cr,
1276
                   const char *script_name, BOOL is_ext)
1277
0
{
1278
0
    int script_idx;
1279
0
    const uint8_t *p, *p_end;
1280
0
    uint32_t c, c1, b, n, v, v_len, i, type;
1281
0
    CharRange cr1_s, *cr1;
1282
0
    CharRange cr2_s, *cr2 = &cr2_s;
1283
0
    BOOL is_common;
1284
1285
0
    script_idx = unicode_find_name(unicode_script_name_table, script_name);
1286
0
    if (script_idx < 0)
1287
0
        return -2;
1288
1289
0
    is_common = (script_idx == UNICODE_SCRIPT_Common ||
1290
0
                 script_idx == UNICODE_SCRIPT_Inherited);
1291
0
    if (is_ext) {
1292
0
        cr1 = &cr1_s;
1293
0
        cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1294
0
        cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1295
0
    } else {
1296
0
        cr1 = cr;
1297
0
    }
1298
1299
0
    p = unicode_script_table;
1300
0
    p_end = unicode_script_table + countof(unicode_script_table);
1301
0
    c = 0;
1302
0
    while (p < p_end) {
1303
0
        b = *p++;
1304
0
        type = b >> 7;
1305
0
        n = b & 0x7f;
1306
0
        if (n < 96) {
1307
0
        } else if (n < 112) {
1308
0
            n = (n - 96) << 8;
1309
0
            n |= *p++;
1310
0
            n += 96;
1311
0
        } else {
1312
0
            n = (n - 112) << 16;
1313
0
            n |= *p++ << 8;
1314
0
            n |= *p++;
1315
0
            n += 96 + (1 << 12);
1316
0
        }
1317
0
        c1 = c + n + 1;
1318
0
        if (type != 0) {
1319
0
            v = *p++;
1320
0
            if (v == script_idx || script_idx == UNICODE_SCRIPT_Unknown) {
1321
0
                if (cr_add_interval(cr1, c, c1))
1322
0
                    goto fail;
1323
0
            }
1324
0
        }
1325
0
        c = c1;
1326
0
    }
1327
0
    if (script_idx == UNICODE_SCRIPT_Unknown) {
1328
        /* Unknown is all the characters outside scripts */
1329
0
        if (cr_invert(cr1))
1330
0
            goto fail;
1331
0
    }
1332
1333
0
    if (is_ext) {
1334
        /* add the script extensions */
1335
0
        p = unicode_script_ext_table;
1336
0
        p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1337
0
        c = 0;
1338
0
        while (p < p_end) {
1339
0
            b = *p++;
1340
0
            if (b < 128) {
1341
0
                n = b;
1342
0
            } else if (b < 128 + 64) {
1343
0
                n = (b - 128) << 8;
1344
0
                n |= *p++;
1345
0
                n += 128;
1346
0
            } else {
1347
0
                n = (b - 128 - 64) << 16;
1348
0
                n |= *p++ << 8;
1349
0
                n |= *p++;
1350
0
                n += 128 + (1 << 14);
1351
0
            }
1352
0
            c1 = c + n + 1;
1353
0
            v_len = *p++;
1354
0
            if (is_common) {
1355
0
                if (v_len != 0) {
1356
0
                    if (cr_add_interval(cr2, c, c1))
1357
0
                        goto fail;
1358
0
                }
1359
0
            } else {
1360
0
                for(i = 0; i < v_len; i++) {
1361
0
                    if (p[i] == script_idx) {
1362
0
                        if (cr_add_interval(cr2, c, c1))
1363
0
                            goto fail;
1364
0
                        break;
1365
0
                    }
1366
0
                }
1367
0
            }
1368
0
            p += v_len;
1369
0
            c = c1;
1370
0
        }
1371
0
        if (is_common) {
1372
            /* remove all the characters with script extensions */
1373
0
            if (cr_invert(cr2))
1374
0
                goto fail;
1375
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1376
0
                      CR_OP_INTER))
1377
0
                goto fail;
1378
0
        } else {
1379
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1380
0
                      CR_OP_UNION))
1381
0
                goto fail;
1382
0
        }
1383
0
        cr_free(cr1);
1384
0
        cr_free(cr2);
1385
0
    }
1386
0
    return 0;
1387
0
 fail:
1388
0
    if (is_ext) {
1389
0
        cr_free(cr1);
1390
0
        cr_free(cr2);
1391
0
    }
1392
0
    goto fail;
1393
0
}
1394
1395
0
#define M(id) (1U << UNICODE_GC_ ## id)
1396
1397
static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1398
0
{
1399
0
    const uint8_t *p, *p_end;
1400
0
    uint32_t c, c0, b, n, v;
1401
1402
0
    p = unicode_gc_table;
1403
0
    p_end = unicode_gc_table + countof(unicode_gc_table);
1404
0
    c = 0;
1405
    /* Compressed range encoding:
1406
       initial byte:
1407
       bits 0..4: category number (special case 31)
1408
       bits 5..7: range length (add 1)
1409
       special case bits 5..7 == 7: read an extra byte
1410
       - 00..7F: range length (add 7 + 1)
1411
       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1412
       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1413
     */
1414
0
    while (p < p_end) {
1415
0
        b = *p++;
1416
0
        n = b >> 5;
1417
0
        v = b & 0x1f;
1418
0
        if (n == 7) {
1419
0
            n = *p++;
1420
0
            if (n < 128) {
1421
0
                n += 7;
1422
0
            } else if (n < 128 + 64) {
1423
0
                n = (n - 128) << 8;
1424
0
                n |= *p++;
1425
0
                n += 7 + 128;
1426
0
            } else {
1427
0
                n = (n - 128 - 64) << 16;
1428
0
                n |= *p++ << 8;
1429
0
                n |= *p++;
1430
0
                n += 7 + 128 + (1 << 14);
1431
0
            }
1432
0
        }
1433
0
        c0 = c;
1434
0
        c += n + 1;
1435
0
        if (v == 31) {
1436
            /* run of Lu / Ll */
1437
0
            b = gc_mask & (M(Lu) | M(Ll));
1438
0
            if (b != 0) {
1439
0
                if (b == (M(Lu) | M(Ll))) {
1440
0
                    goto add_range;
1441
0
                } else {
1442
0
                    c0 += ((gc_mask & M(Ll)) != 0);
1443
0
                    for(; c0 < c; c0 += 2) {
1444
0
                        if (cr_add_interval(cr, c0, c0 + 1))
1445
0
                            return -1;
1446
0
                    }
1447
0
                }
1448
0
            }
1449
0
        } else if ((gc_mask >> v) & 1) {
1450
0
        add_range:
1451
0
            if (cr_add_interval(cr, c0, c))
1452
0
                return -1;
1453
0
        }
1454
0
    }
1455
0
    return 0;
1456
0
}
1457
1458
static int unicode_prop1(CharRange *cr, int prop_idx)
1459
0
{
1460
0
    const uint8_t *p, *p_end;
1461
0
    uint32_t c, c0, b, bit;
1462
1463
0
    p = unicode_prop_table[prop_idx];
1464
0
    p_end = p + unicode_prop_len_table[prop_idx];
1465
0
    c = 0;
1466
0
    bit = 0;
1467
    /* Compressed range encoding:
1468
       00..3F: 2 packed lengths: 3-bit + 3-bit
1469
       40..5F: 5-bits plus extra byte for length
1470
       60..7F: 5-bits plus 2 extra bytes for length
1471
       80..FF: 7-bit length
1472
       lengths must be incremented to get character count
1473
       Ranges alternate between false and true return value.
1474
     */
1475
0
    while (p < p_end) {
1476
0
        c0 = c;
1477
0
        b = *p++;
1478
0
        if (b < 64) {
1479
0
            c += (b >> 3) + 1;
1480
0
            if (bit)  {
1481
0
                if (cr_add_interval(cr, c0, c))
1482
0
                    return -1;
1483
0
            }
1484
0
            bit ^= 1;
1485
0
            c0 = c;
1486
0
            c += (b & 7) + 1;
1487
0
        } else if (b >= 0x80) {
1488
0
            c += b - 0x80 + 1;
1489
0
        } else if (b < 0x60) {
1490
0
            c += (((b - 0x40) << 8) | p[0]) + 1;
1491
0
            p++;
1492
0
        } else {
1493
0
            c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
1494
0
            p += 2;
1495
0
        }
1496
0
        if (bit)  {
1497
0
            if (cr_add_interval(cr, c0, c))
1498
0
                return -1;
1499
0
        }
1500
0
        bit ^= 1;
1501
0
    }
1502
0
    return 0;
1503
0
}
1504
1505
typedef enum {
1506
    POP_GC,
1507
    POP_PROP,
1508
    POP_CASE,
1509
    POP_UNION,
1510
    POP_INTER,
1511
    POP_XOR,
1512
    POP_INVERT,
1513
    POP_END,
1514
} PropOPEnum;
1515
1516
#define POP_STACK_LEN_MAX 4
1517
1518
static int unicode_prop_ops(CharRange *cr, ...)
1519
0
{
1520
0
    va_list ap;
1521
0
    CharRange stack[POP_STACK_LEN_MAX];
1522
0
    int stack_len, op, ret, i;
1523
0
    uint32_t a;
1524
1525
0
    va_start(ap, cr);
1526
0
    stack_len = 0;
1527
0
    for(;;) {
1528
0
        op = va_arg(ap, int);
1529
0
        switch(op) {
1530
0
        case POP_GC:
1531
0
            assert(stack_len < POP_STACK_LEN_MAX);
1532
0
            a = va_arg(ap, int);
1533
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1534
0
            if (unicode_general_category1(&stack[stack_len - 1], a))
1535
0
                goto fail;
1536
0
            break;
1537
0
        case POP_PROP:
1538
0
            assert(stack_len < POP_STACK_LEN_MAX);
1539
0
            a = va_arg(ap, int);
1540
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1541
0
            if (unicode_prop1(&stack[stack_len - 1], a))
1542
0
                goto fail;
1543
0
            break;
1544
0
        case POP_CASE:
1545
0
            assert(stack_len < POP_STACK_LEN_MAX);
1546
0
            a = va_arg(ap, int);
1547
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1548
0
            if (unicode_case1(&stack[stack_len - 1], a))
1549
0
                goto fail;
1550
0
            break;
1551
0
        case POP_UNION:
1552
0
        case POP_INTER:
1553
0
        case POP_XOR:
1554
0
            {
1555
0
                CharRange *cr1, *cr2, *cr3;
1556
0
                assert(stack_len >= 2);
1557
0
                assert(stack_len < POP_STACK_LEN_MAX);
1558
0
                cr1 = &stack[stack_len - 2];
1559
0
                cr2 = &stack[stack_len - 1];
1560
0
                cr3 = &stack[stack_len++];
1561
0
                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1562
                /* CR_OP_XOR may be used here */
1563
0
                if (cr_op(cr3, cr1->points, cr1->len,
1564
0
                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
1565
0
                    goto fail;
1566
0
                cr_free(cr1);
1567
0
                cr_free(cr2);
1568
0
                *cr1 = *cr3;
1569
0
                stack_len -= 2;
1570
0
            }
1571
0
            break;
1572
0
        case POP_INVERT:
1573
0
            assert(stack_len >= 1);
1574
0
            if (cr_invert(&stack[stack_len - 1]))
1575
0
                goto fail;
1576
0
            break;
1577
0
        case POP_END:
1578
0
            goto done;
1579
0
        default:
1580
0
            abort();
1581
0
        }
1582
0
    }
1583
0
 done:
1584
0
    assert(stack_len == 1);
1585
0
    ret = cr_copy(cr, &stack[0]);
1586
0
    cr_free(&stack[0]);
1587
0
    return ret;
1588
0
 fail:
1589
0
    for(i = 0; i < stack_len; i++)
1590
0
        cr_free(&stack[i]);
1591
0
    return -1;
1592
0
}
1593
1594
static const uint32_t unicode_gc_mask_table[] = {
1595
    M(Lu) | M(Ll) | M(Lt), /* LC */
1596
    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
1597
    M(Mn) | M(Mc) | M(Me), /* M */
1598
    M(Nd) | M(Nl) | M(No), /* N */
1599
    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
1600
    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
1601
    M(Zs) | M(Zl) | M(Zp), /* Z */
1602
    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
1603
};
1604
1605
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1606
   if not found */
1607
int unicode_general_category(CharRange *cr, const char *gc_name)
1608
0
{
1609
0
    int gc_idx;
1610
0
    uint32_t gc_mask;
1611
1612
0
    gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1613
0
    if (gc_idx < 0)
1614
0
        return -2;
1615
0
    if (gc_idx <= UNICODE_GC_Co) {
1616
0
        gc_mask = (uint64_t)1 << gc_idx;
1617
0
    } else {
1618
0
        gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1619
0
    }
1620
0
    return unicode_general_category1(cr, gc_mask);
1621
0
}
1622
1623
1624
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1625
   if not found */
1626
int unicode_prop(CharRange *cr, const char *prop_name)
1627
0
{
1628
0
    int prop_idx, ret;
1629
1630
0
    prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1631
0
    if (prop_idx < 0)
1632
0
        return -2;
1633
0
    prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1634
1635
0
    ret = 0;
1636
0
    switch(prop_idx) {
1637
0
    case UNICODE_PROP_ASCII:
1638
0
        if (cr_add_interval(cr, 0x00, 0x7f + 1))
1639
0
            return -1;
1640
0
        break;
1641
0
    case UNICODE_PROP_Any:
1642
0
        if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1643
0
            return -1;
1644
0
        break;
1645
0
    case UNICODE_PROP_Assigned:
1646
0
        ret = unicode_prop_ops(cr,
1647
0
                               POP_GC, M(Cn),
1648
0
                               POP_INVERT,
1649
0
                               POP_END);
1650
0
        break;
1651
0
    case UNICODE_PROP_Math:
1652
0
        ret = unicode_prop_ops(cr,
1653
0
                               POP_GC, M(Sm),
1654
0
                               POP_PROP, UNICODE_PROP_Other_Math,
1655
0
                               POP_UNION,
1656
0
                               POP_END);
1657
0
        break;
1658
0
    case UNICODE_PROP_Lowercase:
1659
0
        ret = unicode_prop_ops(cr,
1660
0
                               POP_GC, M(Ll),
1661
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1662
0
                               POP_UNION,
1663
0
                               POP_END);
1664
0
        break;
1665
0
    case UNICODE_PROP_Uppercase:
1666
0
        ret = unicode_prop_ops(cr,
1667
0
                               POP_GC, M(Lu),
1668
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1669
0
                               POP_UNION,
1670
0
                               POP_END);
1671
0
        break;
1672
0
    case UNICODE_PROP_Cased:
1673
0
        ret = unicode_prop_ops(cr,
1674
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt),
1675
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1676
0
                               POP_UNION,
1677
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1678
0
                               POP_UNION,
1679
0
                               POP_END);
1680
0
        break;
1681
0
    case UNICODE_PROP_Alphabetic:
1682
0
        ret = unicode_prop_ops(cr,
1683
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1684
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1685
0
                               POP_UNION,
1686
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1687
0
                               POP_UNION,
1688
0
                               POP_PROP, UNICODE_PROP_Other_Alphabetic,
1689
0
                               POP_UNION,
1690
0
                               POP_END);
1691
0
        break;
1692
0
    case UNICODE_PROP_Grapheme_Base:
1693
0
        ret = unicode_prop_ops(cr,
1694
0
                               POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1695
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1696
0
                               POP_UNION,
1697
0
                               POP_INVERT,
1698
0
                               POP_END);
1699
0
        break;
1700
0
    case UNICODE_PROP_Grapheme_Extend:
1701
0
        ret = unicode_prop_ops(cr,
1702
0
                               POP_GC, M(Me) | M(Mn),
1703
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1704
0
                               POP_UNION,
1705
0
                               POP_END);
1706
0
        break;
1707
0
    case UNICODE_PROP_XID_Start:
1708
0
        ret = unicode_prop_ops(cr,
1709
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1710
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1711
0
                               POP_UNION,
1712
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1713
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1714
0
                               POP_UNION,
1715
0
                               POP_PROP, UNICODE_PROP_XID_Start1,
1716
0
                               POP_UNION,
1717
0
                               POP_INVERT,
1718
0
                               POP_INTER,
1719
0
                               POP_END);
1720
0
        break;
1721
0
    case UNICODE_PROP_XID_Continue:
1722
0
        ret = unicode_prop_ops(cr,
1723
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1724
0
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1725
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1726
0
                               POP_UNION,
1727
0
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1728
0
                               POP_UNION,
1729
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1730
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1731
0
                               POP_UNION,
1732
0
                               POP_PROP, UNICODE_PROP_XID_Continue1,
1733
0
                               POP_UNION,
1734
0
                               POP_INVERT,
1735
0
                               POP_INTER,
1736
0
                               POP_END);
1737
0
        break;
1738
0
    case UNICODE_PROP_Changes_When_Uppercased:
1739
0
        ret = unicode_case1(cr, CASE_U);
1740
0
        break;
1741
0
    case UNICODE_PROP_Changes_When_Lowercased:
1742
0
        ret = unicode_case1(cr, CASE_L);
1743
0
        break;
1744
0
    case UNICODE_PROP_Changes_When_Casemapped:
1745
0
        ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1746
0
        break;
1747
0
    case UNICODE_PROP_Changes_When_Titlecased:
1748
0
        ret = unicode_prop_ops(cr,
1749
0
                               POP_CASE, CASE_U,
1750
0
                               POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1751
0
                               POP_XOR,
1752
0
                               POP_END);
1753
0
        break;
1754
0
    case UNICODE_PROP_Changes_When_Casefolded:
1755
0
        ret = unicode_prop_ops(cr,
1756
0
                               POP_CASE, CASE_F,
1757
0
                               POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1758
0
                               POP_XOR,
1759
0
                               POP_END);
1760
0
        break;
1761
0
    case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1762
0
        ret = unicode_prop_ops(cr,
1763
0
                               POP_CASE, CASE_F,
1764
0
                               POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1765
0
                               POP_XOR,
1766
0
                               POP_END);
1767
0
        break;
1768
#if 0
1769
    case UNICODE_PROP_ID_Start:
1770
        ret = unicode_prop_ops(cr,
1771
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1772
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1773
                               POP_UNION,
1774
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1775
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1776
                               POP_UNION,
1777
                               POP_INVERT,
1778
                               POP_INTER,
1779
                               POP_END);
1780
        break;
1781
    case UNICODE_PROP_ID_Continue:
1782
        ret = unicode_prop_ops(cr,
1783
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1784
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1785
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1786
                               POP_UNION,
1787
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1788
                               POP_UNION,
1789
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1790
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1791
                               POP_UNION,
1792
                               POP_INVERT,
1793
                               POP_INTER,
1794
                               POP_END);
1795
        break;
1796
    case UNICODE_PROP_Case_Ignorable:
1797
        ret = unicode_prop_ops(cr,
1798
                               POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1799
                               POP_PROP, UNICODE_PROP_Case_Ignorable1,
1800
                               POP_XOR,
1801
                               POP_END);
1802
        break;
1803
#else
1804
        /* we use the existing tables */
1805
0
    case UNICODE_PROP_ID_Continue:
1806
0
        ret = unicode_prop_ops(cr,
1807
0
                               POP_PROP, UNICODE_PROP_ID_Start,
1808
0
                               POP_PROP, UNICODE_PROP_ID_Continue1,
1809
0
                               POP_XOR,
1810
0
                               POP_END);
1811
0
        break;
1812
0
#endif
1813
0
    default:
1814
0
        if (prop_idx >= countof(unicode_prop_table))
1815
0
            return -2;
1816
0
        ret = unicode_prop1(cr, prop_idx);
1817
0
        break;
1818
0
    }
1819
0
    return ret;
1820
0
}
1821
1822
#endif /* CONFIG_ALL_UNICODE */
1823
1824
/*---- lre codepoint categorizing functions ----*/
1825
1826
#define S  UNICODE_C_SPACE
1827
#define D  UNICODE_C_DIGIT
1828
#define X  UNICODE_C_XDIGIT
1829
#define U  UNICODE_C_UPPER
1830
#define L  UNICODE_C_LOWER
1831
#define _  UNICODE_C_UNDER
1832
#define d  UNICODE_C_DOLLAR
1833
1834
uint8_t const lre_ctype_bits[256] = {
1835
    0, 0, 0, 0, 0, 0, 0, 0,
1836
    0, S, S, S, S, S, 0, 0,
1837
    0, 0, 0, 0, 0, 0, 0, 0,
1838
    0, 0, 0, 0, 0, 0, 0, 0,
1839
1840
    S, 0, 0, 0, d, 0, 0, 0,
1841
    0, 0, 0, 0, 0, 0, 0, 0,
1842
    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1843
    X|D, X|D, 0, 0, 0, 0, 0, 0,
1844
1845
    0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1846
    U, U, U, U, U, U, U, U,
1847
    U, U, U, U, U, U, U, U,
1848
    U, U, U, 0, 0, 0, 0, _,
1849
1850
    0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1851
    L, L, L, L, L, L, L, L,
1852
    L, L, L, L, L, L, L, L,
1853
    L, L, L, 0, 0, 0, 0, 0,
1854
1855
    0, 0, 0, 0, 0, 0, 0, 0,
1856
    0, 0, 0, 0, 0, 0, 0, 0,
1857
    0, 0, 0, 0, 0, 0, 0, 0,
1858
    0, 0, 0, 0, 0, 0, 0, 0,
1859
1860
    S, 0, 0, 0, 0, 0, 0, 0,
1861
    0, 0, 0, 0, 0, 0, 0, 0,
1862
    0, 0, 0, 0, 0, 0, 0, 0,
1863
    0, 0, 0, 0, 0, 0, 0, 0,
1864
1865
    0, 0, 0, 0, 0, 0, 0, 0,
1866
    0, 0, 0, 0, 0, 0, 0, 0,
1867
    0, 0, 0, 0, 0, 0, 0, 0,
1868
    0, 0, 0, 0, 0, 0, 0, 0,
1869
1870
    0, 0, 0, 0, 0, 0, 0, 0,
1871
    0, 0, 0, 0, 0, 0, 0, 0,
1872
    0, 0, 0, 0, 0, 0, 0, 0,
1873
    0, 0, 0, 0, 0, 0, 0, 0,
1874
};
1875
1876
#undef S
1877
#undef D
1878
#undef X
1879
#undef U
1880
#undef L
1881
#undef _
1882
#undef d
1883
1884
/* code point ranges for Zs,Zl or Zp property */
1885
static const uint16_t char_range_s[] = {
1886
    10,
1887
    0x0009, 0x000D + 1,
1888
    0x0020, 0x0020 + 1,
1889
    0x00A0, 0x00A0 + 1,
1890
    0x1680, 0x1680 + 1,
1891
    0x2000, 0x200A + 1,
1892
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
1893
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
1894
    0x2028, 0x2029 + 1,
1895
    0x202F, 0x202F + 1,
1896
    0x205F, 0x205F + 1,
1897
    0x3000, 0x3000 + 1,
1898
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
1899
    0xFEFF, 0xFEFF + 1,
1900
};
1901
1902
BOOL lre_is_space_non_ascii(uint32_t c)
1903
1
{
1904
1
    size_t i, n;
1905
1906
1
    n = countof(char_range_s);
1907
9
    for(i = 5; i < n; i += 2) {
1908
8
        uint32_t low = char_range_s[i];
1909
8
        uint32_t high = char_range_s[i + 1];
1910
8
        if (c < low)
1911
0
            return FALSE;
1912
8
        if (c < high)
1913
0
            return TRUE;
1914
8
    }
1915
1
    return FALSE;
1916
1
}
1917
1918
#define SEQ_MAX_LEN 16
1919
1920
static int unicode_sequence_prop1(int seq_prop_idx, UnicodeSequencePropCB *cb, void *opaque,
1921
                                  CharRange *cr)
1922
0
{
1923
0
    int i, c, j;
1924
0
    uint32_t seq[SEQ_MAX_LEN];
1925
    
1926
0
    switch(seq_prop_idx) {
1927
0
    case UNICODE_SEQUENCE_PROP_Basic_Emoji:
1928
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji1) < 0)
1929
0
            return -1;
1930
0
        for(i = 0; i < cr->len; i += 2) {
1931
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1932
0
                seq[0] = c;
1933
0
                cb(opaque, seq, 1);
1934
0
            }
1935
0
        }
1936
1937
0
        cr->len = 0;
1938
1939
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji2) < 0)
1940
0
            return -1;
1941
0
        for(i = 0; i < cr->len; i += 2) {
1942
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1943
0
                seq[0] = c;
1944
0
                seq[1] = 0xfe0f;
1945
0
                cb(opaque, seq, 2);
1946
0
            }
1947
0
        }
1948
1949
0
        break;
1950
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
1951
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Modifier_Base) < 0)
1952
0
            return -1;
1953
0
        for(i = 0; i < cr->len; i += 2) {
1954
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1955
0
                for(j = 0; j < 5; j++) {
1956
0
                    seq[0] = c;
1957
0
                    seq[1] = 0x1f3fb + j;
1958
0
                    cb(opaque, seq, 2);
1959
0
                }
1960
0
            }
1961
0
        }
1962
0
        break;
1963
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
1964
0
        if (unicode_prop1(cr, UNICODE_PROP_RGI_Emoji_Flag_Sequence) < 0)
1965
0
            return -1;
1966
0
        for(i = 0; i < cr->len; i += 2) {
1967
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1968
0
                int c0, c1;
1969
0
                c0 = c / 26;
1970
0
                c1 = c % 26;
1971
0
                seq[0] = 0x1F1E6 + c0;
1972
0
                seq[1] = 0x1F1E6 + c1;
1973
0
                cb(opaque, seq, 2);
1974
0
            }
1975
0
        }
1976
0
        break;
1977
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
1978
0
        {
1979
0
            int len, code, pres, k, mod, mod_count, mod_pos[2], hc_pos, n_mod, n_hc, mod1;
1980
0
            int mod_idx, hc_idx, i0, i1;
1981
0
            const uint8_t *tab = unicode_rgi_emoji_zwj_sequence;
1982
            
1983
0
            for(i = 0; i < countof(unicode_rgi_emoji_zwj_sequence);) {
1984
0
                len = tab[i++];
1985
0
                k = 0;
1986
0
                mod = 0;
1987
0
                mod_count = 0;
1988
0
                hc_pos = -1;
1989
0
                for(j = 0; j < len; j++) {
1990
0
                    code = tab[i++];
1991
0
                    code |= tab[i++] << 8;
1992
0
                    pres = code >> 15;
1993
0
                    mod1 = (code >> 13) & 3;
1994
0
                    code &= 0x1fff;
1995
0
                    if (code < 0x1000) {
1996
0
                        c = code + 0x2000;
1997
0
                    } else {
1998
0
                        c = 0x1f000 + (code - 0x1000);
1999
0
                    }
2000
0
                    if (c == 0x1f9b0)
2001
0
                        hc_pos = k;
2002
0
                    seq[k++] = c;
2003
0
                    if (mod1 != 0) {
2004
0
                        assert(mod_count < 2);
2005
0
                        mod = mod1;
2006
0
                        mod_pos[mod_count++] = k;
2007
0
                        seq[k++] = 0; /* will be filled later */
2008
0
                    }
2009
0
                    if (pres) {
2010
0
                        seq[k++] = 0xfe0f;
2011
0
                    }
2012
0
                    if (j < len - 1) {
2013
0
                        seq[k++] = 0x200d;
2014
0
                    }
2015
0
                }
2016
2017
                /* genrate all the variants */
2018
0
                switch(mod) {
2019
0
                case 1:
2020
0
                    n_mod = 5;
2021
0
                    break;
2022
0
                case 2:
2023
0
                    n_mod = 25;
2024
0
                    break;
2025
0
                case 3:
2026
0
                    n_mod = 20;
2027
0
                    break;
2028
0
                default:
2029
0
                    n_mod = 1;
2030
0
                    break;
2031
0
                }
2032
0
                if (hc_pos >= 0)
2033
0
                    n_hc = 4;
2034
0
                else
2035
0
                    n_hc = 1;
2036
0
                for(hc_idx = 0; hc_idx < n_hc; hc_idx++) {
2037
0
                    for(mod_idx = 0; mod_idx < n_mod; mod_idx++) {
2038
0
                        if (hc_pos >= 0)
2039
0
                            seq[hc_pos] = 0x1f9b0 + hc_idx;
2040
                        
2041
0
                        switch(mod) {
2042
0
                        case 1:
2043
0
                            seq[mod_pos[0]] = 0x1f3fb + mod_idx;
2044
0
                            break;
2045
0
                        case 2:
2046
0
                        case 3:
2047
0
                            i0 = mod_idx / 5;
2048
0
                            i1 = mod_idx % 5;
2049
                            /* avoid identical values */
2050
0
                            if (mod == 3 && i0 >= i1)
2051
0
                                i0++;
2052
0
                            seq[mod_pos[0]] = 0x1f3fb + i0;
2053
0
                            seq[mod_pos[1]] = 0x1f3fb + i1;
2054
0
                            break;
2055
0
                        default:
2056
0
                            break;
2057
0
                        }
2058
#if 0
2059
                        for(j = 0; j < k; j++)
2060
                            printf(" %04x", seq[j]);
2061
                        printf("\n");
2062
#endif                
2063
0
                        cb(opaque, seq, k);
2064
0
                    }
2065
0
                }
2066
0
            }
2067
0
        }
2068
0
        break;
2069
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
2070
0
        {
2071
0
            for(i = 0; i < countof(unicode_rgi_emoji_tag_sequence);) {
2072
0
                j = 0;
2073
0
                seq[j++] = 0x1F3F4;
2074
0
                for(;;) {
2075
0
                    c = unicode_rgi_emoji_tag_sequence[i++];
2076
0
                    if (c == 0x00)
2077
0
                        break;
2078
0
                    seq[j++] = 0xe0000 + c;
2079
0
                }
2080
0
                seq[j++] = 0xe007f;
2081
0
                cb(opaque, seq, j);
2082
0
            }
2083
0
        }
2084
0
        break;
2085
0
    case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence:
2086
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Keycap_Sequence) < 0)
2087
0
            return -1;
2088
0
        for(i = 0; i < cr->len; i += 2) {
2089
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
2090
0
                seq[0] = c;
2091
0
                seq[1] = 0xfe0f;
2092
0
                seq[2] = 0x20e3;
2093
0
                cb(opaque, seq, 3);
2094
0
            }
2095
0
        }
2096
0
        break;
2097
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji:
2098
        /* all prevous sequences */
2099
0
        for(i = UNICODE_SEQUENCE_PROP_Basic_Emoji; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence; i++) {
2100
0
            int ret;
2101
0
            ret = unicode_sequence_prop1(i, cb, opaque, cr);
2102
0
            if (ret < 0)
2103
0
                return ret;
2104
0
            cr->len = 0;
2105
0
        }
2106
0
        break;
2107
0
    default:
2108
0
        return -2;
2109
0
    }
2110
0
    return 0;
2111
0
}
2112
2113
/* build a unicode sequence property */
2114
/* return -2 if not found, -1 if other error. 'cr' is used as temporary memory. */
2115
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
2116
                          CharRange *cr)
2117
0
{
2118
0
    int seq_prop_idx;
2119
0
    seq_prop_idx = unicode_find_name(unicode_sequence_prop_name_table, prop_name);
2120
0
    if (seq_prop_idx < 0)
2121
0
        return -2;
2122
0
    return unicode_sequence_prop1(seq_prop_idx, cb, opaque, cr);
2123
0
}