Coverage Report

Created: 2026-01-17 07:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/quickjs/libunicode.c
Line
Count
Source
1
/*
2
 * Unicode utilities
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <string.h>
28
#include <assert.h>
29
30
#include "cutils.h"
31
#include "libunicode.h"
32
#include "libunicode-table.h"
33
34
enum {
35
    RUN_TYPE_U,
36
    RUN_TYPE_L,
37
    RUN_TYPE_UF,
38
    RUN_TYPE_LF,
39
    RUN_TYPE_UL,
40
    RUN_TYPE_LSU,
41
    RUN_TYPE_U2L_399_EXT2,
42
    RUN_TYPE_UF_D20,
43
    RUN_TYPE_UF_D1_EXT,
44
    RUN_TYPE_U_EXT,
45
    RUN_TYPE_LF_EXT,
46
    RUN_TYPE_UF_EXT2,
47
    RUN_TYPE_LF_EXT2,
48
    RUN_TYPE_UF_EXT3,
49
};
50
51
static int lre_case_conv1(uint32_t c, int conv_type)
52
0
{
53
0
    uint32_t res[LRE_CC_RES_LEN_MAX];
54
0
    lre_case_conv(res, c, conv_type);
55
0
    return res[0];
56
0
}
57
58
/* case conversion using the table entry 'idx' with value 'v' */
59
static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
60
92.1M
{
61
92.1M
    uint32_t code, data, type, a, is_lower;
62
92.1M
    is_lower = (conv_type != 0);
63
92.1M
    type = (v >> (32 - 17 - 7 - 4)) & 0xf;
64
92.1M
    data = ((v & 0xf) << 8) | case_conv_table2[idx];
65
92.1M
    code = v >> (32 - 17);
66
92.1M
    switch(type) {
67
35.6M
    case RUN_TYPE_U:
68
35.6M
    case RUN_TYPE_L:
69
41.9M
    case RUN_TYPE_UF:
70
41.9M
    case RUN_TYPE_LF:
71
41.9M
        if (conv_type == (type & 1) ||
72
41.9M
            (type >= RUN_TYPE_UF && conv_type == 2)) {
73
41.9M
            c = c - code + (case_conv_table1[data] >> (32 - 17));
74
41.9M
        }
75
41.9M
        break;
76
40.3M
    case RUN_TYPE_UL:
77
40.3M
        a = c - code;
78
40.3M
        if ((a & 1) != (1 - is_lower))
79
2.32k
            break;
80
40.3M
        c = (a ^ 1) + code;
81
40.3M
        break;
82
606k
    case RUN_TYPE_LSU:
83
606k
        a = c - code;
84
606k
        if (a == 1) {
85
303k
            c += 2 * is_lower - 1;
86
303k
        } else if (a == (1 - is_lower) * 2) {
87
303k
            c += (2 * is_lower - 1) * 2;
88
303k
        }
89
606k
        break;
90
1.98M
    case RUN_TYPE_U2L_399_EXT2:
91
1.98M
        if (!is_lower) {
92
1.98M
            res[0] = c - code + case_conv_ext[data >> 6];
93
1.98M
            res[1] = 0x399;
94
1.98M
            return 2;
95
1.98M
        } else {
96
0
            c = c - code + case_conv_ext[data & 0x3f];
97
0
        }
98
0
        break;
99
1.40M
    case RUN_TYPE_UF_D20:
100
1.40M
        if (conv_type == 1)
101
0
            break;
102
1.40M
        c = data + (conv_type == 2) * 0x20;
103
1.40M
        break;
104
220k
    case RUN_TYPE_UF_D1_EXT:
105
220k
        if (conv_type == 1)
106
0
            break;
107
220k
        c = case_conv_ext[data] + (conv_type == 2);
108
220k
        break;
109
75.7k
    case RUN_TYPE_U_EXT:
110
76.0k
    case RUN_TYPE_LF_EXT:
111
76.0k
        if (is_lower != (type - RUN_TYPE_U_EXT))
112
270
            break;
113
75.7k
        c = case_conv_ext[data];
114
75.7k
        break;
115
0
    case RUN_TYPE_LF_EXT2:
116
0
        if (!is_lower)
117
0
            break;
118
0
        res[0] = c - code + case_conv_ext[data >> 6];
119
0
        res[1] = case_conv_ext[data & 0x3f];
120
0
        return 2;
121
4.34M
    case RUN_TYPE_UF_EXT2:
122
4.34M
        if (conv_type == 1)
123
0
            break;
124
4.34M
        res[0] = c - code + case_conv_ext[data >> 6];
125
4.34M
        res[1] = case_conv_ext[data & 0x3f];
126
4.34M
        if (conv_type == 2) {
127
            /* convert to lower */
128
0
            res[0] = lre_case_conv1(res[0], 1);
129
0
            res[1] = lre_case_conv1(res[1], 1);
130
0
        }
131
4.34M
        return 2;
132
0
    default:
133
1.17M
    case RUN_TYPE_UF_EXT3:
134
1.17M
        if (conv_type == 1)
135
0
            break;
136
1.17M
        res[0] = case_conv_ext[data >> 8];
137
1.17M
        res[1] = case_conv_ext[(data >> 4) & 0xf];
138
1.17M
        res[2] = case_conv_ext[data & 0xf];
139
1.17M
        if (conv_type == 2) {
140
            /* convert to lower */
141
0
            res[0] = lre_case_conv1(res[0], 1);
142
0
            res[1] = lre_case_conv1(res[1], 1);
143
0
            res[2] = lre_case_conv1(res[2], 1);
144
0
        }
145
1.17M
        return 3;
146
92.1M
    }
147
84.6M
    res[0] = c;
148
84.6M
    return 1;
149
92.1M
}
150
151
/* conv_type:
152
   0 = to upper
153
   1 = to lower
154
   2 = case folding (= to lower with modifications)
155
*/
156
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
157
0
{
158
0
    if (c < 128) {
159
0
        if (conv_type) {
160
0
            if (c >= 'A' && c <= 'Z') {
161
0
                c = c - 'A' + 'a';
162
0
            }
163
0
        } else {
164
0
            if (c >= 'a' && c <= 'z') {
165
0
                c = c - 'a' + 'A';
166
0
            }
167
0
        }
168
0
    } else {
169
0
        uint32_t v, code, len;
170
0
        int idx, idx_min, idx_max;
171
172
0
        idx_min = 0;
173
0
        idx_max = countof(case_conv_table1) - 1;
174
0
        while (idx_min <= idx_max) {
175
0
            idx = (unsigned)(idx_max + idx_min) / 2;
176
0
            v = case_conv_table1[idx];
177
0
            code = v >> (32 - 17);
178
0
            len = (v >> (32 - 17 - 7)) & 0x7f;
179
0
            if (c < code) {
180
0
                idx_max = idx - 1;
181
0
            } else if (c >= code + len) {
182
0
                idx_min = idx + 1;
183
0
            } else {
184
0
                return lre_case_conv_entry(res, c, conv_type, idx, v);
185
0
            }
186
0
        }
187
0
    }
188
0
    res[0] = c;
189
0
    return 1;
190
0
}
191
192
static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
193
94.1M
{
194
94.1M
    uint32_t res[LRE_CC_RES_LEN_MAX];
195
94.1M
    int len;
196
197
94.1M
    if (is_unicode) {
198
0
        len = lre_case_conv_entry(res, c, 2, idx, v);
199
0
        if (len == 1) {
200
0
            c = res[0];
201
0
        } else {
202
            /* handle the few specific multi-character cases (see
203
               unicode_gen.c:dump_case_folding_special_cases()) */
204
0
            if (c == 0xfb06) {
205
0
                c = 0xfb05;
206
0
            } else if (c == 0x01fd3) {
207
0
                c = 0x390;
208
0
            } else if (c == 0x01fe3) {
209
0
                c = 0x3b0;
210
0
            }
211
0
        }
212
94.1M
    } else {
213
94.1M
        if (likely(c < 128)) {
214
1.96M
            if (c >= 'a' && c <= 'z')
215
1.96M
                c = c - 'a' + 'A';
216
92.1M
        } else {
217
            /* legacy regexp: to upper case if single char >= 128 */
218
92.1M
            len = lre_case_conv_entry(res, c, FALSE, idx, v);
219
92.1M
            if (len == 1 && res[0] >= 128)
220
84.5M
                c = res[0];
221
92.1M
        }
222
94.1M
    }
223
94.1M
    return c;
224
94.1M
}
225
226
/* JS regexp specific rules for case folding */
227
int lre_canonicalize(uint32_t c, BOOL is_unicode)
228
2.28M
{
229
2.28M
    if (c < 128) {
230
        /* fast case */
231
2.23M
        if (is_unicode) {
232
0
            if (c >= 'A' && c <= 'Z') {
233
0
                c = c - 'A' + 'a';
234
0
            }
235
2.23M
        } else {
236
2.23M
            if (c >= 'a' && c <= 'z') {
237
1.26M
                c = c - 'a' + 'A';
238
1.26M
            }
239
2.23M
        }
240
2.23M
    } else {
241
53.2k
        uint32_t v, code, len;
242
53.2k
        int idx, idx_min, idx_max;
243
244
53.2k
        idx_min = 0;
245
53.2k
        idx_max = countof(case_conv_table1) - 1;
246
528k
        while (idx_min <= idx_max) {
247
478k
            idx = (unsigned)(idx_max + idx_min) / 2;
248
478k
            v = case_conv_table1[idx];
249
478k
            code = v >> (32 - 17);
250
478k
            len = (v >> (32 - 17 - 7)) & 0x7f;
251
478k
            if (c < code) {
252
59.1k
                idx_max = idx - 1;
253
419k
            } else if (c >= code + len) {
254
416k
                idx_min = idx + 1;
255
416k
            } else {
256
2.60k
                return lre_case_folding_entry(c, idx, v, is_unicode);
257
2.60k
            }
258
478k
        }
259
53.2k
    }
260
2.28M
    return c;
261
2.28M
}
262
263
static uint32_t get_le24(const uint8_t *ptr)
264
2.32k
{
265
2.32k
    return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
266
2.32k
}
267
268
37
#define UNICODE_INDEX_BLOCK_LEN 32
269
270
/* return -1 if not in table, otherwise the offset in the block */
271
static int get_index_pos(uint32_t *pcode, uint32_t c,
272
                         const uint8_t *index_table, int index_table_len)
273
2.06k
{
274
2.06k
    uint32_t code, v;
275
2.06k
    int idx_min, idx_max, idx;
276
277
2.06k
    idx_min = 0;
278
2.06k
    v = get_le24(index_table);
279
2.06k
    code = v & ((1 << 21) - 1);
280
2.06k
    if (c < code) {
281
2.02k
        *pcode = 0;
282
2.02k
        return 0;
283
2.02k
    }
284
37
    idx_max = index_table_len - 1;
285
37
    code = get_le24(index_table + idx_max * 3);
286
37
    if (c >= code)
287
0
        return -1;
288
    /* invariant: tab[idx_min] <= c < tab2[idx_max] */
289
222
    while ((idx_max - idx_min) > 1) {
290
185
        idx = (idx_max + idx_min) / 2;
291
185
        v = get_le24(index_table + idx * 3);
292
185
        code = v & ((1 << 21) - 1);
293
185
        if (c < code) {
294
179
            idx_max = idx;
295
179
        } else {
296
6
            idx_min = idx;
297
6
        }
298
185
    }
299
37
    v = get_le24(index_table + idx_min * 3);
300
37
    *pcode = v & ((1 << 21) - 1);
301
37
    return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
302
37
}
303
304
static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305
                            const uint8_t *index_table, int index_table_len)
306
2.06k
{
307
2.06k
    uint32_t code, b, bit;
308
2.06k
    int pos;
309
2.06k
    const uint8_t *p;
310
311
2.06k
    pos = get_index_pos(&code, c, index_table, index_table_len);
312
2.06k
    if (pos < 0)
313
0
        return FALSE; /* outside the table */
314
2.06k
    p = table + pos;
315
2.06k
    bit = 0;
316
    /* Compressed run length encoding:
317
       00..3F: 2 packed lengths: 3-bit + 3-bit
318
       40..5F: 5-bits plus extra byte for length
319
       60..7F: 5-bits plus 2 extra bytes for length
320
       80..FF: 7-bit length
321
       lengths must be incremented to get character count
322
       Ranges alternate between false and true return value.
323
     */
324
28.6k
    for(;;) {
325
28.6k
        b = *p++;
326
28.6k
        if (b < 64) {
327
4.08k
            code += (b >> 3) + 1;
328
4.08k
            if (c < code)
329
0
                return bit;
330
4.08k
            bit ^= 1;
331
4.08k
            code += (b & 7) + 1;
332
24.5k
        } else if (b >= 0x80) {
333
22.4k
            code += b - 0x80 + 1;
334
22.4k
        } else if (b < 0x60) {
335
2.08k
            code += (((b - 0x40) << 8) | p[0]) + 1;
336
2.08k
            p++;
337
2.08k
        } else {
338
0
            code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
339
0
            p += 2;
340
0
        }
341
28.6k
        if (c < code)
342
2.06k
            return bit;
343
26.5k
        bit ^= 1;
344
26.5k
    }
345
2.06k
}
346
347
BOOL lre_is_cased(uint32_t c)
348
0
{
349
0
    uint32_t v, code, len;
350
0
    int idx, idx_min, idx_max;
351
352
0
    idx_min = 0;
353
0
    idx_max = countof(case_conv_table1) - 1;
354
0
    while (idx_min <= idx_max) {
355
0
        idx = (unsigned)(idx_max + idx_min) / 2;
356
0
        v = case_conv_table1[idx];
357
0
        code = v >> (32 - 17);
358
0
        len = (v >> (32 - 17 - 7)) & 0x7f;
359
0
        if (c < code) {
360
0
            idx_max = idx - 1;
361
0
        } else if (c >= code + len) {
362
0
            idx_min = idx + 1;
363
0
        } else {
364
0
            return TRUE;
365
0
        }
366
0
    }
367
0
    return lre_is_in_table(c, unicode_prop_Cased1_table,
368
0
                           unicode_prop_Cased1_index,
369
0
                           sizeof(unicode_prop_Cased1_index) / 3);
370
0
}
371
372
BOOL lre_is_case_ignorable(uint32_t c)
373
0
{
374
0
    return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
375
0
                           unicode_prop_Case_Ignorable_index,
376
0
                           sizeof(unicode_prop_Case_Ignorable_index) / 3);
377
0
}
378
379
/* character range */
380
381
static __maybe_unused void cr_dump(CharRange *cr)
382
0
{
383
0
    int i;
384
0
    for(i = 0; i < cr->len; i++)
385
0
        printf("%d: 0x%04x\n", i, cr->points[i]);
386
0
}
387
388
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
389
0
{
390
0
    return realloc(ptr, size);
391
0
}
392
393
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
394
1.01M
{
395
1.01M
    cr->len = cr->size = 0;
396
1.01M
    cr->points = NULL;
397
1.01M
    cr->mem_opaque = mem_opaque;
398
1.01M
    cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
399
1.01M
}
400
401
void cr_free(CharRange *cr)
402
3.03M
{
403
3.03M
    cr->realloc_func(cr->mem_opaque, cr->points, 0);
404
3.03M
}
405
406
int cr_realloc(CharRange *cr, int size)
407
26.6M
{
408
26.6M
    int new_size;
409
26.6M
    uint32_t *new_buf;
410
411
26.6M
    if (size > cr->size) {
412
26.3M
        new_size = max_int(size, cr->size * 3 / 2);
413
26.3M
        new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
414
26.3M
                                   new_size * sizeof(cr->points[0]));
415
26.3M
        if (!new_buf)
416
0
            return -1;
417
26.3M
        cr->points = new_buf;
418
26.3M
        cr->size = new_size;
419
26.3M
    }
420
26.6M
    return 0;
421
26.6M
}
422
423
int cr_copy(CharRange *cr, const CharRange *cr1)
424
0
{
425
0
    if (cr_realloc(cr, cr1->len))
426
0
        return -1;
427
0
    memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
428
0
    cr->len = cr1->len;
429
0
    return 0;
430
0
}
431
432
/* merge consecutive intervals and remove empty intervals */
433
static void cr_compress(CharRange *cr)
434
2.59M
{
435
2.59M
    int i, j, k, len;
436
2.59M
    uint32_t *pt;
437
438
2.59M
    pt = cr->points;
439
2.59M
    len = cr->len;
440
2.59M
    i = 0;
441
2.59M
    j = 0;
442
2.59M
    k = 0;
443
366M
    while ((i + 1) < len) {
444
363M
        if (pt[i] == pt[i + 1]) {
445
            /* empty interval */
446
13.6M
            i += 2;
447
350M
        } else {
448
350M
            j = i;
449
358M
            while ((j + 3) < len && pt[j + 1] == pt[j + 2])
450
8.49M
                j += 2;
451
            /* just copy */
452
350M
            pt[k] = pt[i];
453
350M
            pt[k + 1] = pt[j + 1];
454
350M
            k += 2;
455
350M
            i = j + 2;
456
350M
        }
457
363M
    }
458
2.59M
    cr->len = k;
459
2.59M
}
460
461
/* union or intersection */
462
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
463
          const uint32_t *b_pt, int b_len, int op)
464
2.37M
{
465
2.37M
    int a_idx, b_idx, is_in;
466
2.37M
    uint32_t v;
467
468
2.37M
    a_idx = 0;
469
2.37M
    b_idx = 0;
470
703M
    for(;;) {
471
        /* get one more point from a or b in increasing order */
472
703M
        if (a_idx < a_len && b_idx < b_len) {
473
351M
            if (a_pt[a_idx] < b_pt[b_idx]) {
474
251M
                goto a_add;
475
251M
            } else if (a_pt[a_idx] == b_pt[b_idx]) {
476
86.0M
                v = a_pt[a_idx];
477
86.0M
                a_idx++;
478
86.0M
                b_idx++;
479
86.0M
            } else {
480
13.9M
                goto b_add;
481
13.9M
            }
482
352M
        } else if (a_idx < a_len) {
483
511M
        a_add:
484
511M
            v = a_pt[a_idx++];
485
511M
        } else if (b_idx < b_len) {
486
103M
        b_add:
487
103M
            v = b_pt[b_idx++];
488
103M
        } else {
489
2.37M
            break;
490
2.37M
        }
491
        /* add the point if the in/out status changes */
492
701M
        switch(op) {
493
373M
        case CR_OP_UNION:
494
373M
            is_in = (a_idx & 1) | (b_idx & 1);
495
373M
            break;
496
328M
        case CR_OP_INTER:
497
328M
            is_in = (a_idx & 1) & (b_idx & 1);
498
328M
            break;
499
0
        case CR_OP_XOR:
500
0
            is_in = (a_idx & 1) ^ (b_idx & 1);
501
0
            break;
502
0
        case CR_OP_SUB:
503
0
            is_in = (a_idx & 1) & ((b_idx & 1) ^ 1);
504
0
            break;
505
0
        default:
506
0
            abort();
507
701M
        }
508
701M
        if (is_in != (cr->len & 1)) {
509
564M
            if (cr_add_point(cr, v))
510
0
                return -1;
511
564M
        }
512
701M
    }
513
2.37M
    cr_compress(cr);
514
2.37M
    return 0;
515
2.37M
}
516
517
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op)
518
2.01M
{
519
2.01M
    CharRange a = *cr;
520
2.01M
    int ret;
521
2.01M
    cr->len = 0;
522
2.01M
    cr->size = 0;
523
2.01M
    cr->points = NULL;
524
2.01M
    ret = cr_op(cr, a.points, a.len, b_pt, b_len, op);
525
2.01M
    cr_free(&a);
526
2.01M
    return ret;
527
2.01M
}
528
529
int cr_invert(CharRange *cr)
530
218k
{
531
218k
    int len;
532
218k
    len = cr->len;
533
218k
    if (cr_realloc(cr, len + 2))
534
0
        return -1;
535
218k
    memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
536
218k
    cr->points[0] = 0;
537
218k
    cr->points[len + 1] = UINT32_MAX;
538
218k
    cr->len = len + 2;
539
218k
    cr_compress(cr);
540
218k
    return 0;
541
218k
}
542
543
15.5M
#define CASE_U (1 << 0)
544
7.45M
#define CASE_L (1 << 1)
545
7.45M
#define CASE_F (1 << 2)
546
547
/* use the case conversion table to generate range of characters.
548
   CASE_U: set char if modified by uppercasing,
549
   CASE_L: set char if modified by lowercasing,
550
   CASE_F: set char if modified by case folding,
551
 */
552
static int unicode_case1(CharRange *cr, int case_mask)
553
118k
{
554
3.43M
#define MR(x) (1 << RUN_TYPE_ ## x)
555
118k
    const uint32_t tab_run_mask[3] = {
556
118k
        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
557
118k
        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
558
559
118k
        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
560
561
118k
        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
562
118k
    };
563
118k
#undef MR
564
118k
    uint32_t mask, v, code, type, len, i, idx;
565
566
118k
    if (case_mask == 0)
567
0
        return 0;
568
118k
    mask = 0;
569
473k
    for(i = 0; i < 3; i++) {
570
355k
        if ((case_mask >> i) & 1)
571
118k
            mask |= tab_run_mask[i];
572
355k
    }
573
44.8M
    for(idx = 0; idx < countof(case_conv_table1); idx++) {
574
44.7M
        v = case_conv_table1[idx];
575
44.7M
        type = (v >> (32 - 17 - 7 - 4)) & 0xf;
576
44.7M
        code = v >> (32 - 17);
577
44.7M
        len = (v >> (32 - 17 - 7)) & 0x7f;
578
44.7M
        if ((mask >> type) & 1) {
579
            //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
580
30.6M
            switch(type) {
581
6.98M
            case RUN_TYPE_UL:
582
6.98M
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
583
0
                    goto def_case;
584
6.98M
                code += ((case_mask & CASE_U) != 0);
585
71.5M
                for(i = 0; i < len; i += 2) {
586
64.5M
                    if (cr_add_interval(cr, code + i, code + i + 1))
587
0
                        return -1;
588
64.5M
                }
589
6.98M
                break;
590
6.98M
            case RUN_TYPE_LSU:
591
473k
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
592
0
                    goto def_case;
593
473k
                if (!(case_mask & CASE_U)) {
594
0
                    if (cr_add_interval(cr, code, code + 1))
595
0
                        return -1;
596
0
                }
597
473k
                if (cr_add_interval(cr, code + 1, code + 2))
598
0
                    return -1;
599
473k
                if (case_mask & CASE_U) {
600
473k
                    if (cr_add_interval(cr, code + 2, code + 3))
601
0
                        return -1;
602
473k
                }
603
473k
                break;
604
23.2M
            default:
605
23.2M
            def_case:
606
23.2M
                if (cr_add_interval(cr, code, code + len))
607
0
                    return -1;
608
23.2M
                break;
609
30.6M
            }
610
30.6M
        }
611
44.7M
    }
612
118k
    return 0;
613
118k
}
614
615
static int point_cmp(const void *p1, const void *p2, void *arg)
616
429M
{
617
429M
    uint32_t v1 = *(uint32_t *)p1;
618
429M
    uint32_t v2 = *(uint32_t *)p2;
619
429M
    return (v1 > v2) - (v1 < v2);
620
429M
}
621
622
static void cr_sort_and_remove_overlap(CharRange *cr)
623
118k
{
624
118k
    uint32_t start, end, start1, end1, i, j;
625
626
    /* the resulting ranges are not necessarily sorted and may overlap */
627
118k
    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
628
118k
    j = 0;
629
46.0M
    for(i = 0; i < cr->len; ) {
630
45.9M
        start = cr->points[i];
631
45.9M
        end = cr->points[i + 1];
632
45.9M
        i += 2;
633
51.7M
        while (i < cr->len) {
634
51.6M
            start1 = cr->points[i];
635
51.6M
            end1 = cr->points[i + 1];
636
51.6M
            if (start1 > end) {
637
                /* |------|
638
                 *           |-------| */
639
45.7M
                break;
640
45.7M
            } else if (end1 <= end) {
641
                /* |------|
642
                 *    |--| */
643
1.70M
                i += 2;
644
4.15M
            } else {
645
                /* |------|
646
                 *     |-------| */
647
4.15M
                end = end1;
648
4.15M
                i += 2;
649
4.15M
            }
650
51.6M
        }
651
45.9M
        cr->points[j] = start;
652
45.9M
        cr->points[j + 1] = end;
653
45.9M
        j += 2;
654
45.9M
    }
655
118k
    cr->len = j;
656
118k
}
657
658
/* canonicalize a character set using the JS regex case folding rules
659
   (see lre_canonicalize()) */
660
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
661
118k
{
662
118k
    CharRange cr_inter, cr_mask, cr_result, cr_sub;
663
118k
    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
664
665
118k
    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
666
118k
    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
667
118k
    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
668
118k
    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
669
670
118k
    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
671
0
        goto fail;
672
118k
    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
673
0
        goto fail;
674
675
118k
    if (cr_invert(&cr_mask))
676
0
        goto fail;
677
118k
    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
678
0
        goto fail;
679
680
    /* cr_inter = cr & cr_mask */
681
    /* cr_sub = cr & ~cr_mask */
682
683
    /* use the case conversion table to compute the result */
684
118k
    d_start = -1;
685
118k
    d_end = -1;
686
118k
    idx = 0;
687
118k
    v = case_conv_table1[idx];
688
118k
    code = v >> (32 - 17);
689
118k
    len = (v >> (32 - 17 - 7)) & 0x7f;
690
46.2M
    for(i = 0; i < cr_inter.len; i += 2) {
691
46.0M
        start = cr_inter.points[i];
692
46.0M
        end = cr_inter.points[i + 1];
693
694
140M
        for(c = start; c < end; c++) {
695
122M
            for(;;) {
696
122M
                if (c >= code && c < code + len)
697
94.1M
                    break;
698
28.4M
                idx++;
699
28.4M
                assert(idx < countof(case_conv_table1));
700
28.4M
                v = case_conv_table1[idx];
701
28.4M
                code = v >> (32 - 17);
702
28.4M
                len = (v >> (32 - 17 - 7)) & 0x7f;
703
28.4M
            }
704
94.1M
            d = lre_case_folding_entry(c, idx, v, is_unicode);
705
            /* try to merge with the current interval */
706
94.1M
            if (d_start == -1) {
707
113k
                d_start = d;
708
113k
                d_end = d + 1;
709
94.0M
            } else if (d_end == d) {
710
42.3M
                d_end++;
711
51.6M
            } else {
712
51.6M
                cr_add_interval(&cr_result, d_start, d_end);
713
51.6M
                d_start = d;
714
51.6M
                d_end = d + 1;
715
51.6M
            }
716
94.1M
        }
717
46.0M
    }
718
118k
    if (d_start != -1) {
719
113k
        if (cr_add_interval(&cr_result, d_start, d_end))
720
0
            goto fail;
721
113k
    }
722
723
    /* the resulting ranges are not necessarily sorted and may overlap */
724
118k
    cr_sort_and_remove_overlap(&cr_result);
725
726
    /* or with the character not affected by the case folding */
727
118k
    cr->len = 0;
728
118k
    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
729
0
        goto fail;
730
731
118k
    cr_free(&cr_inter);
732
118k
    cr_free(&cr_mask);
733
118k
    cr_free(&cr_result);
734
118k
    cr_free(&cr_sub);
735
118k
    return 0;
736
0
 fail:
737
0
    cr_free(&cr_inter);
738
0
    cr_free(&cr_mask);
739
0
    cr_free(&cr_result);
740
0
    cr_free(&cr_sub);
741
0
    return -1;
742
118k
}
743
744
#ifdef CONFIG_ALL_UNICODE
745
746
BOOL lre_is_id_start(uint32_t c)
747
2.06k
{
748
2.06k
    return lre_is_in_table(c, unicode_prop_ID_Start_table,
749
2.06k
                           unicode_prop_ID_Start_index,
750
2.06k
                           sizeof(unicode_prop_ID_Start_index) / 3);
751
2.06k
}
752
753
BOOL lre_is_id_continue(uint32_t c)
754
1.99k
{
755
1.99k
    return lre_is_id_start(c) ||
756
0
        lre_is_in_table(c, unicode_prop_ID_Continue1_table,
757
0
                        unicode_prop_ID_Continue1_index,
758
0
                        sizeof(unicode_prop_ID_Continue1_index) / 3);
759
1.99k
}
760
761
#define UNICODE_DECOMP_LEN_MAX 18
762
763
typedef enum {
764
    DECOMP_TYPE_C1, /* 16 bit char */
765
    DECOMP_TYPE_L1, /* 16 bit char table */
766
    DECOMP_TYPE_L2,
767
    DECOMP_TYPE_L3,
768
    DECOMP_TYPE_L4,
769
    DECOMP_TYPE_L5, /* XXX: not used */
770
    DECOMP_TYPE_L6, /* XXX: could remove */
771
    DECOMP_TYPE_L7, /* XXX: could remove */
772
    DECOMP_TYPE_LL1, /* 18 bit char table */
773
    DECOMP_TYPE_LL2,
774
    DECOMP_TYPE_S1, /* 8 bit char table */
775
    DECOMP_TYPE_S2,
776
    DECOMP_TYPE_S3,
777
    DECOMP_TYPE_S4,
778
    DECOMP_TYPE_S5,
779
    DECOMP_TYPE_I1, /* increment 16 bit char value */
780
    DECOMP_TYPE_I2_0,
781
    DECOMP_TYPE_I2_1,
782
    DECOMP_TYPE_I3_1,
783
    DECOMP_TYPE_I3_2,
784
    DECOMP_TYPE_I4_1,
785
    DECOMP_TYPE_I4_2,
786
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
787
    DECOMP_TYPE_B2,
788
    DECOMP_TYPE_B3,
789
    DECOMP_TYPE_B4,
790
    DECOMP_TYPE_B5,
791
    DECOMP_TYPE_B6,
792
    DECOMP_TYPE_B7,
793
    DECOMP_TYPE_B8,
794
    DECOMP_TYPE_B18,
795
    DECOMP_TYPE_LS2,
796
    DECOMP_TYPE_PAT3,
797
    DECOMP_TYPE_S2_UL,
798
    DECOMP_TYPE_LS2_UL,
799
} DecompTypeEnum;
800
801
static uint32_t unicode_get_short_code(uint32_t c)
802
0
{
803
0
    static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
804
805
0
    if (c < 0x80)
806
0
        return c;
807
0
    else if (c < 0x80 + 0x50)
808
0
        return c - 0x80 + 0x300;
809
0
    else
810
0
        return unicode_short_table[c - 0x80 - 0x50];
811
0
}
812
813
static uint32_t unicode_get_lower_simple(uint32_t c)
814
0
{
815
0
    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
816
0
        c += 0x20;
817
0
    else
818
0
        c++;
819
0
    return c;
820
0
}
821
822
static uint16_t unicode_get16(const uint8_t *p)
823
0
{
824
0
    return p[0] | (p[1] << 8);
825
0
}
826
827
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
828
                                int idx, uint32_t code, uint32_t len,
829
                                uint32_t type)
830
0
{
831
0
    uint32_t c1;
832
0
    int l, i, p;
833
0
    const uint8_t *d;
834
835
0
    if (type == DECOMP_TYPE_C1) {
836
0
        res[0] = unicode_decomp_table2[idx];
837
0
        return 1;
838
0
    } else {
839
0
        d = unicode_decomp_data + unicode_decomp_table2[idx];
840
0
        switch(type) {
841
0
        case DECOMP_TYPE_L1:
842
0
        case DECOMP_TYPE_L2:
843
0
        case DECOMP_TYPE_L3:
844
0
        case DECOMP_TYPE_L4:
845
0
        case DECOMP_TYPE_L5:
846
0
        case DECOMP_TYPE_L6:
847
0
        case DECOMP_TYPE_L7:
848
0
            l = type - DECOMP_TYPE_L1 + 1;
849
0
            d += (c - code) * l * 2;
850
0
            for(i = 0; i < l; i++) {
851
0
                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
852
0
                    return 0;
853
0
            }
854
0
            return l;
855
0
        case DECOMP_TYPE_LL1:
856
0
        case DECOMP_TYPE_LL2:
857
0
            {
858
0
                uint32_t k, p;
859
0
                l = type - DECOMP_TYPE_LL1 + 1;
860
0
                k = (c - code) * l;
861
0
                p = len * l * 2;
862
0
                for(i = 0; i < l; i++) {
863
0
                    c1 = unicode_get16(d + 2 * k) |
864
0
                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
865
0
                    if (!c1)
866
0
                        return 0;
867
0
                    res[i] = c1;
868
0
                    k++;
869
0
                }
870
0
            }
871
0
            return l;
872
0
        case DECOMP_TYPE_S1:
873
0
        case DECOMP_TYPE_S2:
874
0
        case DECOMP_TYPE_S3:
875
0
        case DECOMP_TYPE_S4:
876
0
        case DECOMP_TYPE_S5:
877
0
            l = type - DECOMP_TYPE_S1 + 1;
878
0
            d += (c - code) * l;
879
0
            for(i = 0; i < l; i++) {
880
0
                if ((res[i] = unicode_get_short_code(d[i])) == 0)
881
0
                    return 0;
882
0
            }
883
0
            return l;
884
0
        case DECOMP_TYPE_I1:
885
0
            l = 1;
886
0
            p = 0;
887
0
            goto decomp_type_i;
888
0
        case DECOMP_TYPE_I2_0:
889
0
        case DECOMP_TYPE_I2_1:
890
0
        case DECOMP_TYPE_I3_1:
891
0
        case DECOMP_TYPE_I3_2:
892
0
        case DECOMP_TYPE_I4_1:
893
0
        case DECOMP_TYPE_I4_2:
894
0
            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
895
0
            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
896
0
        decomp_type_i:
897
0
            for(i = 0; i < l; i++) {
898
0
                c1 = unicode_get16(d + 2 * i);
899
0
                if (i == p)
900
0
                    c1 += c - code;
901
0
                res[i] = c1;
902
0
            }
903
0
            return l;
904
0
        case DECOMP_TYPE_B18:
905
0
            l = 18;
906
0
            goto decomp_type_b;
907
0
        case DECOMP_TYPE_B1:
908
0
        case DECOMP_TYPE_B2:
909
0
        case DECOMP_TYPE_B3:
910
0
        case DECOMP_TYPE_B4:
911
0
        case DECOMP_TYPE_B5:
912
0
        case DECOMP_TYPE_B6:
913
0
        case DECOMP_TYPE_B7:
914
0
        case DECOMP_TYPE_B8:
915
0
            l = type - DECOMP_TYPE_B1 + 1;
916
0
        decomp_type_b:
917
0
            {
918
0
                uint32_t c_min;
919
0
                c_min = unicode_get16(d);
920
0
                d += 2 + (c - code) * l;
921
0
                for(i = 0; i < l; i++) {
922
0
                    c1 = d[i];
923
0
                    if (c1 == 0xff)
924
0
                        c1 = 0x20;
925
0
                    else
926
0
                        c1 += c_min;
927
0
                    res[i] = c1;
928
0
                }
929
0
            }
930
0
            return l;
931
0
        case DECOMP_TYPE_LS2:
932
0
            d += (c - code) * 3;
933
0
            if (!(res[0] = unicode_get16(d)))
934
0
                return 0;
935
0
            res[1] = unicode_get_short_code(d[2]);
936
0
            return 2;
937
0
        case DECOMP_TYPE_PAT3:
938
0
            res[0] = unicode_get16(d);
939
0
            res[2] = unicode_get16(d + 2);
940
0
            d += 4 + (c - code) * 2;
941
0
            res[1] = unicode_get16(d);
942
0
            return 3;
943
0
        case DECOMP_TYPE_S2_UL:
944
0
        case DECOMP_TYPE_LS2_UL:
945
0
            c1 = c - code;
946
0
            if (type == DECOMP_TYPE_S2_UL) {
947
0
                d += c1 & ~1;
948
0
                c = unicode_get_short_code(*d);
949
0
                d++;
950
0
            } else {
951
0
                d += (c1 >> 1) * 3;
952
0
                c = unicode_get16(d);
953
0
                d += 2;
954
0
            }
955
0
            if (c1 & 1)
956
0
                c = unicode_get_lower_simple(c);
957
0
            res[0] = c;
958
0
            res[1] = unicode_get_short_code(*d);
959
0
            return 2;
960
0
        }
961
0
    }
962
0
    return 0;
963
0
}
964
965
966
/* return the length of the decomposition (length <=
967
   UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
968
static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
969
0
{
970
0
    uint32_t v, type, is_compat, code, len;
971
0
    int idx_min, idx_max, idx;
972
973
0
    idx_min = 0;
974
0
    idx_max = countof(unicode_decomp_table1) - 1;
975
0
    while (idx_min <= idx_max) {
976
0
        idx = (idx_max + idx_min) / 2;
977
0
        v = unicode_decomp_table1[idx];
978
0
        code = v >> (32 - 18);
979
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
980
        //        printf("idx=%d code=%05x len=%d\n", idx, code, len);
981
0
        if (c < code) {
982
0
            idx_max = idx - 1;
983
0
        } else if (c >= code + len) {
984
0
            idx_min = idx + 1;
985
0
        } else {
986
0
            is_compat = v & 1;
987
0
            if (is_compat1 < is_compat)
988
0
                break;
989
0
            type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
990
0
            return unicode_decomp_entry(res, c, idx, code, len, type);
991
0
        }
992
0
    }
993
0
    return 0;
994
0
}
995
996
/* return 0 if no pair found */
997
static int unicode_compose_pair(uint32_t c0, uint32_t c1)
998
0
{
999
0
    uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
1000
0
    int idx_min, idx_max, idx, d;
1001
0
    uint32_t pair[2];
1002
1003
0
    idx_min = 0;
1004
0
    idx_max = countof(unicode_comp_table) - 1;
1005
0
    while (idx_min <= idx_max) {
1006
0
        idx = (idx_max + idx_min) / 2;
1007
0
        idx1 = unicode_comp_table[idx];
1008
1009
        /* idx1 represent an entry of the decomposition table */
1010
0
        d_idx = idx1 >> 6;
1011
0
        d_offset = idx1 & 0x3f;
1012
0
        v = unicode_decomp_table1[d_idx];
1013
0
        code = v >> (32 - 18);
1014
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
1015
0
        type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
1016
0
        ch = code + d_offset;
1017
0
        unicode_decomp_entry(pair, ch, d_idx, code, len, type);
1018
0
        d = c0 - pair[0];
1019
0
        if (d == 0)
1020
0
            d = c1 - pair[1];
1021
0
        if (d < 0) {
1022
0
            idx_max = idx - 1;
1023
0
        } else if (d > 0) {
1024
0
            idx_min = idx + 1;
1025
0
        } else {
1026
0
            return ch;
1027
0
        }
1028
0
    }
1029
0
    return 0;
1030
0
}
1031
1032
/* return the combining class of character c (between 0 and 255) */
1033
static int unicode_get_cc(uint32_t c)
1034
0
{
1035
0
    uint32_t code, n, type, cc, c1, b;
1036
0
    int pos;
1037
0
    const uint8_t *p;
1038
1039
0
    pos = get_index_pos(&code, c,
1040
0
                        unicode_cc_index, sizeof(unicode_cc_index) / 3);
1041
0
    if (pos < 0)
1042
0
        return 0;
1043
0
    p = unicode_cc_table + pos;
1044
    /* Compressed run length encoding:
1045
       - 2 high order bits are combining class type
1046
       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
1047
       - 00..2F: range length (add 1)
1048
       - 30..37: 3-bit range-length + 1 extra byte
1049
       - 38..3F: 3-bit range-length + 2 extra byte
1050
     */
1051
0
    for(;;) {
1052
0
        b = *p++;
1053
0
        type = b >> 6;
1054
0
        n = b & 0x3f;
1055
0
        if (n < 48) {
1056
0
        } else if (n < 56) {
1057
0
            n = (n - 48) << 8;
1058
0
            n |= *p++;
1059
0
            n += 48;
1060
0
        } else {
1061
0
            n = (n - 56) << 8;
1062
0
            n |= *p++ << 8;
1063
0
            n |= *p++;
1064
0
            n += 48 + (1 << 11);
1065
0
        }
1066
0
        if (type <= 1)
1067
0
            p++;
1068
0
        c1 = code + n + 1;
1069
0
        if (c < c1) {
1070
0
            switch(type) {
1071
0
            case 0:
1072
0
                cc = p[-1];
1073
0
                break;
1074
0
            case 1:
1075
0
                cc = p[-1] + c - code;
1076
0
                break;
1077
0
            case 2:
1078
0
                cc = 0;
1079
0
                break;
1080
0
            default:
1081
0
            case 3:
1082
0
                cc = 230;
1083
0
                break;
1084
0
            }
1085
0
            return cc;
1086
0
        }
1087
0
        code = c1;
1088
0
    }
1089
0
}
1090
1091
static void sort_cc(int *buf, int len)
1092
0
{
1093
0
    int i, j, k, cc, cc1, start, ch1;
1094
1095
0
    for(i = 0; i < len; i++) {
1096
0
        cc = unicode_get_cc(buf[i]);
1097
0
        if (cc != 0) {
1098
0
            start = i;
1099
0
            j = i + 1;
1100
0
            while (j < len) {
1101
0
                ch1 = buf[j];
1102
0
                cc1 = unicode_get_cc(ch1);
1103
0
                if (cc1 == 0)
1104
0
                    break;
1105
0
                k = j - 1;
1106
0
                while (k >= start) {
1107
0
                    if (unicode_get_cc(buf[k]) <= cc1)
1108
0
                        break;
1109
0
                    buf[k + 1] = buf[k];
1110
0
                    k--;
1111
0
                }
1112
0
                buf[k + 1] = ch1;
1113
0
                j++;
1114
0
            }
1115
#if 0
1116
            printf("cc:");
1117
            for(k = start; k < j; k++) {
1118
                printf(" %3d", unicode_get_cc(buf[k]));
1119
            }
1120
            printf("\n");
1121
#endif
1122
0
            i = j;
1123
0
        }
1124
0
    }
1125
0
}
1126
1127
static void to_nfd_rec(DynBuf *dbuf,
1128
                       const int *src, int src_len, int is_compat)
1129
0
{
1130
0
    uint32_t c, v;
1131
0
    int i, l;
1132
0
    uint32_t res[UNICODE_DECOMP_LEN_MAX];
1133
1134
0
    for(i = 0; i < src_len; i++) {
1135
0
        c = src[i];
1136
0
        if (c >= 0xac00 && c < 0xd7a4) {
1137
            /* Hangul decomposition */
1138
0
            c -= 0xac00;
1139
0
            dbuf_put_u32(dbuf, 0x1100 + c / 588);
1140
0
            dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
1141
0
            v = c % 28;
1142
0
            if (v != 0)
1143
0
                dbuf_put_u32(dbuf, 0x11a7 + v);
1144
0
        } else {
1145
0
            l = unicode_decomp_char(res, c, is_compat);
1146
0
            if (l) {
1147
0
                to_nfd_rec(dbuf, (int *)res, l, is_compat);
1148
0
            } else {
1149
0
                dbuf_put_u32(dbuf, c);
1150
0
            }
1151
0
        }
1152
0
    }
1153
0
}
1154
1155
/* return 0 if not found */
1156
static int compose_pair(uint32_t c0, uint32_t c1)
1157
0
{
1158
    /* Hangul composition */
1159
0
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
1160
0
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
1161
0
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
1162
0
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
1163
0
               (c0 - 0xac00) % 28 == 0 &&
1164
0
               c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
1165
0
        return c0 + c1 - 0x11a7;
1166
0
    } else {
1167
0
        return unicode_compose_pair(c0, c1);
1168
0
    }
1169
0
}
1170
1171
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
1172
                      UnicodeNormalizationEnum n_type,
1173
                      void *opaque, DynBufReallocFunc *realloc_func)
1174
0
{
1175
0
    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
1176
0
    BOOL is_compat;
1177
0
    DynBuf dbuf_s, *dbuf = &dbuf_s;
1178
1179
0
    is_compat = n_type >> 1;
1180
1181
0
    dbuf_init2(dbuf, opaque, realloc_func);
1182
0
    if (dbuf_claim(dbuf, sizeof(int) * src_len))
1183
0
        goto fail;
1184
1185
    /* common case: latin1 is unaffected by NFC */
1186
0
    if (n_type == UNICODE_NFC) {
1187
0
        for(i = 0; i < src_len; i++) {
1188
0
            if (src[i] >= 0x100)
1189
0
                goto not_latin1;
1190
0
        }
1191
0
        buf = (int *)dbuf->buf;
1192
0
        memcpy(buf, src, src_len * sizeof(int));
1193
0
        *pdst = (uint32_t *)buf;
1194
0
        return src_len;
1195
0
    not_latin1: ;
1196
0
    }
1197
1198
0
    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
1199
0
    if (dbuf_error(dbuf)) {
1200
0
    fail:
1201
0
        *pdst = NULL;
1202
0
        return -1;
1203
0
    }
1204
0
    buf = (int *)dbuf->buf;
1205
0
    buf_len = dbuf->size / sizeof(int);
1206
1207
0
    sort_cc(buf, buf_len);
1208
1209
0
    if (buf_len <= 1 || (n_type & 1) != 0) {
1210
        /* NFD / NFKD */
1211
0
        *pdst = (uint32_t *)buf;
1212
0
        return buf_len;
1213
0
    }
1214
1215
0
    i = 1;
1216
0
    out_len = 1;
1217
0
    while (i < buf_len) {
1218
        /* find the starter character and test if it is blocked from
1219
           the character at 'i' */
1220
0
        last_cc = unicode_get_cc(buf[i]);
1221
0
        starter_pos = out_len - 1;
1222
0
        while (starter_pos >= 0) {
1223
0
            cc = unicode_get_cc(buf[starter_pos]);
1224
0
            if (cc == 0)
1225
0
                break;
1226
0
            if (cc >= last_cc)
1227
0
                goto next;
1228
0
            last_cc = 256;
1229
0
            starter_pos--;
1230
0
        }
1231
0
        if (starter_pos >= 0 &&
1232
0
            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
1233
0
            buf[starter_pos] = p;
1234
0
            i++;
1235
0
        } else {
1236
0
        next:
1237
0
            buf[out_len++] = buf[i++];
1238
0
        }
1239
0
    }
1240
0
    *pdst = (uint32_t *)buf;
1241
0
    return out_len;
1242
0
}
1243
1244
/* char ranges for various unicode properties */
1245
1246
static int unicode_find_name(const char *name_table, const char *name)
1247
0
{
1248
0
    const char *p, *r;
1249
0
    int pos;
1250
0
    size_t name_len, len;
1251
1252
0
    p = name_table;
1253
0
    pos = 0;
1254
0
    name_len = strlen(name);
1255
0
    while (*p) {
1256
0
        for(;;) {
1257
0
            r = strchr(p, ',');
1258
0
            if (!r)
1259
0
                len = strlen(p);
1260
0
            else
1261
0
                len = r - p;
1262
0
            if (len == name_len && !memcmp(p, name, name_len))
1263
0
                return pos;
1264
0
            p += len + 1;
1265
0
            if (!r)
1266
0
                break;
1267
0
        }
1268
0
        pos++;
1269
0
    }
1270
0
    return -1;
1271
0
}
1272
1273
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1274
   if not found */
1275
int unicode_script(CharRange *cr,
1276
                   const char *script_name, BOOL is_ext)
1277
0
{
1278
0
    int script_idx;
1279
0
    const uint8_t *p, *p_end;
1280
0
    uint32_t c, c1, b, n, v, v_len, i, type;
1281
0
    CharRange cr1_s, *cr1;
1282
0
    CharRange cr2_s, *cr2 = &cr2_s;
1283
0
    BOOL is_common;
1284
1285
0
    script_idx = unicode_find_name(unicode_script_name_table, script_name);
1286
0
    if (script_idx < 0)
1287
0
        return -2;
1288
1289
0
    is_common = (script_idx == UNICODE_SCRIPT_Common ||
1290
0
                 script_idx == UNICODE_SCRIPT_Inherited);
1291
0
    if (is_ext) {
1292
0
        cr1 = &cr1_s;
1293
0
        cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1294
0
        cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1295
0
    } else {
1296
0
        cr1 = cr;
1297
0
    }
1298
1299
0
    p = unicode_script_table;
1300
0
    p_end = unicode_script_table + countof(unicode_script_table);
1301
0
    c = 0;
1302
0
    while (p < p_end) {
1303
0
        b = *p++;
1304
0
        type = b >> 7;
1305
0
        n = b & 0x7f;
1306
0
        if (n < 96) {
1307
0
        } else if (n < 112) {
1308
0
            n = (n - 96) << 8;
1309
0
            n |= *p++;
1310
0
            n += 96;
1311
0
        } else {
1312
0
            n = (n - 112) << 16;
1313
0
            n |= *p++ << 8;
1314
0
            n |= *p++;
1315
0
            n += 96 + (1 << 12);
1316
0
        }
1317
0
        c1 = c + n + 1;
1318
0
        if (type != 0) {
1319
0
            v = *p++;
1320
0
            if (v == script_idx || script_idx == UNICODE_SCRIPT_Unknown) {
1321
0
                if (cr_add_interval(cr1, c, c1))
1322
0
                    goto fail;
1323
0
            }
1324
0
        }
1325
0
        c = c1;
1326
0
    }
1327
0
    if (script_idx == UNICODE_SCRIPT_Unknown) {
1328
        /* Unknown is all the characters outside scripts */
1329
0
        if (cr_invert(cr1))
1330
0
            goto fail;
1331
0
    }
1332
1333
0
    if (is_ext) {
1334
        /* add the script extensions */
1335
0
        p = unicode_script_ext_table;
1336
0
        p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1337
0
        c = 0;
1338
0
        while (p < p_end) {
1339
0
            b = *p++;
1340
0
            if (b < 128) {
1341
0
                n = b;
1342
0
            } else if (b < 128 + 64) {
1343
0
                n = (b - 128) << 8;
1344
0
                n |= *p++;
1345
0
                n += 128;
1346
0
            } else {
1347
0
                n = (b - 128 - 64) << 16;
1348
0
                n |= *p++ << 8;
1349
0
                n |= *p++;
1350
0
                n += 128 + (1 << 14);
1351
0
            }
1352
0
            c1 = c + n + 1;
1353
0
            v_len = *p++;
1354
0
            if (is_common) {
1355
0
                if (v_len != 0) {
1356
0
                    if (cr_add_interval(cr2, c, c1))
1357
0
                        goto fail;
1358
0
                }
1359
0
            } else {
1360
0
                for(i = 0; i < v_len; i++) {
1361
0
                    if (p[i] == script_idx) {
1362
0
                        if (cr_add_interval(cr2, c, c1))
1363
0
                            goto fail;
1364
0
                        break;
1365
0
                    }
1366
0
                }
1367
0
            }
1368
0
            p += v_len;
1369
0
            c = c1;
1370
0
        }
1371
0
        if (is_common) {
1372
            /* remove all the characters with script extensions */
1373
0
            if (cr_invert(cr2))
1374
0
                goto fail;
1375
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1376
0
                      CR_OP_INTER))
1377
0
                goto fail;
1378
0
        } else {
1379
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1380
0
                      CR_OP_UNION))
1381
0
                goto fail;
1382
0
        }
1383
0
        cr_free(cr1);
1384
0
        cr_free(cr2);
1385
0
    }
1386
0
    return 0;
1387
0
 fail:
1388
0
    if (is_ext) {
1389
0
        cr_free(cr1);
1390
0
        cr_free(cr2);
1391
0
    }
1392
0
    goto fail;
1393
0
}
1394
1395
0
#define M(id) (1U << UNICODE_GC_ ## id)
1396
1397
static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1398
0
{
1399
0
    const uint8_t *p, *p_end;
1400
0
    uint32_t c, c0, b, n, v;
1401
1402
0
    p = unicode_gc_table;
1403
0
    p_end = unicode_gc_table + countof(unicode_gc_table);
1404
0
    c = 0;
1405
    /* Compressed range encoding:
1406
       initial byte:
1407
       bits 0..4: category number (special case 31)
1408
       bits 5..7: range length (add 1)
1409
       special case bits 5..7 == 7: read an extra byte
1410
       - 00..7F: range length (add 7 + 1)
1411
       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1412
       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1413
     */
1414
0
    while (p < p_end) {
1415
0
        b = *p++;
1416
0
        n = b >> 5;
1417
0
        v = b & 0x1f;
1418
0
        if (n == 7) {
1419
0
            n = *p++;
1420
0
            if (n < 128) {
1421
0
                n += 7;
1422
0
            } else if (n < 128 + 64) {
1423
0
                n = (n - 128) << 8;
1424
0
                n |= *p++;
1425
0
                n += 7 + 128;
1426
0
            } else {
1427
0
                n = (n - 128 - 64) << 16;
1428
0
                n |= *p++ << 8;
1429
0
                n |= *p++;
1430
0
                n += 7 + 128 + (1 << 14);
1431
0
            }
1432
0
        }
1433
0
        c0 = c;
1434
0
        c += n + 1;
1435
0
        if (v == 31) {
1436
            /* run of Lu / Ll */
1437
0
            b = gc_mask & (M(Lu) | M(Ll));
1438
0
            if (b != 0) {
1439
0
                if (b == (M(Lu) | M(Ll))) {
1440
0
                    goto add_range;
1441
0
                } else {
1442
0
                    c0 += ((gc_mask & M(Ll)) != 0);
1443
0
                    for(; c0 < c; c0 += 2) {
1444
0
                        if (cr_add_interval(cr, c0, c0 + 1))
1445
0
                            return -1;
1446
0
                    }
1447
0
                }
1448
0
            }
1449
0
        } else if ((gc_mask >> v) & 1) {
1450
0
        add_range:
1451
0
            if (cr_add_interval(cr, c0, c))
1452
0
                return -1;
1453
0
        }
1454
0
    }
1455
0
    return 0;
1456
0
}
1457
1458
static int unicode_prop1(CharRange *cr, int prop_idx)
1459
0
{
1460
0
    const uint8_t *p, *p_end;
1461
0
    uint32_t c, c0, b, bit;
1462
1463
0
    p = unicode_prop_table[prop_idx];
1464
0
    p_end = p + unicode_prop_len_table[prop_idx];
1465
0
    c = 0;
1466
0
    bit = 0;
1467
    /* Compressed range encoding:
1468
       00..3F: 2 packed lengths: 3-bit + 3-bit
1469
       40..5F: 5-bits plus extra byte for length
1470
       60..7F: 5-bits plus 2 extra bytes for length
1471
       80..FF: 7-bit length
1472
       lengths must be incremented to get character count
1473
       Ranges alternate between false and true return value.
1474
     */
1475
0
    while (p < p_end) {
1476
0
        c0 = c;
1477
0
        b = *p++;
1478
0
        if (b < 64) {
1479
0
            c += (b >> 3) + 1;
1480
0
            if (bit)  {
1481
0
                if (cr_add_interval(cr, c0, c))
1482
0
                    return -1;
1483
0
            }
1484
0
            bit ^= 1;
1485
0
            c0 = c;
1486
0
            c += (b & 7) + 1;
1487
0
        } else if (b >= 0x80) {
1488
0
            c += b - 0x80 + 1;
1489
0
        } else if (b < 0x60) {
1490
0
            c += (((b - 0x40) << 8) | p[0]) + 1;
1491
0
            p++;
1492
0
        } else {
1493
0
            c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
1494
0
            p += 2;
1495
0
        }
1496
0
        if (bit)  {
1497
0
            if (cr_add_interval(cr, c0, c))
1498
0
                return -1;
1499
0
        }
1500
0
        bit ^= 1;
1501
0
    }
1502
0
    return 0;
1503
0
}
1504
1505
typedef enum {
1506
    POP_GC,
1507
    POP_PROP,
1508
    POP_CASE,
1509
    POP_UNION,
1510
    POP_INTER,
1511
    POP_XOR,
1512
    POP_INVERT,
1513
    POP_END,
1514
} PropOPEnum;
1515
1516
#define POP_STACK_LEN_MAX 4
1517
1518
static int unicode_prop_ops(CharRange *cr, ...)
1519
0
{
1520
0
    va_list ap;
1521
0
    CharRange stack[POP_STACK_LEN_MAX];
1522
0
    int stack_len, op, ret, i;
1523
0
    uint32_t a;
1524
1525
0
    va_start(ap, cr);
1526
0
    stack_len = 0;
1527
0
    for(;;) {
1528
0
        op = va_arg(ap, int);
1529
0
        switch(op) {
1530
0
        case POP_GC:
1531
0
            assert(stack_len < POP_STACK_LEN_MAX);
1532
0
            a = va_arg(ap, int);
1533
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1534
0
            if (unicode_general_category1(&stack[stack_len - 1], a))
1535
0
                goto fail;
1536
0
            break;
1537
0
        case POP_PROP:
1538
0
            assert(stack_len < POP_STACK_LEN_MAX);
1539
0
            a = va_arg(ap, int);
1540
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1541
0
            if (unicode_prop1(&stack[stack_len - 1], a))
1542
0
                goto fail;
1543
0
            break;
1544
0
        case POP_CASE:
1545
0
            assert(stack_len < POP_STACK_LEN_MAX);
1546
0
            a = va_arg(ap, int);
1547
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1548
0
            if (unicode_case1(&stack[stack_len - 1], a))
1549
0
                goto fail;
1550
0
            break;
1551
0
        case POP_UNION:
1552
0
        case POP_INTER:
1553
0
        case POP_XOR:
1554
0
            {
1555
0
                CharRange *cr1, *cr2, *cr3;
1556
0
                assert(stack_len >= 2);
1557
0
                assert(stack_len < POP_STACK_LEN_MAX);
1558
0
                cr1 = &stack[stack_len - 2];
1559
0
                cr2 = &stack[stack_len - 1];
1560
0
                cr3 = &stack[stack_len++];
1561
0
                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1562
                /* CR_OP_XOR may be used here */
1563
0
                if (cr_op(cr3, cr1->points, cr1->len,
1564
0
                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
1565
0
                    goto fail;
1566
0
                cr_free(cr1);
1567
0
                cr_free(cr2);
1568
0
                *cr1 = *cr3;
1569
0
                stack_len -= 2;
1570
0
            }
1571
0
            break;
1572
0
        case POP_INVERT:
1573
0
            assert(stack_len >= 1);
1574
0
            if (cr_invert(&stack[stack_len - 1]))
1575
0
                goto fail;
1576
0
            break;
1577
0
        case POP_END:
1578
0
            goto done;
1579
0
        default:
1580
0
            abort();
1581
0
        }
1582
0
    }
1583
0
 done:
1584
0
    assert(stack_len == 1);
1585
0
    ret = cr_copy(cr, &stack[0]);
1586
0
    cr_free(&stack[0]);
1587
0
    return ret;
1588
0
 fail:
1589
0
    for(i = 0; i < stack_len; i++)
1590
0
        cr_free(&stack[i]);
1591
0
    return -1;
1592
0
}
1593
1594
static const uint32_t unicode_gc_mask_table[] = {
1595
    M(Lu) | M(Ll) | M(Lt), /* LC */
1596
    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
1597
    M(Mn) | M(Mc) | M(Me), /* M */
1598
    M(Nd) | M(Nl) | M(No), /* N */
1599
    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
1600
    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
1601
    M(Zs) | M(Zl) | M(Zp), /* Z */
1602
    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
1603
};
1604
1605
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1606
   if not found */
1607
int unicode_general_category(CharRange *cr, const char *gc_name)
1608
0
{
1609
0
    int gc_idx;
1610
0
    uint32_t gc_mask;
1611
1612
0
    gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1613
0
    if (gc_idx < 0)
1614
0
        return -2;
1615
0
    if (gc_idx <= UNICODE_GC_Co) {
1616
0
        gc_mask = (uint64_t)1 << gc_idx;
1617
0
    } else {
1618
0
        gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1619
0
    }
1620
0
    return unicode_general_category1(cr, gc_mask);
1621
0
}
1622
1623
1624
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1625
   if not found */
1626
int unicode_prop(CharRange *cr, const char *prop_name)
1627
0
{
1628
0
    int prop_idx, ret;
1629
1630
0
    prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1631
0
    if (prop_idx < 0)
1632
0
        return -2;
1633
0
    prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1634
1635
0
    ret = 0;
1636
0
    switch(prop_idx) {
1637
0
    case UNICODE_PROP_ASCII:
1638
0
        if (cr_add_interval(cr, 0x00, 0x7f + 1))
1639
0
            return -1;
1640
0
        break;
1641
0
    case UNICODE_PROP_Any:
1642
0
        if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1643
0
            return -1;
1644
0
        break;
1645
0
    case UNICODE_PROP_Assigned:
1646
0
        ret = unicode_prop_ops(cr,
1647
0
                               POP_GC, M(Cn),
1648
0
                               POP_INVERT,
1649
0
                               POP_END);
1650
0
        break;
1651
0
    case UNICODE_PROP_Math:
1652
0
        ret = unicode_prop_ops(cr,
1653
0
                               POP_GC, M(Sm),
1654
0
                               POP_PROP, UNICODE_PROP_Other_Math,
1655
0
                               POP_UNION,
1656
0
                               POP_END);
1657
0
        break;
1658
0
    case UNICODE_PROP_Lowercase:
1659
0
        ret = unicode_prop_ops(cr,
1660
0
                               POP_GC, M(Ll),
1661
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1662
0
                               POP_UNION,
1663
0
                               POP_END);
1664
0
        break;
1665
0
    case UNICODE_PROP_Uppercase:
1666
0
        ret = unicode_prop_ops(cr,
1667
0
                               POP_GC, M(Lu),
1668
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1669
0
                               POP_UNION,
1670
0
                               POP_END);
1671
0
        break;
1672
0
    case UNICODE_PROP_Cased:
1673
0
        ret = unicode_prop_ops(cr,
1674
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt),
1675
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1676
0
                               POP_UNION,
1677
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1678
0
                               POP_UNION,
1679
0
                               POP_END);
1680
0
        break;
1681
0
    case UNICODE_PROP_Alphabetic:
1682
0
        ret = unicode_prop_ops(cr,
1683
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1684
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1685
0
                               POP_UNION,
1686
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1687
0
                               POP_UNION,
1688
0
                               POP_PROP, UNICODE_PROP_Other_Alphabetic,
1689
0
                               POP_UNION,
1690
0
                               POP_END);
1691
0
        break;
1692
0
    case UNICODE_PROP_Grapheme_Base:
1693
0
        ret = unicode_prop_ops(cr,
1694
0
                               POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1695
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1696
0
                               POP_UNION,
1697
0
                               POP_INVERT,
1698
0
                               POP_END);
1699
0
        break;
1700
0
    case UNICODE_PROP_Grapheme_Extend:
1701
0
        ret = unicode_prop_ops(cr,
1702
0
                               POP_GC, M(Me) | M(Mn),
1703
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1704
0
                               POP_UNION,
1705
0
                               POP_END);
1706
0
        break;
1707
0
    case UNICODE_PROP_XID_Start:
1708
0
        ret = unicode_prop_ops(cr,
1709
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1710
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1711
0
                               POP_UNION,
1712
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1713
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1714
0
                               POP_UNION,
1715
0
                               POP_PROP, UNICODE_PROP_XID_Start1,
1716
0
                               POP_UNION,
1717
0
                               POP_INVERT,
1718
0
                               POP_INTER,
1719
0
                               POP_END);
1720
0
        break;
1721
0
    case UNICODE_PROP_XID_Continue:
1722
0
        ret = unicode_prop_ops(cr,
1723
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1724
0
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1725
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1726
0
                               POP_UNION,
1727
0
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1728
0
                               POP_UNION,
1729
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1730
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1731
0
                               POP_UNION,
1732
0
                               POP_PROP, UNICODE_PROP_XID_Continue1,
1733
0
                               POP_UNION,
1734
0
                               POP_INVERT,
1735
0
                               POP_INTER,
1736
0
                               POP_END);
1737
0
        break;
1738
0
    case UNICODE_PROP_Changes_When_Uppercased:
1739
0
        ret = unicode_case1(cr, CASE_U);
1740
0
        break;
1741
0
    case UNICODE_PROP_Changes_When_Lowercased:
1742
0
        ret = unicode_case1(cr, CASE_L);
1743
0
        break;
1744
0
    case UNICODE_PROP_Changes_When_Casemapped:
1745
0
        ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1746
0
        break;
1747
0
    case UNICODE_PROP_Changes_When_Titlecased:
1748
0
        ret = unicode_prop_ops(cr,
1749
0
                               POP_CASE, CASE_U,
1750
0
                               POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1751
0
                               POP_XOR,
1752
0
                               POP_END);
1753
0
        break;
1754
0
    case UNICODE_PROP_Changes_When_Casefolded:
1755
0
        ret = unicode_prop_ops(cr,
1756
0
                               POP_CASE, CASE_F,
1757
0
                               POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1758
0
                               POP_XOR,
1759
0
                               POP_END);
1760
0
        break;
1761
0
    case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1762
0
        ret = unicode_prop_ops(cr,
1763
0
                               POP_CASE, CASE_F,
1764
0
                               POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1765
0
                               POP_XOR,
1766
0
                               POP_END);
1767
0
        break;
1768
#if 0
1769
    case UNICODE_PROP_ID_Start:
1770
        ret = unicode_prop_ops(cr,
1771
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1772
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1773
                               POP_UNION,
1774
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1775
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1776
                               POP_UNION,
1777
                               POP_INVERT,
1778
                               POP_INTER,
1779
                               POP_END);
1780
        break;
1781
    case UNICODE_PROP_ID_Continue:
1782
        ret = unicode_prop_ops(cr,
1783
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1784
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1785
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1786
                               POP_UNION,
1787
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1788
                               POP_UNION,
1789
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1790
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1791
                               POP_UNION,
1792
                               POP_INVERT,
1793
                               POP_INTER,
1794
                               POP_END);
1795
        break;
1796
    case UNICODE_PROP_Case_Ignorable:
1797
        ret = unicode_prop_ops(cr,
1798
                               POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1799
                               POP_PROP, UNICODE_PROP_Case_Ignorable1,
1800
                               POP_XOR,
1801
                               POP_END);
1802
        break;
1803
#else
1804
        /* we use the existing tables */
1805
0
    case UNICODE_PROP_ID_Continue:
1806
0
        ret = unicode_prop_ops(cr,
1807
0
                               POP_PROP, UNICODE_PROP_ID_Start,
1808
0
                               POP_PROP, UNICODE_PROP_ID_Continue1,
1809
0
                               POP_XOR,
1810
0
                               POP_END);
1811
0
        break;
1812
0
#endif
1813
0
    default:
1814
0
        if (prop_idx >= countof(unicode_prop_table))
1815
0
            return -2;
1816
0
        ret = unicode_prop1(cr, prop_idx);
1817
0
        break;
1818
0
    }
1819
0
    return ret;
1820
0
}
1821
1822
#endif /* CONFIG_ALL_UNICODE */
1823
1824
/*---- lre codepoint categorizing functions ----*/
1825
1826
#define S  UNICODE_C_SPACE
1827
#define D  UNICODE_C_DIGIT
1828
#define X  UNICODE_C_XDIGIT
1829
#define U  UNICODE_C_UPPER
1830
#define L  UNICODE_C_LOWER
1831
#define _  UNICODE_C_UNDER
1832
#define d  UNICODE_C_DOLLAR
1833
1834
uint8_t const lre_ctype_bits[256] = {
1835
    0, 0, 0, 0, 0, 0, 0, 0,
1836
    0, S, S, S, S, S, 0, 0,
1837
    0, 0, 0, 0, 0, 0, 0, 0,
1838
    0, 0, 0, 0, 0, 0, 0, 0,
1839
1840
    S, 0, 0, 0, d, 0, 0, 0,
1841
    0, 0, 0, 0, 0, 0, 0, 0,
1842
    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1843
    X|D, X|D, 0, 0, 0, 0, 0, 0,
1844
1845
    0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1846
    U, U, U, U, U, U, U, U,
1847
    U, U, U, U, U, U, U, U,
1848
    U, U, U, 0, 0, 0, 0, _,
1849
1850
    0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1851
    L, L, L, L, L, L, L, L,
1852
    L, L, L, L, L, L, L, L,
1853
    L, L, L, 0, 0, 0, 0, 0,
1854
1855
    0, 0, 0, 0, 0, 0, 0, 0,
1856
    0, 0, 0, 0, 0, 0, 0, 0,
1857
    0, 0, 0, 0, 0, 0, 0, 0,
1858
    0, 0, 0, 0, 0, 0, 0, 0,
1859
1860
    S, 0, 0, 0, 0, 0, 0, 0,
1861
    0, 0, 0, 0, 0, 0, 0, 0,
1862
    0, 0, 0, 0, 0, 0, 0, 0,
1863
    0, 0, 0, 0, 0, 0, 0, 0,
1864
1865
    0, 0, 0, 0, 0, 0, 0, 0,
1866
    0, 0, 0, 0, 0, 0, 0, 0,
1867
    0, 0, 0, 0, 0, 0, 0, 0,
1868
    0, 0, 0, 0, 0, 0, 0, 0,
1869
1870
    0, 0, 0, 0, 0, 0, 0, 0,
1871
    0, 0, 0, 0, 0, 0, 0, 0,
1872
    0, 0, 0, 0, 0, 0, 0, 0,
1873
    0, 0, 0, 0, 0, 0, 0, 0,
1874
};
1875
1876
#undef S
1877
#undef D
1878
#undef X
1879
#undef U
1880
#undef L
1881
#undef _
1882
#undef d
1883
1884
/* code point ranges for Zs,Zl or Zp property */
1885
static const uint16_t char_range_s[] = {
1886
    10,
1887
    0x0009, 0x000D + 1,
1888
    0x0020, 0x0020 + 1,
1889
    0x00A0, 0x00A0 + 1,
1890
    0x1680, 0x1680 + 1,
1891
    0x2000, 0x200A + 1,
1892
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
1893
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
1894
    0x2028, 0x2029 + 1,
1895
    0x202F, 0x202F + 1,
1896
    0x205F, 0x205F + 1,
1897
    0x3000, 0x3000 + 1,
1898
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
1899
    0xFEFF, 0xFEFF + 1,
1900
};
1901
1902
BOOL lre_is_space_non_ascii(uint32_t c)
1903
6
{
1904
6
    size_t i, n;
1905
1906
6
    n = countof(char_range_s);
1907
36
    for(i = 5; i < n; i += 2) {
1908
36
        uint32_t low = char_range_s[i];
1909
36
        uint32_t high = char_range_s[i + 1];
1910
36
        if (c < low)
1911
0
            return FALSE;
1912
36
        if (c < high)
1913
6
            return TRUE;
1914
36
    }
1915
0
    return FALSE;
1916
6
}
1917
1918
#define SEQ_MAX_LEN 16
1919
1920
static int unicode_sequence_prop1(int seq_prop_idx, UnicodeSequencePropCB *cb, void *opaque,
1921
                                  CharRange *cr)
1922
0
{
1923
0
    int i, c, j;
1924
0
    uint32_t seq[SEQ_MAX_LEN];
1925
    
1926
0
    switch(seq_prop_idx) {
1927
0
    case UNICODE_SEQUENCE_PROP_Basic_Emoji:
1928
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji1) < 0)
1929
0
            return -1;
1930
0
        for(i = 0; i < cr->len; i += 2) {
1931
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1932
0
                seq[0] = c;
1933
0
                cb(opaque, seq, 1);
1934
0
            }
1935
0
        }
1936
1937
0
        cr->len = 0;
1938
1939
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji2) < 0)
1940
0
            return -1;
1941
0
        for(i = 0; i < cr->len; i += 2) {
1942
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1943
0
                seq[0] = c;
1944
0
                seq[1] = 0xfe0f;
1945
0
                cb(opaque, seq, 2);
1946
0
            }
1947
0
        }
1948
1949
0
        break;
1950
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
1951
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Modifier_Base) < 0)
1952
0
            return -1;
1953
0
        for(i = 0; i < cr->len; i += 2) {
1954
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1955
0
                for(j = 0; j < 5; j++) {
1956
0
                    seq[0] = c;
1957
0
                    seq[1] = 0x1f3fb + j;
1958
0
                    cb(opaque, seq, 2);
1959
0
                }
1960
0
            }
1961
0
        }
1962
0
        break;
1963
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
1964
0
        if (unicode_prop1(cr, UNICODE_PROP_RGI_Emoji_Flag_Sequence) < 0)
1965
0
            return -1;
1966
0
        for(i = 0; i < cr->len; i += 2) {
1967
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1968
0
                int c0, c1;
1969
0
                c0 = c / 26;
1970
0
                c1 = c % 26;
1971
0
                seq[0] = 0x1F1E6 + c0;
1972
0
                seq[1] = 0x1F1E6 + c1;
1973
0
                cb(opaque, seq, 2);
1974
0
            }
1975
0
        }
1976
0
        break;
1977
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
1978
0
        {
1979
0
            int len, code, pres, k, mod, mod_count, mod_pos[2], hc_pos, n_mod, n_hc, mod1;
1980
0
            int mod_idx, hc_idx, i0, i1;
1981
0
            const uint8_t *tab = unicode_rgi_emoji_zwj_sequence;
1982
            
1983
0
            for(i = 0; i < countof(unicode_rgi_emoji_zwj_sequence);) {
1984
0
                len = tab[i++];
1985
0
                k = 0;
1986
0
                mod = 0;
1987
0
                mod_count = 0;
1988
0
                hc_pos = -1;
1989
0
                for(j = 0; j < len; j++) {
1990
0
                    code = tab[i++];
1991
0
                    code |= tab[i++] << 8;
1992
0
                    pres = code >> 15;
1993
0
                    mod1 = (code >> 13) & 3;
1994
0
                    code &= 0x1fff;
1995
0
                    if (code < 0x1000) {
1996
0
                        c = code + 0x2000;
1997
0
                    } else {
1998
0
                        c = 0x1f000 + (code - 0x1000);
1999
0
                    }
2000
0
                    if (c == 0x1f9b0)
2001
0
                        hc_pos = k;
2002
0
                    seq[k++] = c;
2003
0
                    if (mod1 != 0) {
2004
0
                        assert(mod_count < 2);
2005
0
                        mod = mod1;
2006
0
                        mod_pos[mod_count++] = k;
2007
0
                        seq[k++] = 0; /* will be filled later */
2008
0
                    }
2009
0
                    if (pres) {
2010
0
                        seq[k++] = 0xfe0f;
2011
0
                    }
2012
0
                    if (j < len - 1) {
2013
0
                        seq[k++] = 0x200d;
2014
0
                    }
2015
0
                }
2016
2017
                /* genrate all the variants */
2018
0
                switch(mod) {
2019
0
                case 1:
2020
0
                    n_mod = 5;
2021
0
                    break;
2022
0
                case 2:
2023
0
                    n_mod = 25;
2024
0
                    break;
2025
0
                case 3:
2026
0
                    n_mod = 20;
2027
0
                    break;
2028
0
                default:
2029
0
                    n_mod = 1;
2030
0
                    break;
2031
0
                }
2032
0
                if (hc_pos >= 0)
2033
0
                    n_hc = 4;
2034
0
                else
2035
0
                    n_hc = 1;
2036
0
                for(hc_idx = 0; hc_idx < n_hc; hc_idx++) {
2037
0
                    for(mod_idx = 0; mod_idx < n_mod; mod_idx++) {
2038
0
                        if (hc_pos >= 0)
2039
0
                            seq[hc_pos] = 0x1f9b0 + hc_idx;
2040
                        
2041
0
                        switch(mod) {
2042
0
                        case 1:
2043
0
                            seq[mod_pos[0]] = 0x1f3fb + mod_idx;
2044
0
                            break;
2045
0
                        case 2:
2046
0
                        case 3:
2047
0
                            i0 = mod_idx / 5;
2048
0
                            i1 = mod_idx % 5;
2049
                            /* avoid identical values */
2050
0
                            if (mod == 3 && i0 >= i1)
2051
0
                                i0++;
2052
0
                            seq[mod_pos[0]] = 0x1f3fb + i0;
2053
0
                            seq[mod_pos[1]] = 0x1f3fb + i1;
2054
0
                            break;
2055
0
                        default:
2056
0
                            break;
2057
0
                        }
2058
#if 0
2059
                        for(j = 0; j < k; j++)
2060
                            printf(" %04x", seq[j]);
2061
                        printf("\n");
2062
#endif                
2063
0
                        cb(opaque, seq, k);
2064
0
                    }
2065
0
                }
2066
0
            }
2067
0
        }
2068
0
        break;
2069
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
2070
0
        {
2071
0
            for(i = 0; i < countof(unicode_rgi_emoji_tag_sequence);) {
2072
0
                j = 0;
2073
0
                seq[j++] = 0x1F3F4;
2074
0
                for(;;) {
2075
0
                    c = unicode_rgi_emoji_tag_sequence[i++];
2076
0
                    if (c == 0x00)
2077
0
                        break;
2078
0
                    seq[j++] = 0xe0000 + c;
2079
0
                }
2080
0
                seq[j++] = 0xe007f;
2081
0
                cb(opaque, seq, j);
2082
0
            }
2083
0
        }
2084
0
        break;
2085
0
    case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence:
2086
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Keycap_Sequence) < 0)
2087
0
            return -1;
2088
0
        for(i = 0; i < cr->len; i += 2) {
2089
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
2090
0
                seq[0] = c;
2091
0
                seq[1] = 0xfe0f;
2092
0
                seq[2] = 0x20e3;
2093
0
                cb(opaque, seq, 3);
2094
0
            }
2095
0
        }
2096
0
        break;
2097
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji:
2098
        /* all prevous sequences */
2099
0
        for(i = UNICODE_SEQUENCE_PROP_Basic_Emoji; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence; i++) {
2100
0
            int ret;
2101
0
            ret = unicode_sequence_prop1(i, cb, opaque, cr);
2102
0
            if (ret < 0)
2103
0
                return ret;
2104
0
            cr->len = 0;
2105
0
        }
2106
0
        break;
2107
0
    default:
2108
0
        return -2;
2109
0
    }
2110
0
    return 0;
2111
0
}
2112
2113
/* build a unicode sequence property */
2114
/* return -2 if not found, -1 if other error. 'cr' is used as temporary memory. */
2115
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
2116
                          CharRange *cr)
2117
0
{
2118
0
    int seq_prop_idx;
2119
0
    seq_prop_idx = unicode_find_name(unicode_sequence_prop_name_table, prop_name);
2120
0
    if (seq_prop_idx < 0)
2121
0
        return -2;
2122
0
    return unicode_sequence_prop1(seq_prop_idx, cb, opaque, cr);
2123
0
}