Coverage Report

Created: 2025-10-13 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/quickjs/libunicode.c
Line
Count
Source
1
/*
2
 * Unicode utilities
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <string.h>
28
#include <assert.h>
29
30
#include "cutils.h"
31
#include "libunicode.h"
32
#include "libunicode-table.h"
33
34
enum {
35
    RUN_TYPE_U,
36
    RUN_TYPE_L,
37
    RUN_TYPE_UF,
38
    RUN_TYPE_LF,
39
    RUN_TYPE_UL,
40
    RUN_TYPE_LSU,
41
    RUN_TYPE_U2L_399_EXT2,
42
    RUN_TYPE_UF_D20,
43
    RUN_TYPE_UF_D1_EXT,
44
    RUN_TYPE_U_EXT,
45
    RUN_TYPE_LF_EXT,
46
    RUN_TYPE_UF_EXT2,
47
    RUN_TYPE_LF_EXT2,
48
    RUN_TYPE_UF_EXT3,
49
};
50
51
static int lre_case_conv1(uint32_t c, int conv_type)
52
0
{
53
0
    uint32_t res[LRE_CC_RES_LEN_MAX];
54
0
    lre_case_conv(res, c, conv_type);
55
0
    return res[0];
56
0
}
57
58
/* case conversion using the table entry 'idx' with value 'v' */
59
static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
60
489k
{
61
489k
    uint32_t code, data, type, a, is_lower;
62
489k
    is_lower = (conv_type != 0);
63
489k
    type = (v >> (32 - 17 - 7 - 4)) & 0xf;
64
489k
    data = ((v & 0xf) << 8) | case_conv_table2[idx];
65
489k
    code = v >> (32 - 17);
66
489k
    switch(type) {
67
193k
    case RUN_TYPE_U:
68
193k
    case RUN_TYPE_L:
69
217k
    case RUN_TYPE_UF:
70
218k
    case RUN_TYPE_LF:
71
218k
        if (conv_type == (type & 1) ||
72
217k
            (type >= RUN_TYPE_UF && conv_type == 2)) {
73
217k
            c = c - code + (case_conv_table1[data] >> (32 - 17));
74
217k
        }
75
218k
        break;
76
220k
    case RUN_TYPE_UL:
77
220k
        a = c - code;
78
220k
        if ((a & 1) != (1 - is_lower))
79
216
            break;
80
220k
        c = (a ^ 1) + code;
81
220k
        break;
82
5.46k
    case RUN_TYPE_LSU:
83
5.46k
        a = c - code;
84
5.46k
        if (a == 1) {
85
2.63k
            c += 2 * is_lower - 1;
86
2.83k
        } else if (a == (1 - is_lower) * 2) {
87
2.63k
            c += (2 * is_lower - 1) * 2;
88
2.63k
        }
89
5.46k
        break;
90
9.43k
    case RUN_TYPE_U2L_399_EXT2:
91
9.43k
        if (!is_lower) {
92
9.43k
            res[0] = c - code + case_conv_ext[data >> 6];
93
9.43k
            res[1] = 0x399;
94
9.43k
            return 2;
95
9.43k
        } else {
96
0
            c = c - code + case_conv_ext[data & 0x3f];
97
0
        }
98
0
        break;
99
9.09k
    case RUN_TYPE_UF_D20:
100
9.09k
        if (conv_type == 1)
101
0
            break;
102
9.09k
        c = data + (conv_type == 2) * 0x20;
103
9.09k
        break;
104
1.07k
    case RUN_TYPE_UF_D1_EXT:
105
1.07k
        if (conv_type == 1)
106
0
            break;
107
1.07k
        c = case_conv_ext[data] + (conv_type == 2);
108
1.07k
        break;
109
721
    case RUN_TYPE_U_EXT:
110
919
    case RUN_TYPE_LF_EXT:
111
919
        if (is_lower != (type - RUN_TYPE_U_EXT))
112
198
            break;
113
721
        c = case_conv_ext[data];
114
721
        break;
115
202
    case RUN_TYPE_LF_EXT2:
116
202
        if (!is_lower)
117
202
            break;
118
0
        res[0] = c - code + case_conv_ext[data >> 6];
119
0
        res[1] = case_conv_ext[data & 0x3f];
120
0
        return 2;
121
18.5k
    case RUN_TYPE_UF_EXT2:
122
18.5k
        if (conv_type == 1)
123
0
            break;
124
18.5k
        res[0] = c - code + case_conv_ext[data >> 6];
125
18.5k
        res[1] = case_conv_ext[data & 0x3f];
126
18.5k
        if (conv_type == 2) {
127
            /* convert to lower */
128
0
            res[0] = lre_case_conv1(res[0], 1);
129
0
            res[1] = lre_case_conv1(res[1], 1);
130
0
        }
131
18.5k
        return 2;
132
0
    default:
133
5.30k
    case RUN_TYPE_UF_EXT3:
134
5.30k
        if (conv_type == 1)
135
0
            break;
136
5.30k
        res[0] = case_conv_ext[data >> 8];
137
5.30k
        res[1] = case_conv_ext[(data >> 4) & 0xf];
138
5.30k
        res[2] = case_conv_ext[data & 0xf];
139
5.30k
        if (conv_type == 2) {
140
            /* convert to lower */
141
0
            res[0] = lre_case_conv1(res[0], 1);
142
0
            res[1] = lre_case_conv1(res[1], 1);
143
0
            res[2] = lre_case_conv1(res[2], 1);
144
0
        }
145
5.30k
        return 3;
146
489k
    }
147
455k
    res[0] = c;
148
455k
    return 1;
149
489k
}
150
151
/* conv_type:
152
   0 = to upper
153
   1 = to lower
154
   2 = case folding (= to lower with modifications)
155
*/
156
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
157
0
{
158
0
    if (c < 128) {
159
0
        if (conv_type) {
160
0
            if (c >= 'A' && c <= 'Z') {
161
0
                c = c - 'A' + 'a';
162
0
            }
163
0
        } else {
164
0
            if (c >= 'a' && c <= 'z') {
165
0
                c = c - 'a' + 'A';
166
0
            }
167
0
        }
168
0
    } else {
169
0
        uint32_t v, code, len;
170
0
        int idx, idx_min, idx_max;
171
172
0
        idx_min = 0;
173
0
        idx_max = countof(case_conv_table1) - 1;
174
0
        while (idx_min <= idx_max) {
175
0
            idx = (unsigned)(idx_max + idx_min) / 2;
176
0
            v = case_conv_table1[idx];
177
0
            code = v >> (32 - 17);
178
0
            len = (v >> (32 - 17 - 7)) & 0x7f;
179
0
            if (c < code) {
180
0
                idx_max = idx - 1;
181
0
            } else if (c >= code + len) {
182
0
                idx_min = idx + 1;
183
0
            } else {
184
0
                return lre_case_conv_entry(res, c, conv_type, idx, v);
185
0
            }
186
0
        }
187
0
    }
188
0
    res[0] = c;
189
0
    return 1;
190
0
}
191
192
static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
193
511k
{
194
511k
    uint32_t res[LRE_CC_RES_LEN_MAX];
195
511k
    int len;
196
197
511k
    if (is_unicode) {
198
0
        len = lre_case_conv_entry(res, c, 2, idx, v);
199
0
        if (len == 1) {
200
0
            c = res[0];
201
0
        } else {
202
            /* handle the few specific multi-character cases (see
203
               unicode_gen.c:dump_case_folding_special_cases()) */
204
0
            if (c == 0xfb06) {
205
0
                c = 0xfb05;
206
0
            } else if (c == 0x01fd3) {
207
0
                c = 0x390;
208
0
            } else if (c == 0x01fe3) {
209
0
                c = 0x3b0;
210
0
            }
211
0
        }
212
511k
    } else {
213
511k
        if (likely(c < 128)) {
214
22.5k
            if (c >= 'a' && c <= 'z')
215
22.5k
                c = c - 'a' + 'A';
216
489k
        } else {
217
            /* legacy regexp: to upper case if single char >= 128 */
218
489k
            len = lre_case_conv_entry(res, c, FALSE, idx, v);
219
489k
            if (len == 1 && res[0] >= 128)
220
454k
                c = res[0];
221
489k
        }
222
511k
    }
223
511k
    return c;
224
511k
}
225
226
/* JS regexp specific rules for case folding */
227
int lre_canonicalize(uint32_t c, BOOL is_unicode)
228
12.1k
{
229
12.1k
    if (c < 128) {
230
        /* fast case */
231
9.45k
        if (is_unicode) {
232
0
            if (c >= 'A' && c <= 'Z') {
233
0
                c = c - 'A' + 'a';
234
0
            }
235
9.45k
        } else {
236
9.45k
            if (c >= 'a' && c <= 'z') {
237
962
                c = c - 'a' + 'A';
238
962
            }
239
9.45k
        }
240
9.45k
    } else {
241
2.70k
        uint32_t v, code, len;
242
2.70k
        int idx, idx_min, idx_max;
243
244
2.70k
        idx_min = 0;
245
2.70k
        idx_max = countof(case_conv_table1) - 1;
246
20.8k
        while (idx_min <= idx_max) {
247
20.4k
            idx = (unsigned)(idx_max + idx_min) / 2;
248
20.4k
            v = case_conv_table1[idx];
249
20.4k
            code = v >> (32 - 17);
250
20.4k
            len = (v >> (32 - 17 - 7)) & 0x7f;
251
20.4k
            if (c < code) {
252
13.3k
                idx_max = idx - 1;
253
13.3k
            } else if (c >= code + len) {
254
4.76k
                idx_min = idx + 1;
255
4.76k
            } else {
256
2.29k
                return lre_case_folding_entry(c, idx, v, is_unicode);
257
2.29k
            }
258
20.4k
        }
259
2.70k
    }
260
9.87k
    return c;
261
12.1k
}
262
263
static uint32_t get_le24(const uint8_t *ptr)
264
85.7k
{
265
85.7k
    return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
266
85.7k
}
267
268
9.64k
#define UNICODE_INDEX_BLOCK_LEN 32
269
270
/* return -1 if not in table, otherwise the offset in the block */
271
static int get_index_pos(uint32_t *pcode, uint32_t c,
272
                         const uint8_t *index_table, int index_table_len)
273
11.3k
{
274
11.3k
    uint32_t code, v;
275
11.3k
    int idx_min, idx_max, idx;
276
277
11.3k
    idx_min = 0;
278
11.3k
    v = get_le24(index_table);
279
11.3k
    code = v & ((1 << 21) - 1);
280
11.3k
    if (c < code) {
281
1.33k
        *pcode = 0;
282
1.33k
        return 0;
283
1.33k
    }
284
9.98k
    idx_max = index_table_len - 1;
285
9.98k
    code = get_le24(index_table + idx_max * 3);
286
9.98k
    if (c >= code)
287
335
        return -1;
288
    /* invariant: tab[idx_min] <= c < tab2[idx_max] */
289
64.4k
    while ((idx_max - idx_min) > 1) {
290
54.7k
        idx = (idx_max + idx_min) / 2;
291
54.7k
        v = get_le24(index_table + idx * 3);
292
54.7k
        code = v & ((1 << 21) - 1);
293
54.7k
        if (c < code) {
294
16.5k
            idx_max = idx;
295
38.2k
        } else {
296
38.2k
            idx_min = idx;
297
38.2k
        }
298
54.7k
    }
299
9.64k
    v = get_le24(index_table + idx_min * 3);
300
9.64k
    *pcode = v & ((1 << 21) - 1);
301
9.64k
    return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
302
9.98k
}
303
304
static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305
                            const uint8_t *index_table, int index_table_len)
306
11.3k
{
307
11.3k
    uint32_t code, b, bit;
308
11.3k
    int pos;
309
11.3k
    const uint8_t *p;
310
311
11.3k
    pos = get_index_pos(&code, c, index_table, index_table_len);
312
11.3k
    if (pos < 0)
313
335
        return FALSE; /* outside the table */
314
10.9k
    p = table + pos;
315
10.9k
    bit = 0;
316
    /* Compressed run length encoding:
317
       00..3F: 2 packed lengths: 3-bit + 3-bit
318
       40..5F: 5-bits plus extra byte for length
319
       60..7F: 5-bits plus 2 extra bytes for length
320
       80..FF: 7-bit length
321
       lengths must be incremented to get character count
322
       Ranges alternate between false and true return value.
323
     */
324
118k
    for(;;) {
325
118k
        b = *p++;
326
118k
        if (b < 64) {
327
31.4k
            code += (b >> 3) + 1;
328
31.4k
            if (c < code)
329
658
                return bit;
330
30.7k
            bit ^= 1;
331
30.7k
            code += (b & 7) + 1;
332
87.4k
        } else if (b >= 0x80) {
333
72.8k
            code += b - 0x80 + 1;
334
72.8k
        } else if (b < 0x60) {
335
7.63k
            code += (((b - 0x40) << 8) | p[0]) + 1;
336
7.63k
            p++;
337
7.63k
        } else {
338
6.90k
            code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
339
6.90k
            p += 2;
340
6.90k
        }
341
118k
        if (c < code)
342
10.3k
            return bit;
343
107k
        bit ^= 1;
344
107k
    }
345
10.9k
}
346
347
BOOL lre_is_cased(uint32_t c)
348
0
{
349
0
    uint32_t v, code, len;
350
0
    int idx, idx_min, idx_max;
351
352
0
    idx_min = 0;
353
0
    idx_max = countof(case_conv_table1) - 1;
354
0
    while (idx_min <= idx_max) {
355
0
        idx = (unsigned)(idx_max + idx_min) / 2;
356
0
        v = case_conv_table1[idx];
357
0
        code = v >> (32 - 17);
358
0
        len = (v >> (32 - 17 - 7)) & 0x7f;
359
0
        if (c < code) {
360
0
            idx_max = idx - 1;
361
0
        } else if (c >= code + len) {
362
0
            idx_min = idx + 1;
363
0
        } else {
364
0
            return TRUE;
365
0
        }
366
0
    }
367
0
    return lre_is_in_table(c, unicode_prop_Cased1_table,
368
0
                           unicode_prop_Cased1_index,
369
0
                           sizeof(unicode_prop_Cased1_index) / 3);
370
0
}
371
372
BOOL lre_is_case_ignorable(uint32_t c)
373
0
{
374
0
    return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
375
0
                           unicode_prop_Case_Ignorable_index,
376
0
                           sizeof(unicode_prop_Case_Ignorable_index) / 3);
377
0
}
378
379
/* character range */
380
381
static __maybe_unused void cr_dump(CharRange *cr)
382
0
{
383
0
    int i;
384
0
    for(i = 0; i < cr->len; i++)
385
0
        printf("%d: 0x%04x\n", i, cr->points[i]);
386
0
}
387
388
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
389
0
{
390
0
    return realloc(ptr, size);
391
0
}
392
393
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
394
31.1k
{
395
31.1k
    cr->len = cr->size = 0;
396
31.1k
    cr->points = NULL;
397
31.1k
    cr->mem_opaque = mem_opaque;
398
31.1k
    cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
399
31.1k
}
400
401
void cr_free(CharRange *cr)
402
48.1k
{
403
48.1k
    cr->realloc_func(cr->mem_opaque, cr->points, 0);
404
48.1k
}
405
406
int cr_realloc(CharRange *cr, int size)
407
271k
{
408
271k
    int new_size;
409
271k
    uint32_t *new_buf;
410
411
271k
    if (size > cr->size) {
412
265k
        new_size = max_int(size, cr->size * 3 / 2);
413
265k
        new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
414
265k
                                   new_size * sizeof(cr->points[0]));
415
265k
        if (!new_buf)
416
0
            return -1;
417
265k
        cr->points = new_buf;
418
265k
        cr->size = new_size;
419
265k
    }
420
271k
    return 0;
421
271k
}
422
423
int cr_copy(CharRange *cr, const CharRange *cr1)
424
0
{
425
0
    if (cr_realloc(cr, cr1->len))
426
0
        return -1;
427
0
    memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
428
0
    cr->len = cr1->len;
429
0
    return 0;
430
0
}
431
432
/* merge consecutive intervals and remove empty intervals */
433
static void cr_compress(CharRange *cr)
434
38.4k
{
435
38.4k
    int i, j, k, len;
436
38.4k
    uint32_t *pt;
437
438
38.4k
    pt = cr->points;
439
38.4k
    len = cr->len;
440
38.4k
    i = 0;
441
38.4k
    j = 0;
442
38.4k
    k = 0;
443
5.00M
    while ((i + 1) < len) {
444
4.96M
        if (pt[i] == pt[i + 1]) {
445
            /* empty interval */
446
565k
            i += 2;
447
4.40M
        } else {
448
4.40M
            j = i;
449
4.45M
            while ((j + 3) < len && pt[j + 1] == pt[j + 2])
450
49.1k
                j += 2;
451
            /* just copy */
452
4.40M
            pt[k] = pt[i];
453
4.40M
            pt[k + 1] = pt[j + 1];
454
4.40M
            k += 2;
455
4.40M
            i = j + 2;
456
4.40M
        }
457
4.96M
    }
458
38.4k
    cr->len = k;
459
38.4k
}
460
461
/* union or intersection */
462
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
463
          const uint32_t *b_pt, int b_len, int op)
464
31.7k
{
465
31.7k
    int a_idx, b_idx, is_in;
466
31.7k
    uint32_t v;
467
468
31.7k
    a_idx = 0;
469
31.7k
    b_idx = 0;
470
15.1M
    for(;;) {
471
        /* get one more point from a or b in increasing order */
472
15.1M
        if (a_idx < a_len && b_idx < b_len) {
473
2.47M
            if (a_pt[a_idx] < b_pt[b_idx]) {
474
1.47M
                goto a_add;
475
1.47M
            } else if (a_pt[a_idx] == b_pt[b_idx]) {
476
905k
                v = a_pt[a_idx];
477
905k
                a_idx++;
478
905k
                b_idx++;
479
905k
            } else {
480
100k
                goto b_add;
481
100k
            }
482
12.6M
        } else if (a_idx < a_len) {
483
14.0M
        a_add:
484
14.0M
            v = a_pt[a_idx++];
485
14.0M
        } else if (b_idx < b_len) {
486
197k
        b_add:
487
197k
            v = b_pt[b_idx++];
488
197k
        } else {
489
31.7k
            break;
490
31.7k
        }
491
        /* add the point if the in/out status changes */
492
15.1M
        switch(op) {
493
1.60M
        case CR_OP_UNION:
494
1.60M
            is_in = (a_idx & 1) | (b_idx & 1);
495
1.60M
            break;
496
13.5M
        case CR_OP_INTER:
497
13.5M
            is_in = (a_idx & 1) & (b_idx & 1);
498
13.5M
            break;
499
0
        case CR_OP_XOR:
500
0
            is_in = (a_idx & 1) ^ (b_idx & 1);
501
0
            break;
502
0
        case CR_OP_SUB:
503
0
            is_in = (a_idx & 1) & ((b_idx & 1) ^ 1);
504
0
            break;
505
0
        default:
506
0
            abort();
507
15.1M
        }
508
15.1M
        if (is_in != (cr->len & 1)) {
509
2.67M
            if (cr_add_point(cr, v))
510
0
                return -1;
511
2.67M
        }
512
15.1M
    }
513
31.7k
    cr_compress(cr);
514
31.7k
    return 0;
515
31.7k
}
516
517
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op)
518
16.9k
{
519
16.9k
    CharRange a = *cr;
520
16.9k
    int ret;
521
16.9k
    cr->len = 0;
522
16.9k
    cr->size = 0;
523
16.9k
    cr->points = NULL;
524
16.9k
    ret = cr_op(cr, a.points, a.len, b_pt, b_len, op);
525
16.9k
    cr_free(&a);
526
16.9k
    return ret;
527
16.9k
}
528
529
int cr_invert(CharRange *cr)
530
6.74k
{
531
6.74k
    int len;
532
6.74k
    len = cr->len;
533
6.74k
    if (cr_realloc(cr, len + 2))
534
0
        return -1;
535
6.74k
    memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
536
6.74k
    cr->points[0] = 0;
537
6.74k
    cr->points[len + 1] = UINT32_MAX;
538
6.74k
    cr->len = len + 2;
539
6.74k
    cr_compress(cr);
540
6.74k
    return 0;
541
6.74k
}
542
543
663k
#define CASE_U (1 << 0)
544
319k
#define CASE_L (1 << 1)
545
319k
#define CASE_F (1 << 2)
546
547
/* use the case conversion table to generate range of characters.
548
   CASE_U: set char if modified by uppercasing,
549
   CASE_L: set char if modified by lowercasing,
550
   CASE_F: set char if modified by case folding,
551
 */
552
static int unicode_case1(CharRange *cr, int case_mask)
553
4.91k
{
554
142k
#define MR(x) (1 << RUN_TYPE_ ## x)
555
4.91k
    const uint32_t tab_run_mask[3] = {
556
4.91k
        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
557
4.91k
        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
558
559
4.91k
        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
560
561
4.91k
        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
562
4.91k
    };
563
4.91k
#undef MR
564
4.91k
    uint32_t mask, v, code, type, len, i, idx;
565
566
4.91k
    if (case_mask == 0)
567
0
        return 0;
568
4.91k
    mask = 0;
569
19.6k
    for(i = 0; i < 3; i++) {
570
14.7k
        if ((case_mask >> i) & 1)
571
4.91k
            mask |= tab_run_mask[i];
572
14.7k
    }
573
1.86M
    for(idx = 0; idx < countof(case_conv_table1); idx++) {
574
1.85M
        v = case_conv_table1[idx];
575
1.85M
        type = (v >> (32 - 17 - 7 - 4)) & 0xf;
576
1.85M
        code = v >> (32 - 17);
577
1.85M
        len = (v >> (32 - 17 - 7)) & 0x7f;
578
1.85M
        if ((mask >> type) & 1) {
579
            //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
580
1.27M
            switch(type) {
581
299k
            case RUN_TYPE_UL:
582
299k
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
583
0
                    goto def_case;
584
299k
                code += ((case_mask & CASE_U) != 0);
585
2.96M
                for(i = 0; i < len; i += 2) {
586
2.66M
                    if (cr_add_interval(cr, code + i, code + i + 1))
587
0
                        return -1;
588
2.66M
                }
589
299k
                break;
590
299k
            case RUN_TYPE_LSU:
591
19.6k
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
592
0
                    goto def_case;
593
19.6k
                if (!(case_mask & CASE_U)) {
594
0
                    if (cr_add_interval(cr, code, code + 1))
595
0
                        return -1;
596
0
                }
597
19.6k
                if (cr_add_interval(cr, code + 1, code + 2))
598
0
                    return -1;
599
19.6k
                if (case_mask & CASE_U) {
600
19.6k
                    if (cr_add_interval(cr, code + 2, code + 3))
601
0
                        return -1;
602
19.6k
                }
603
19.6k
                break;
604
958k
            default:
605
958k
            def_case:
606
958k
                if (cr_add_interval(cr, code, code + len))
607
0
                    return -1;
608
958k
                break;
609
1.27M
            }
610
1.27M
        }
611
1.85M
    }
612
4.91k
    return 0;
613
4.91k
}
614
615
static int point_cmp(const void *p1, const void *p2, void *arg)
616
2.36M
{
617
2.36M
    uint32_t v1 = *(uint32_t *)p1;
618
2.36M
    uint32_t v2 = *(uint32_t *)p2;
619
2.36M
    return (v1 > v2) - (v1 < v2);
620
2.36M
}
621
622
static void cr_sort_and_remove_overlap(CharRange *cr)
623
4.91k
{
624
4.91k
    uint32_t start, end, start1, end1, i, j;
625
626
    /* the resulting ranges are not necessarily sorted and may overlap */
627
4.91k
    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
628
4.91k
    j = 0;
629
259k
    for(i = 0; i < cr->len; ) {
630
254k
        start = cr->points[i];
631
254k
        end = cr->points[i + 1];
632
254k
        i += 2;
633
294k
        while (i < cr->len) {
634
292k
            start1 = cr->points[i];
635
292k
            end1 = cr->points[i + 1];
636
292k
            if (start1 > end) {
637
                /* |------|
638
                 *           |-------| */
639
253k
                break;
640
253k
            } else if (end1 <= end) {
641
                /* |------|
642
                 *    |--| */
643
10.8k
                i += 2;
644
28.6k
            } else {
645
                /* |------|
646
                 *     |-------| */
647
28.6k
                end = end1;
648
28.6k
                i += 2;
649
28.6k
            }
650
292k
        }
651
254k
        cr->points[j] = start;
652
254k
        cr->points[j + 1] = end;
653
254k
        j += 2;
654
254k
    }
655
4.91k
    cr->len = j;
656
4.91k
}
657
658
/* canonicalize a character set using the JS regex case folding rules
659
   (see lre_canonicalize()) */
660
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
661
4.91k
{
662
4.91k
    CharRange cr_inter, cr_mask, cr_result, cr_sub;
663
4.91k
    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
664
665
4.91k
    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
666
4.91k
    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
667
4.91k
    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
668
4.91k
    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
669
670
4.91k
    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
671
0
        goto fail;
672
4.91k
    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
673
0
        goto fail;
674
675
4.91k
    if (cr_invert(&cr_mask))
676
0
        goto fail;
677
4.91k
    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
678
0
        goto fail;
679
680
    /* cr_inter = cr & cr_mask */
681
    /* cr_sub = cr & ~cr_mask */
682
683
    /* use the case conversion table to compute the result */
684
4.91k
    d_start = -1;
685
4.91k
    d_end = -1;
686
4.91k
    idx = 0;
687
4.91k
    v = case_conv_table1[idx];
688
4.91k
    code = v >> (32 - 17);
689
4.91k
    len = (v >> (32 - 17 - 7)) & 0x7f;
690
260k
    for(i = 0; i < cr_inter.len; i += 2) {
691
256k
        start = cr_inter.points[i];
692
256k
        end = cr_inter.points[i + 1];
693
694
765k
        for(c = start; c < end; c++) {
695
684k
            for(;;) {
696
684k
                if (c >= code && c < code + len)
697
509k
                    break;
698
175k
                idx++;
699
175k
                assert(idx < countof(case_conv_table1));
700
175k
                v = case_conv_table1[idx];
701
175k
                code = v >> (32 - 17);
702
175k
                len = (v >> (32 - 17 - 7)) & 0x7f;
703
175k
            }
704
509k
            d = lre_case_folding_entry(c, idx, v, is_unicode);
705
            /* try to merge with the current interval */
706
509k
            if (d_start == -1) {
707
1.38k
                d_start = d;
708
1.38k
                d_end = d + 1;
709
507k
            } else if (d_end == d) {
710
215k
                d_end++;
711
292k
            } else {
712
292k
                cr_add_interval(&cr_result, d_start, d_end);
713
292k
                d_start = d;
714
292k
                d_end = d + 1;
715
292k
            }
716
509k
        }
717
256k
    }
718
4.91k
    if (d_start != -1) {
719
1.38k
        if (cr_add_interval(&cr_result, d_start, d_end))
720
0
            goto fail;
721
1.38k
    }
722
723
    /* the resulting ranges are not necessarily sorted and may overlap */
724
4.91k
    cr_sort_and_remove_overlap(&cr_result);
725
726
    /* or with the character not affected by the case folding */
727
4.91k
    cr->len = 0;
728
4.91k
    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
729
0
        goto fail;
730
731
4.91k
    cr_free(&cr_inter);
732
4.91k
    cr_free(&cr_mask);
733
4.91k
    cr_free(&cr_result);
734
4.91k
    cr_free(&cr_sub);
735
4.91k
    return 0;
736
0
 fail:
737
0
    cr_free(&cr_inter);
738
0
    cr_free(&cr_mask);
739
0
    cr_free(&cr_result);
740
0
    cr_free(&cr_sub);
741
0
    return -1;
742
4.91k
}
743
744
#ifdef CONFIG_ALL_UNICODE
745
746
BOOL lre_is_id_start(uint32_t c)
747
10.0k
{
748
10.0k
    return lre_is_in_table(c, unicode_prop_ID_Start_table,
749
10.0k
                           unicode_prop_ID_Start_index,
750
10.0k
                           sizeof(unicode_prop_ID_Start_index) / 3);
751
10.0k
}
752
753
BOOL lre_is_id_continue(uint32_t c)
754
3.85k
{
755
3.85k
    return lre_is_id_start(c) ||
756
1.23k
        lre_is_in_table(c, unicode_prop_ID_Continue1_table,
757
1.23k
                        unicode_prop_ID_Continue1_index,
758
1.23k
                        sizeof(unicode_prop_ID_Continue1_index) / 3);
759
3.85k
}
760
761
#define UNICODE_DECOMP_LEN_MAX 18
762
763
typedef enum {
764
    DECOMP_TYPE_C1, /* 16 bit char */
765
    DECOMP_TYPE_L1, /* 16 bit char table */
766
    DECOMP_TYPE_L2,
767
    DECOMP_TYPE_L3,
768
    DECOMP_TYPE_L4,
769
    DECOMP_TYPE_L5, /* XXX: not used */
770
    DECOMP_TYPE_L6, /* XXX: could remove */
771
    DECOMP_TYPE_L7, /* XXX: could remove */
772
    DECOMP_TYPE_LL1, /* 18 bit char table */
773
    DECOMP_TYPE_LL2,
774
    DECOMP_TYPE_S1, /* 8 bit char table */
775
    DECOMP_TYPE_S2,
776
    DECOMP_TYPE_S3,
777
    DECOMP_TYPE_S4,
778
    DECOMP_TYPE_S5,
779
    DECOMP_TYPE_I1, /* increment 16 bit char value */
780
    DECOMP_TYPE_I2_0,
781
    DECOMP_TYPE_I2_1,
782
    DECOMP_TYPE_I3_1,
783
    DECOMP_TYPE_I3_2,
784
    DECOMP_TYPE_I4_1,
785
    DECOMP_TYPE_I4_2,
786
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
787
    DECOMP_TYPE_B2,
788
    DECOMP_TYPE_B3,
789
    DECOMP_TYPE_B4,
790
    DECOMP_TYPE_B5,
791
    DECOMP_TYPE_B6,
792
    DECOMP_TYPE_B7,
793
    DECOMP_TYPE_B8,
794
    DECOMP_TYPE_B18,
795
    DECOMP_TYPE_LS2,
796
    DECOMP_TYPE_PAT3,
797
    DECOMP_TYPE_S2_UL,
798
    DECOMP_TYPE_LS2_UL,
799
} DecompTypeEnum;
800
801
static uint32_t unicode_get_short_code(uint32_t c)
802
0
{
803
0
    static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
804
805
0
    if (c < 0x80)
806
0
        return c;
807
0
    else if (c < 0x80 + 0x50)
808
0
        return c - 0x80 + 0x300;
809
0
    else
810
0
        return unicode_short_table[c - 0x80 - 0x50];
811
0
}
812
813
static uint32_t unicode_get_lower_simple(uint32_t c)
814
0
{
815
0
    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
816
0
        c += 0x20;
817
0
    else
818
0
        c++;
819
0
    return c;
820
0
}
821
822
static uint16_t unicode_get16(const uint8_t *p)
823
0
{
824
0
    return p[0] | (p[1] << 8);
825
0
}
826
827
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
828
                                int idx, uint32_t code, uint32_t len,
829
                                uint32_t type)
830
0
{
831
0
    uint32_t c1;
832
0
    int l, i, p;
833
0
    const uint8_t *d;
834
835
0
    if (type == DECOMP_TYPE_C1) {
836
0
        res[0] = unicode_decomp_table2[idx];
837
0
        return 1;
838
0
    } else {
839
0
        d = unicode_decomp_data + unicode_decomp_table2[idx];
840
0
        switch(type) {
841
0
        case DECOMP_TYPE_L1:
842
0
        case DECOMP_TYPE_L2:
843
0
        case DECOMP_TYPE_L3:
844
0
        case DECOMP_TYPE_L4:
845
0
        case DECOMP_TYPE_L5:
846
0
        case DECOMP_TYPE_L6:
847
0
        case DECOMP_TYPE_L7:
848
0
            l = type - DECOMP_TYPE_L1 + 1;
849
0
            d += (c - code) * l * 2;
850
0
            for(i = 0; i < l; i++) {
851
0
                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
852
0
                    return 0;
853
0
            }
854
0
            return l;
855
0
        case DECOMP_TYPE_LL1:
856
0
        case DECOMP_TYPE_LL2:
857
0
            {
858
0
                uint32_t k, p;
859
0
                l = type - DECOMP_TYPE_LL1 + 1;
860
0
                k = (c - code) * l;
861
0
                p = len * l * 2;
862
0
                for(i = 0; i < l; i++) {
863
0
                    c1 = unicode_get16(d + 2 * k) |
864
0
                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
865
0
                    if (!c1)
866
0
                        return 0;
867
0
                    res[i] = c1;
868
0
                    k++;
869
0
                }
870
0
            }
871
0
            return l;
872
0
        case DECOMP_TYPE_S1:
873
0
        case DECOMP_TYPE_S2:
874
0
        case DECOMP_TYPE_S3:
875
0
        case DECOMP_TYPE_S4:
876
0
        case DECOMP_TYPE_S5:
877
0
            l = type - DECOMP_TYPE_S1 + 1;
878
0
            d += (c - code) * l;
879
0
            for(i = 0; i < l; i++) {
880
0
                if ((res[i] = unicode_get_short_code(d[i])) == 0)
881
0
                    return 0;
882
0
            }
883
0
            return l;
884
0
        case DECOMP_TYPE_I1:
885
0
            l = 1;
886
0
            p = 0;
887
0
            goto decomp_type_i;
888
0
        case DECOMP_TYPE_I2_0:
889
0
        case DECOMP_TYPE_I2_1:
890
0
        case DECOMP_TYPE_I3_1:
891
0
        case DECOMP_TYPE_I3_2:
892
0
        case DECOMP_TYPE_I4_1:
893
0
        case DECOMP_TYPE_I4_2:
894
0
            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
895
0
            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
896
0
        decomp_type_i:
897
0
            for(i = 0; i < l; i++) {
898
0
                c1 = unicode_get16(d + 2 * i);
899
0
                if (i == p)
900
0
                    c1 += c - code;
901
0
                res[i] = c1;
902
0
            }
903
0
            return l;
904
0
        case DECOMP_TYPE_B18:
905
0
            l = 18;
906
0
            goto decomp_type_b;
907
0
        case DECOMP_TYPE_B1:
908
0
        case DECOMP_TYPE_B2:
909
0
        case DECOMP_TYPE_B3:
910
0
        case DECOMP_TYPE_B4:
911
0
        case DECOMP_TYPE_B5:
912
0
        case DECOMP_TYPE_B6:
913
0
        case DECOMP_TYPE_B7:
914
0
        case DECOMP_TYPE_B8:
915
0
            l = type - DECOMP_TYPE_B1 + 1;
916
0
        decomp_type_b:
917
0
            {
918
0
                uint32_t c_min;
919
0
                c_min = unicode_get16(d);
920
0
                d += 2 + (c - code) * l;
921
0
                for(i = 0; i < l; i++) {
922
0
                    c1 = d[i];
923
0
                    if (c1 == 0xff)
924
0
                        c1 = 0x20;
925
0
                    else
926
0
                        c1 += c_min;
927
0
                    res[i] = c1;
928
0
                }
929
0
            }
930
0
            return l;
931
0
        case DECOMP_TYPE_LS2:
932
0
            d += (c - code) * 3;
933
0
            if (!(res[0] = unicode_get16(d)))
934
0
                return 0;
935
0
            res[1] = unicode_get_short_code(d[2]);
936
0
            return 2;
937
0
        case DECOMP_TYPE_PAT3:
938
0
            res[0] = unicode_get16(d);
939
0
            res[2] = unicode_get16(d + 2);
940
0
            d += 4 + (c - code) * 2;
941
0
            res[1] = unicode_get16(d);
942
0
            return 3;
943
0
        case DECOMP_TYPE_S2_UL:
944
0
        case DECOMP_TYPE_LS2_UL:
945
0
            c1 = c - code;
946
0
            if (type == DECOMP_TYPE_S2_UL) {
947
0
                d += c1 & ~1;
948
0
                c = unicode_get_short_code(*d);
949
0
                d++;
950
0
            } else {
951
0
                d += (c1 >> 1) * 3;
952
0
                c = unicode_get16(d);
953
0
                d += 2;
954
0
            }
955
0
            if (c1 & 1)
956
0
                c = unicode_get_lower_simple(c);
957
0
            res[0] = c;
958
0
            res[1] = unicode_get_short_code(*d);
959
0
            return 2;
960
0
        }
961
0
    }
962
0
    return 0;
963
0
}
964
965
966
/* return the length of the decomposition (length <=
967
   UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
968
static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
969
0
{
970
0
    uint32_t v, type, is_compat, code, len;
971
0
    int idx_min, idx_max, idx;
972
973
0
    idx_min = 0;
974
0
    idx_max = countof(unicode_decomp_table1) - 1;
975
0
    while (idx_min <= idx_max) {
976
0
        idx = (idx_max + idx_min) / 2;
977
0
        v = unicode_decomp_table1[idx];
978
0
        code = v >> (32 - 18);
979
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
980
        //        printf("idx=%d code=%05x len=%d\n", idx, code, len);
981
0
        if (c < code) {
982
0
            idx_max = idx - 1;
983
0
        } else if (c >= code + len) {
984
0
            idx_min = idx + 1;
985
0
        } else {
986
0
            is_compat = v & 1;
987
0
            if (is_compat1 < is_compat)
988
0
                break;
989
0
            type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
990
0
            return unicode_decomp_entry(res, c, idx, code, len, type);
991
0
        }
992
0
    }
993
0
    return 0;
994
0
}
995
996
/* return 0 if no pair found */
997
static int unicode_compose_pair(uint32_t c0, uint32_t c1)
998
0
{
999
0
    uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
1000
0
    int idx_min, idx_max, idx, d;
1001
0
    uint32_t pair[2];
1002
1003
0
    idx_min = 0;
1004
0
    idx_max = countof(unicode_comp_table) - 1;
1005
0
    while (idx_min <= idx_max) {
1006
0
        idx = (idx_max + idx_min) / 2;
1007
0
        idx1 = unicode_comp_table[idx];
1008
1009
        /* idx1 represent an entry of the decomposition table */
1010
0
        d_idx = idx1 >> 6;
1011
0
        d_offset = idx1 & 0x3f;
1012
0
        v = unicode_decomp_table1[d_idx];
1013
0
        code = v >> (32 - 18);
1014
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
1015
0
        type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
1016
0
        ch = code + d_offset;
1017
0
        unicode_decomp_entry(pair, ch, d_idx, code, len, type);
1018
0
        d = c0 - pair[0];
1019
0
        if (d == 0)
1020
0
            d = c1 - pair[1];
1021
0
        if (d < 0) {
1022
0
            idx_max = idx - 1;
1023
0
        } else if (d > 0) {
1024
0
            idx_min = idx + 1;
1025
0
        } else {
1026
0
            return ch;
1027
0
        }
1028
0
    }
1029
0
    return 0;
1030
0
}
1031
1032
/* return the combining class of character c (between 0 and 255) */
1033
static int unicode_get_cc(uint32_t c)
1034
0
{
1035
0
    uint32_t code, n, type, cc, c1, b;
1036
0
    int pos;
1037
0
    const uint8_t *p;
1038
1039
0
    pos = get_index_pos(&code, c,
1040
0
                        unicode_cc_index, sizeof(unicode_cc_index) / 3);
1041
0
    if (pos < 0)
1042
0
        return 0;
1043
0
    p = unicode_cc_table + pos;
1044
    /* Compressed run length encoding:
1045
       - 2 high order bits are combining class type
1046
       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
1047
       - 00..2F: range length (add 1)
1048
       - 30..37: 3-bit range-length + 1 extra byte
1049
       - 38..3F: 3-bit range-length + 2 extra byte
1050
     */
1051
0
    for(;;) {
1052
0
        b = *p++;
1053
0
        type = b >> 6;
1054
0
        n = b & 0x3f;
1055
0
        if (n < 48) {
1056
0
        } else if (n < 56) {
1057
0
            n = (n - 48) << 8;
1058
0
            n |= *p++;
1059
0
            n += 48;
1060
0
        } else {
1061
0
            n = (n - 56) << 8;
1062
0
            n |= *p++ << 8;
1063
0
            n |= *p++;
1064
0
            n += 48 + (1 << 11);
1065
0
        }
1066
0
        if (type <= 1)
1067
0
            p++;
1068
0
        c1 = code + n + 1;
1069
0
        if (c < c1) {
1070
0
            switch(type) {
1071
0
            case 0:
1072
0
                cc = p[-1];
1073
0
                break;
1074
0
            case 1:
1075
0
                cc = p[-1] + c - code;
1076
0
                break;
1077
0
            case 2:
1078
0
                cc = 0;
1079
0
                break;
1080
0
            default:
1081
0
            case 3:
1082
0
                cc = 230;
1083
0
                break;
1084
0
            }
1085
0
            return cc;
1086
0
        }
1087
0
        code = c1;
1088
0
    }
1089
0
}
1090
1091
static void sort_cc(int *buf, int len)
1092
0
{
1093
0
    int i, j, k, cc, cc1, start, ch1;
1094
1095
0
    for(i = 0; i < len; i++) {
1096
0
        cc = unicode_get_cc(buf[i]);
1097
0
        if (cc != 0) {
1098
0
            start = i;
1099
0
            j = i + 1;
1100
0
            while (j < len) {
1101
0
                ch1 = buf[j];
1102
0
                cc1 = unicode_get_cc(ch1);
1103
0
                if (cc1 == 0)
1104
0
                    break;
1105
0
                k = j - 1;
1106
0
                while (k >= start) {
1107
0
                    if (unicode_get_cc(buf[k]) <= cc1)
1108
0
                        break;
1109
0
                    buf[k + 1] = buf[k];
1110
0
                    k--;
1111
0
                }
1112
0
                buf[k + 1] = ch1;
1113
0
                j++;
1114
0
            }
1115
#if 0
1116
            printf("cc:");
1117
            for(k = start; k < j; k++) {
1118
                printf(" %3d", unicode_get_cc(buf[k]));
1119
            }
1120
            printf("\n");
1121
#endif
1122
0
            i = j;
1123
0
        }
1124
0
    }
1125
0
}
1126
1127
static void to_nfd_rec(DynBuf *dbuf,
1128
                       const int *src, int src_len, int is_compat)
1129
0
{
1130
0
    uint32_t c, v;
1131
0
    int i, l;
1132
0
    uint32_t res[UNICODE_DECOMP_LEN_MAX];
1133
1134
0
    for(i = 0; i < src_len; i++) {
1135
0
        c = src[i];
1136
0
        if (c >= 0xac00 && c < 0xd7a4) {
1137
            /* Hangul decomposition */
1138
0
            c -= 0xac00;
1139
0
            dbuf_put_u32(dbuf, 0x1100 + c / 588);
1140
0
            dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
1141
0
            v = c % 28;
1142
0
            if (v != 0)
1143
0
                dbuf_put_u32(dbuf, 0x11a7 + v);
1144
0
        } else {
1145
0
            l = unicode_decomp_char(res, c, is_compat);
1146
0
            if (l) {
1147
0
                to_nfd_rec(dbuf, (int *)res, l, is_compat);
1148
0
            } else {
1149
0
                dbuf_put_u32(dbuf, c);
1150
0
            }
1151
0
        }
1152
0
    }
1153
0
}
1154
1155
/* return 0 if not found */
1156
static int compose_pair(uint32_t c0, uint32_t c1)
1157
0
{
1158
    /* Hangul composition */
1159
0
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
1160
0
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
1161
0
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
1162
0
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
1163
0
               (c0 - 0xac00) % 28 == 0 &&
1164
0
               c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
1165
0
        return c0 + c1 - 0x11a7;
1166
0
    } else {
1167
0
        return unicode_compose_pair(c0, c1);
1168
0
    }
1169
0
}
1170
1171
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
1172
                      UnicodeNormalizationEnum n_type,
1173
                      void *opaque, DynBufReallocFunc *realloc_func)
1174
0
{
1175
0
    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
1176
0
    BOOL is_compat;
1177
0
    DynBuf dbuf_s, *dbuf = &dbuf_s;
1178
1179
0
    is_compat = n_type >> 1;
1180
1181
0
    dbuf_init2(dbuf, opaque, realloc_func);
1182
0
    if (dbuf_realloc(dbuf, sizeof(int) * src_len))
1183
0
        goto fail;
1184
1185
    /* common case: latin1 is unaffected by NFC */
1186
0
    if (n_type == UNICODE_NFC) {
1187
0
        for(i = 0; i < src_len; i++) {
1188
0
            if (src[i] >= 0x100)
1189
0
                goto not_latin1;
1190
0
        }
1191
0
        buf = (int *)dbuf->buf;
1192
0
        memcpy(buf, src, src_len * sizeof(int));
1193
0
        *pdst = (uint32_t *)buf;
1194
0
        return src_len;
1195
0
    not_latin1: ;
1196
0
    }
1197
1198
0
    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
1199
0
    if (dbuf_error(dbuf)) {
1200
0
    fail:
1201
0
        *pdst = NULL;
1202
0
        return -1;
1203
0
    }
1204
0
    buf = (int *)dbuf->buf;
1205
0
    buf_len = dbuf->size / sizeof(int);
1206
1207
0
    sort_cc(buf, buf_len);
1208
1209
0
    if (buf_len <= 1 || (n_type & 1) != 0) {
1210
        /* NFD / NFKD */
1211
0
        *pdst = (uint32_t *)buf;
1212
0
        return buf_len;
1213
0
    }
1214
1215
0
    i = 1;
1216
0
    out_len = 1;
1217
0
    while (i < buf_len) {
1218
        /* find the starter character and test if it is blocked from
1219
           the character at 'i' */
1220
0
        last_cc = unicode_get_cc(buf[i]);
1221
0
        starter_pos = out_len - 1;
1222
0
        while (starter_pos >= 0) {
1223
0
            cc = unicode_get_cc(buf[starter_pos]);
1224
0
            if (cc == 0)
1225
0
                break;
1226
0
            if (cc >= last_cc)
1227
0
                goto next;
1228
0
            last_cc = 256;
1229
0
            starter_pos--;
1230
0
        }
1231
0
        if (starter_pos >= 0 &&
1232
0
            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
1233
0
            buf[starter_pos] = p;
1234
0
            i++;
1235
0
        } else {
1236
0
        next:
1237
0
            buf[out_len++] = buf[i++];
1238
0
        }
1239
0
    }
1240
0
    *pdst = (uint32_t *)buf;
1241
0
    return out_len;
1242
0
}
1243
1244
/* char ranges for various unicode properties */
1245
1246
static int unicode_find_name(const char *name_table, const char *name)
1247
0
{
1248
0
    const char *p, *r;
1249
0
    int pos;
1250
0
    size_t name_len, len;
1251
1252
0
    p = name_table;
1253
0
    pos = 0;
1254
0
    name_len = strlen(name);
1255
0
    while (*p) {
1256
0
        for(;;) {
1257
0
            r = strchr(p, ',');
1258
0
            if (!r)
1259
0
                len = strlen(p);
1260
0
            else
1261
0
                len = r - p;
1262
0
            if (len == name_len && !memcmp(p, name, name_len))
1263
0
                return pos;
1264
0
            p += len + 1;
1265
0
            if (!r)
1266
0
                break;
1267
0
        }
1268
0
        pos++;
1269
0
    }
1270
0
    return -1;
1271
0
}
1272
1273
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1274
   if not found */
1275
int unicode_script(CharRange *cr,
1276
                   const char *script_name, BOOL is_ext)
1277
0
{
1278
0
    int script_idx;
1279
0
    const uint8_t *p, *p_end;
1280
0
    uint32_t c, c1, b, n, v, v_len, i, type;
1281
0
    CharRange cr1_s, *cr1;
1282
0
    CharRange cr2_s, *cr2 = &cr2_s;
1283
0
    BOOL is_common;
1284
1285
0
    script_idx = unicode_find_name(unicode_script_name_table, script_name);
1286
0
    if (script_idx < 0)
1287
0
        return -2;
1288
1289
0
    is_common = (script_idx == UNICODE_SCRIPT_Common ||
1290
0
                 script_idx == UNICODE_SCRIPT_Inherited);
1291
0
    if (is_ext) {
1292
0
        cr1 = &cr1_s;
1293
0
        cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1294
0
        cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1295
0
    } else {
1296
0
        cr1 = cr;
1297
0
    }
1298
1299
0
    p = unicode_script_table;
1300
0
    p_end = unicode_script_table + countof(unicode_script_table);
1301
0
    c = 0;
1302
0
    while (p < p_end) {
1303
0
        b = *p++;
1304
0
        type = b >> 7;
1305
0
        n = b & 0x7f;
1306
0
        if (n < 96) {
1307
0
        } else if (n < 112) {
1308
0
            n = (n - 96) << 8;
1309
0
            n |= *p++;
1310
0
            n += 96;
1311
0
        } else {
1312
0
            n = (n - 112) << 16;
1313
0
            n |= *p++ << 8;
1314
0
            n |= *p++;
1315
0
            n += 96 + (1 << 12);
1316
0
        }
1317
0
        c1 = c + n + 1;
1318
0
        if (type != 0) {
1319
0
            v = *p++;
1320
0
            if (v == script_idx || script_idx == UNICODE_SCRIPT_Unknown) {
1321
0
                if (cr_add_interval(cr1, c, c1))
1322
0
                    goto fail;
1323
0
            }
1324
0
        }
1325
0
        c = c1;
1326
0
    }
1327
0
    if (script_idx == UNICODE_SCRIPT_Unknown) {
1328
        /* Unknown is all the characters outside scripts */
1329
0
        if (cr_invert(cr1))
1330
0
            goto fail;
1331
0
    }
1332
1333
0
    if (is_ext) {
1334
        /* add the script extensions */
1335
0
        p = unicode_script_ext_table;
1336
0
        p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1337
0
        c = 0;
1338
0
        while (p < p_end) {
1339
0
            b = *p++;
1340
0
            if (b < 128) {
1341
0
                n = b;
1342
0
            } else if (b < 128 + 64) {
1343
0
                n = (b - 128) << 8;
1344
0
                n |= *p++;
1345
0
                n += 128;
1346
0
            } else {
1347
0
                n = (b - 128 - 64) << 16;
1348
0
                n |= *p++ << 8;
1349
0
                n |= *p++;
1350
0
                n += 128 + (1 << 14);
1351
0
            }
1352
0
            c1 = c + n + 1;
1353
0
            v_len = *p++;
1354
0
            if (is_common) {
1355
0
                if (v_len != 0) {
1356
0
                    if (cr_add_interval(cr2, c, c1))
1357
0
                        goto fail;
1358
0
                }
1359
0
            } else {
1360
0
                for(i = 0; i < v_len; i++) {
1361
0
                    if (p[i] == script_idx) {
1362
0
                        if (cr_add_interval(cr2, c, c1))
1363
0
                            goto fail;
1364
0
                        break;
1365
0
                    }
1366
0
                }
1367
0
            }
1368
0
            p += v_len;
1369
0
            c = c1;
1370
0
        }
1371
0
        if (is_common) {
1372
            /* remove all the characters with script extensions */
1373
0
            if (cr_invert(cr2))
1374
0
                goto fail;
1375
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1376
0
                      CR_OP_INTER))
1377
0
                goto fail;
1378
0
        } else {
1379
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1380
0
                      CR_OP_UNION))
1381
0
                goto fail;
1382
0
        }
1383
0
        cr_free(cr1);
1384
0
        cr_free(cr2);
1385
0
    }
1386
0
    return 0;
1387
0
 fail:
1388
0
    if (is_ext) {
1389
0
        cr_free(cr1);
1390
0
        cr_free(cr2);
1391
0
    }
1392
0
    goto fail;
1393
0
}
1394
1395
0
#define M(id) (1U << UNICODE_GC_ ## id)
1396
1397
static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1398
0
{
1399
0
    const uint8_t *p, *p_end;
1400
0
    uint32_t c, c0, b, n, v;
1401
1402
0
    p = unicode_gc_table;
1403
0
    p_end = unicode_gc_table + countof(unicode_gc_table);
1404
0
    c = 0;
1405
    /* Compressed range encoding:
1406
       initial byte:
1407
       bits 0..4: category number (special case 31)
1408
       bits 5..7: range length (add 1)
1409
       special case bits 5..7 == 7: read an extra byte
1410
       - 00..7F: range length (add 7 + 1)
1411
       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1412
       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1413
     */
1414
0
    while (p < p_end) {
1415
0
        b = *p++;
1416
0
        n = b >> 5;
1417
0
        v = b & 0x1f;
1418
0
        if (n == 7) {
1419
0
            n = *p++;
1420
0
            if (n < 128) {
1421
0
                n += 7;
1422
0
            } else if (n < 128 + 64) {
1423
0
                n = (n - 128) << 8;
1424
0
                n |= *p++;
1425
0
                n += 7 + 128;
1426
0
            } else {
1427
0
                n = (n - 128 - 64) << 16;
1428
0
                n |= *p++ << 8;
1429
0
                n |= *p++;
1430
0
                n += 7 + 128 + (1 << 14);
1431
0
            }
1432
0
        }
1433
0
        c0 = c;
1434
0
        c += n + 1;
1435
0
        if (v == 31) {
1436
            /* run of Lu / Ll */
1437
0
            b = gc_mask & (M(Lu) | M(Ll));
1438
0
            if (b != 0) {
1439
0
                if (b == (M(Lu) | M(Ll))) {
1440
0
                    goto add_range;
1441
0
                } else {
1442
0
                    c0 += ((gc_mask & M(Ll)) != 0);
1443
0
                    for(; c0 < c; c0 += 2) {
1444
0
                        if (cr_add_interval(cr, c0, c0 + 1))
1445
0
                            return -1;
1446
0
                    }
1447
0
                }
1448
0
            }
1449
0
        } else if ((gc_mask >> v) & 1) {
1450
0
        add_range:
1451
0
            if (cr_add_interval(cr, c0, c))
1452
0
                return -1;
1453
0
        }
1454
0
    }
1455
0
    return 0;
1456
0
}
1457
1458
static int unicode_prop1(CharRange *cr, int prop_idx)
1459
0
{
1460
0
    const uint8_t *p, *p_end;
1461
0
    uint32_t c, c0, b, bit;
1462
1463
0
    p = unicode_prop_table[prop_idx];
1464
0
    p_end = p + unicode_prop_len_table[prop_idx];
1465
0
    c = 0;
1466
0
    bit = 0;
1467
    /* Compressed range encoding:
1468
       00..3F: 2 packed lengths: 3-bit + 3-bit
1469
       40..5F: 5-bits plus extra byte for length
1470
       60..7F: 5-bits plus 2 extra bytes for length
1471
       80..FF: 7-bit length
1472
       lengths must be incremented to get character count
1473
       Ranges alternate between false and true return value.
1474
     */
1475
0
    while (p < p_end) {
1476
0
        c0 = c;
1477
0
        b = *p++;
1478
0
        if (b < 64) {
1479
0
            c += (b >> 3) + 1;
1480
0
            if (bit)  {
1481
0
                if (cr_add_interval(cr, c0, c))
1482
0
                    return -1;
1483
0
            }
1484
0
            bit ^= 1;
1485
0
            c0 = c;
1486
0
            c += (b & 7) + 1;
1487
0
        } else if (b >= 0x80) {
1488
0
            c += b - 0x80 + 1;
1489
0
        } else if (b < 0x60) {
1490
0
            c += (((b - 0x40) << 8) | p[0]) + 1;
1491
0
            p++;
1492
0
        } else {
1493
0
            c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
1494
0
            p += 2;
1495
0
        }
1496
0
        if (bit)  {
1497
0
            if (cr_add_interval(cr, c0, c))
1498
0
                return -1;
1499
0
        }
1500
0
        bit ^= 1;
1501
0
    }
1502
0
    return 0;
1503
0
}
1504
1505
typedef enum {
1506
    POP_GC,
1507
    POP_PROP,
1508
    POP_CASE,
1509
    POP_UNION,
1510
    POP_INTER,
1511
    POP_XOR,
1512
    POP_INVERT,
1513
    POP_END,
1514
} PropOPEnum;
1515
1516
#define POP_STACK_LEN_MAX 4
1517
1518
static int unicode_prop_ops(CharRange *cr, ...)
1519
0
{
1520
0
    va_list ap;
1521
0
    CharRange stack[POP_STACK_LEN_MAX];
1522
0
    int stack_len, op, ret, i;
1523
0
    uint32_t a;
1524
1525
0
    va_start(ap, cr);
1526
0
    stack_len = 0;
1527
0
    for(;;) {
1528
0
        op = va_arg(ap, int);
1529
0
        switch(op) {
1530
0
        case POP_GC:
1531
0
            assert(stack_len < POP_STACK_LEN_MAX);
1532
0
            a = va_arg(ap, int);
1533
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1534
0
            if (unicode_general_category1(&stack[stack_len - 1], a))
1535
0
                goto fail;
1536
0
            break;
1537
0
        case POP_PROP:
1538
0
            assert(stack_len < POP_STACK_LEN_MAX);
1539
0
            a = va_arg(ap, int);
1540
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1541
0
            if (unicode_prop1(&stack[stack_len - 1], a))
1542
0
                goto fail;
1543
0
            break;
1544
0
        case POP_CASE:
1545
0
            assert(stack_len < POP_STACK_LEN_MAX);
1546
0
            a = va_arg(ap, int);
1547
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1548
0
            if (unicode_case1(&stack[stack_len - 1], a))
1549
0
                goto fail;
1550
0
            break;
1551
0
        case POP_UNION:
1552
0
        case POP_INTER:
1553
0
        case POP_XOR:
1554
0
            {
1555
0
                CharRange *cr1, *cr2, *cr3;
1556
0
                assert(stack_len >= 2);
1557
0
                assert(stack_len < POP_STACK_LEN_MAX);
1558
0
                cr1 = &stack[stack_len - 2];
1559
0
                cr2 = &stack[stack_len - 1];
1560
0
                cr3 = &stack[stack_len++];
1561
0
                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1562
                /* CR_OP_XOR may be used here */
1563
0
                if (cr_op(cr3, cr1->points, cr1->len,
1564
0
                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
1565
0
                    goto fail;
1566
0
                cr_free(cr1);
1567
0
                cr_free(cr2);
1568
0
                *cr1 = *cr3;
1569
0
                stack_len -= 2;
1570
0
            }
1571
0
            break;
1572
0
        case POP_INVERT:
1573
0
            assert(stack_len >= 1);
1574
0
            if (cr_invert(&stack[stack_len - 1]))
1575
0
                goto fail;
1576
0
            break;
1577
0
        case POP_END:
1578
0
            goto done;
1579
0
        default:
1580
0
            abort();
1581
0
        }
1582
0
    }
1583
0
 done:
1584
0
    assert(stack_len == 1);
1585
0
    ret = cr_copy(cr, &stack[0]);
1586
0
    cr_free(&stack[0]);
1587
0
    return ret;
1588
0
 fail:
1589
0
    for(i = 0; i < stack_len; i++)
1590
0
        cr_free(&stack[i]);
1591
0
    return -1;
1592
0
}
1593
1594
static const uint32_t unicode_gc_mask_table[] = {
1595
    M(Lu) | M(Ll) | M(Lt), /* LC */
1596
    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
1597
    M(Mn) | M(Mc) | M(Me), /* M */
1598
    M(Nd) | M(Nl) | M(No), /* N */
1599
    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
1600
    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
1601
    M(Zs) | M(Zl) | M(Zp), /* Z */
1602
    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
1603
};
1604
1605
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1606
   if not found */
1607
int unicode_general_category(CharRange *cr, const char *gc_name)
1608
0
{
1609
0
    int gc_idx;
1610
0
    uint32_t gc_mask;
1611
1612
0
    gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1613
0
    if (gc_idx < 0)
1614
0
        return -2;
1615
0
    if (gc_idx <= UNICODE_GC_Co) {
1616
0
        gc_mask = (uint64_t)1 << gc_idx;
1617
0
    } else {
1618
0
        gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1619
0
    }
1620
0
    return unicode_general_category1(cr, gc_mask);
1621
0
}
1622
1623
1624
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1625
   if not found */
1626
int unicode_prop(CharRange *cr, const char *prop_name)
1627
0
{
1628
0
    int prop_idx, ret;
1629
1630
0
    prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1631
0
    if (prop_idx < 0)
1632
0
        return -2;
1633
0
    prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1634
1635
0
    ret = 0;
1636
0
    switch(prop_idx) {
1637
0
    case UNICODE_PROP_ASCII:
1638
0
        if (cr_add_interval(cr, 0x00, 0x7f + 1))
1639
0
            return -1;
1640
0
        break;
1641
0
    case UNICODE_PROP_Any:
1642
0
        if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1643
0
            return -1;
1644
0
        break;
1645
0
    case UNICODE_PROP_Assigned:
1646
0
        ret = unicode_prop_ops(cr,
1647
0
                               POP_GC, M(Cn),
1648
0
                               POP_INVERT,
1649
0
                               POP_END);
1650
0
        break;
1651
0
    case UNICODE_PROP_Math:
1652
0
        ret = unicode_prop_ops(cr,
1653
0
                               POP_GC, M(Sm),
1654
0
                               POP_PROP, UNICODE_PROP_Other_Math,
1655
0
                               POP_UNION,
1656
0
                               POP_END);
1657
0
        break;
1658
0
    case UNICODE_PROP_Lowercase:
1659
0
        ret = unicode_prop_ops(cr,
1660
0
                               POP_GC, M(Ll),
1661
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1662
0
                               POP_UNION,
1663
0
                               POP_END);
1664
0
        break;
1665
0
    case UNICODE_PROP_Uppercase:
1666
0
        ret = unicode_prop_ops(cr,
1667
0
                               POP_GC, M(Lu),
1668
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1669
0
                               POP_UNION,
1670
0
                               POP_END);
1671
0
        break;
1672
0
    case UNICODE_PROP_Cased:
1673
0
        ret = unicode_prop_ops(cr,
1674
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt),
1675
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1676
0
                               POP_UNION,
1677
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1678
0
                               POP_UNION,
1679
0
                               POP_END);
1680
0
        break;
1681
0
    case UNICODE_PROP_Alphabetic:
1682
0
        ret = unicode_prop_ops(cr,
1683
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1684
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1685
0
                               POP_UNION,
1686
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1687
0
                               POP_UNION,
1688
0
                               POP_PROP, UNICODE_PROP_Other_Alphabetic,
1689
0
                               POP_UNION,
1690
0
                               POP_END);
1691
0
        break;
1692
0
    case UNICODE_PROP_Grapheme_Base:
1693
0
        ret = unicode_prop_ops(cr,
1694
0
                               POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1695
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1696
0
                               POP_UNION,
1697
0
                               POP_INVERT,
1698
0
                               POP_END);
1699
0
        break;
1700
0
    case UNICODE_PROP_Grapheme_Extend:
1701
0
        ret = unicode_prop_ops(cr,
1702
0
                               POP_GC, M(Me) | M(Mn),
1703
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1704
0
                               POP_UNION,
1705
0
                               POP_END);
1706
0
        break;
1707
0
    case UNICODE_PROP_XID_Start:
1708
0
        ret = unicode_prop_ops(cr,
1709
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1710
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1711
0
                               POP_UNION,
1712
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1713
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1714
0
                               POP_UNION,
1715
0
                               POP_PROP, UNICODE_PROP_XID_Start1,
1716
0
                               POP_UNION,
1717
0
                               POP_INVERT,
1718
0
                               POP_INTER,
1719
0
                               POP_END);
1720
0
        break;
1721
0
    case UNICODE_PROP_XID_Continue:
1722
0
        ret = unicode_prop_ops(cr,
1723
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1724
0
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1725
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1726
0
                               POP_UNION,
1727
0
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1728
0
                               POP_UNION,
1729
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1730
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1731
0
                               POP_UNION,
1732
0
                               POP_PROP, UNICODE_PROP_XID_Continue1,
1733
0
                               POP_UNION,
1734
0
                               POP_INVERT,
1735
0
                               POP_INTER,
1736
0
                               POP_END);
1737
0
        break;
1738
0
    case UNICODE_PROP_Changes_When_Uppercased:
1739
0
        ret = unicode_case1(cr, CASE_U);
1740
0
        break;
1741
0
    case UNICODE_PROP_Changes_When_Lowercased:
1742
0
        ret = unicode_case1(cr, CASE_L);
1743
0
        break;
1744
0
    case UNICODE_PROP_Changes_When_Casemapped:
1745
0
        ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1746
0
        break;
1747
0
    case UNICODE_PROP_Changes_When_Titlecased:
1748
0
        ret = unicode_prop_ops(cr,
1749
0
                               POP_CASE, CASE_U,
1750
0
                               POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1751
0
                               POP_XOR,
1752
0
                               POP_END);
1753
0
        break;
1754
0
    case UNICODE_PROP_Changes_When_Casefolded:
1755
0
        ret = unicode_prop_ops(cr,
1756
0
                               POP_CASE, CASE_F,
1757
0
                               POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1758
0
                               POP_XOR,
1759
0
                               POP_END);
1760
0
        break;
1761
0
    case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1762
0
        ret = unicode_prop_ops(cr,
1763
0
                               POP_CASE, CASE_F,
1764
0
                               POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1765
0
                               POP_XOR,
1766
0
                               POP_END);
1767
0
        break;
1768
#if 0
1769
    case UNICODE_PROP_ID_Start:
1770
        ret = unicode_prop_ops(cr,
1771
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1772
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1773
                               POP_UNION,
1774
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1775
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1776
                               POP_UNION,
1777
                               POP_INVERT,
1778
                               POP_INTER,
1779
                               POP_END);
1780
        break;
1781
    case UNICODE_PROP_ID_Continue:
1782
        ret = unicode_prop_ops(cr,
1783
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1784
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1785
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1786
                               POP_UNION,
1787
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1788
                               POP_UNION,
1789
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1790
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1791
                               POP_UNION,
1792
                               POP_INVERT,
1793
                               POP_INTER,
1794
                               POP_END);
1795
        break;
1796
    case UNICODE_PROP_Case_Ignorable:
1797
        ret = unicode_prop_ops(cr,
1798
                               POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1799
                               POP_PROP, UNICODE_PROP_Case_Ignorable1,
1800
                               POP_XOR,
1801
                               POP_END);
1802
        break;
1803
#else
1804
        /* we use the existing tables */
1805
0
    case UNICODE_PROP_ID_Continue:
1806
0
        ret = unicode_prop_ops(cr,
1807
0
                               POP_PROP, UNICODE_PROP_ID_Start,
1808
0
                               POP_PROP, UNICODE_PROP_ID_Continue1,
1809
0
                               POP_XOR,
1810
0
                               POP_END);
1811
0
        break;
1812
0
#endif
1813
0
    default:
1814
0
        if (prop_idx >= countof(unicode_prop_table))
1815
0
            return -2;
1816
0
        ret = unicode_prop1(cr, prop_idx);
1817
0
        break;
1818
0
    }
1819
0
    return ret;
1820
0
}
1821
1822
#endif /* CONFIG_ALL_UNICODE */
1823
1824
/*---- lre codepoint categorizing functions ----*/
1825
1826
#define S  UNICODE_C_SPACE
1827
#define D  UNICODE_C_DIGIT
1828
#define X  UNICODE_C_XDIGIT
1829
#define U  UNICODE_C_UPPER
1830
#define L  UNICODE_C_LOWER
1831
#define _  UNICODE_C_UNDER
1832
#define d  UNICODE_C_DOLLAR
1833
1834
uint8_t const lre_ctype_bits[256] = {
1835
    0, 0, 0, 0, 0, 0, 0, 0,
1836
    0, S, S, S, S, S, 0, 0,
1837
    0, 0, 0, 0, 0, 0, 0, 0,
1838
    0, 0, 0, 0, 0, 0, 0, 0,
1839
1840
    S, 0, 0, 0, d, 0, 0, 0,
1841
    0, 0, 0, 0, 0, 0, 0, 0,
1842
    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1843
    X|D, X|D, 0, 0, 0, 0, 0, 0,
1844
1845
    0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1846
    U, U, U, U, U, U, U, U,
1847
    U, U, U, U, U, U, U, U,
1848
    U, U, U, 0, 0, 0, 0, _,
1849
1850
    0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1851
    L, L, L, L, L, L, L, L,
1852
    L, L, L, L, L, L, L, L,
1853
    L, L, L, 0, 0, 0, 0, 0,
1854
1855
    0, 0, 0, 0, 0, 0, 0, 0,
1856
    0, 0, 0, 0, 0, 0, 0, 0,
1857
    0, 0, 0, 0, 0, 0, 0, 0,
1858
    0, 0, 0, 0, 0, 0, 0, 0,
1859
1860
    S, 0, 0, 0, 0, 0, 0, 0,
1861
    0, 0, 0, 0, 0, 0, 0, 0,
1862
    0, 0, 0, 0, 0, 0, 0, 0,
1863
    0, 0, 0, 0, 0, 0, 0, 0,
1864
1865
    0, 0, 0, 0, 0, 0, 0, 0,
1866
    0, 0, 0, 0, 0, 0, 0, 0,
1867
    0, 0, 0, 0, 0, 0, 0, 0,
1868
    0, 0, 0, 0, 0, 0, 0, 0,
1869
1870
    0, 0, 0, 0, 0, 0, 0, 0,
1871
    0, 0, 0, 0, 0, 0, 0, 0,
1872
    0, 0, 0, 0, 0, 0, 0, 0,
1873
    0, 0, 0, 0, 0, 0, 0, 0,
1874
};
1875
1876
#undef S
1877
#undef D
1878
#undef X
1879
#undef U
1880
#undef L
1881
#undef _
1882
#undef d
1883
1884
/* code point ranges for Zs,Zl or Zp property */
1885
static const uint16_t char_range_s[] = {
1886
    10,
1887
    0x0009, 0x000D + 1,
1888
    0x0020, 0x0020 + 1,
1889
    0x00A0, 0x00A0 + 1,
1890
    0x1680, 0x1680 + 1,
1891
    0x2000, 0x200A + 1,
1892
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
1893
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
1894
    0x2028, 0x2029 + 1,
1895
    0x202F, 0x202F + 1,
1896
    0x205F, 0x205F + 1,
1897
    0x3000, 0x3000 + 1,
1898
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
1899
    0xFEFF, 0xFEFF + 1,
1900
};
1901
1902
BOOL lre_is_space_non_ascii(uint32_t c)
1903
0
{
1904
0
    size_t i, n;
1905
1906
0
    n = countof(char_range_s);
1907
0
    for(i = 5; i < n; i += 2) {
1908
0
        uint32_t low = char_range_s[i];
1909
0
        uint32_t high = char_range_s[i + 1];
1910
0
        if (c < low)
1911
0
            return FALSE;
1912
0
        if (c < high)
1913
0
            return TRUE;
1914
0
    }
1915
0
    return FALSE;
1916
0
}
1917
1918
#define SEQ_MAX_LEN 16
1919
1920
static int unicode_sequence_prop1(int seq_prop_idx, UnicodeSequencePropCB *cb, void *opaque,
1921
                                  CharRange *cr)
1922
0
{
1923
0
    int i, c, j;
1924
0
    uint32_t seq[SEQ_MAX_LEN];
1925
    
1926
0
    switch(seq_prop_idx) {
1927
0
    case UNICODE_SEQUENCE_PROP_Basic_Emoji:
1928
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji1) < 0)
1929
0
            return -1;
1930
0
        for(i = 0; i < cr->len; i += 2) {
1931
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1932
0
                seq[0] = c;
1933
0
                cb(opaque, seq, 1);
1934
0
            }
1935
0
        }
1936
1937
0
        cr->len = 0;
1938
1939
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji2) < 0)
1940
0
            return -1;
1941
0
        for(i = 0; i < cr->len; i += 2) {
1942
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1943
0
                seq[0] = c;
1944
0
                seq[1] = 0xfe0f;
1945
0
                cb(opaque, seq, 2);
1946
0
            }
1947
0
        }
1948
1949
0
        break;
1950
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
1951
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Modifier_Base) < 0)
1952
0
            return -1;
1953
0
        for(i = 0; i < cr->len; i += 2) {
1954
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1955
0
                for(j = 0; j < 5; j++) {
1956
0
                    seq[0] = c;
1957
0
                    seq[1] = 0x1f3fb + j;
1958
0
                    cb(opaque, seq, 2);
1959
0
                }
1960
0
            }
1961
0
        }
1962
0
        break;
1963
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
1964
0
        if (unicode_prop1(cr, UNICODE_PROP_RGI_Emoji_Flag_Sequence) < 0)
1965
0
            return -1;
1966
0
        for(i = 0; i < cr->len; i += 2) {
1967
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1968
0
                int c0, c1;
1969
0
                c0 = c / 26;
1970
0
                c1 = c % 26;
1971
0
                seq[0] = 0x1F1E6 + c0;
1972
0
                seq[1] = 0x1F1E6 + c1;
1973
0
                cb(opaque, seq, 2);
1974
0
            }
1975
0
        }
1976
0
        break;
1977
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
1978
0
        {
1979
0
            int len, code, pres, k, mod, mod_count, mod_pos[2], hc_pos, n_mod, n_hc, mod1;
1980
0
            int mod_idx, hc_idx, i0, i1;
1981
0
            const uint8_t *tab = unicode_rgi_emoji_zwj_sequence;
1982
            
1983
0
            for(i = 0; i < countof(unicode_rgi_emoji_zwj_sequence);) {
1984
0
                len = tab[i++];
1985
0
                k = 0;
1986
0
                mod = 0;
1987
0
                mod_count = 0;
1988
0
                hc_pos = -1;
1989
0
                for(j = 0; j < len; j++) {
1990
0
                    code = tab[i++];
1991
0
                    code |= tab[i++] << 8;
1992
0
                    pres = code >> 15;
1993
0
                    mod1 = (code >> 13) & 3;
1994
0
                    code &= 0x1fff;
1995
0
                    if (code < 0x1000) {
1996
0
                        c = code + 0x2000;
1997
0
                    } else {
1998
0
                        c = 0x1f000 + (code - 0x1000);
1999
0
                    }
2000
0
                    if (c == 0x1f9b0)
2001
0
                        hc_pos = k;
2002
0
                    seq[k++] = c;
2003
0
                    if (mod1 != 0) {
2004
0
                        assert(mod_count < 2);
2005
0
                        mod = mod1;
2006
0
                        mod_pos[mod_count++] = k;
2007
0
                        seq[k++] = 0; /* will be filled later */
2008
0
                    }
2009
0
                    if (pres) {
2010
0
                        seq[k++] = 0xfe0f;
2011
0
                    }
2012
0
                    if (j < len - 1) {
2013
0
                        seq[k++] = 0x200d;
2014
0
                    }
2015
0
                }
2016
2017
                /* genrate all the variants */
2018
0
                switch(mod) {
2019
0
                case 1:
2020
0
                    n_mod = 5;
2021
0
                    break;
2022
0
                case 2:
2023
0
                    n_mod = 25;
2024
0
                    break;
2025
0
                case 3:
2026
0
                    n_mod = 20;
2027
0
                    break;
2028
0
                default:
2029
0
                    n_mod = 1;
2030
0
                    break;
2031
0
                }
2032
0
                if (hc_pos >= 0)
2033
0
                    n_hc = 4;
2034
0
                else
2035
0
                    n_hc = 1;
2036
0
                for(hc_idx = 0; hc_idx < n_hc; hc_idx++) {
2037
0
                    for(mod_idx = 0; mod_idx < n_mod; mod_idx++) {
2038
0
                        if (hc_pos >= 0)
2039
0
                            seq[hc_pos] = 0x1f9b0 + hc_idx;
2040
                        
2041
0
                        switch(mod) {
2042
0
                        case 1:
2043
0
                            seq[mod_pos[0]] = 0x1f3fb + mod_idx;
2044
0
                            break;
2045
0
                        case 2:
2046
0
                        case 3:
2047
0
                            i0 = mod_idx / 5;
2048
0
                            i1 = mod_idx % 5;
2049
                            /* avoid identical values */
2050
0
                            if (mod == 3 && i0 >= i1)
2051
0
                                i0++;
2052
0
                            seq[mod_pos[0]] = 0x1f3fb + i0;
2053
0
                            seq[mod_pos[1]] = 0x1f3fb + i1;
2054
0
                            break;
2055
0
                        default:
2056
0
                            break;
2057
0
                        }
2058
#if 0
2059
                        for(j = 0; j < k; j++)
2060
                            printf(" %04x", seq[j]);
2061
                        printf("\n");
2062
#endif                
2063
0
                        cb(opaque, seq, k);
2064
0
                    }
2065
0
                }
2066
0
            }
2067
0
        }
2068
0
        break;
2069
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
2070
0
        {
2071
0
            for(i = 0; i < countof(unicode_rgi_emoji_tag_sequence);) {
2072
0
                j = 0;
2073
0
                seq[j++] = 0x1F3F4;
2074
0
                for(;;) {
2075
0
                    c = unicode_rgi_emoji_tag_sequence[i++];
2076
0
                    if (c == 0x00)
2077
0
                        break;
2078
0
                    seq[j++] = 0xe0000 + c;
2079
0
                }
2080
0
                seq[j++] = 0xe007f;
2081
0
                cb(opaque, seq, j);
2082
0
            }
2083
0
        }
2084
0
        break;
2085
0
    case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence:
2086
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Keycap_Sequence) < 0)
2087
0
            return -1;
2088
0
        for(i = 0; i < cr->len; i += 2) {
2089
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
2090
0
                seq[0] = c;
2091
0
                seq[1] = 0xfe0f;
2092
0
                seq[2] = 0x20e3;
2093
0
                cb(opaque, seq, 3);
2094
0
            }
2095
0
        }
2096
0
        break;
2097
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji:
2098
        /* all prevous sequences */
2099
0
        for(i = UNICODE_SEQUENCE_PROP_Basic_Emoji; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence; i++) {
2100
0
            int ret;
2101
0
            ret = unicode_sequence_prop1(i, cb, opaque, cr);
2102
0
            if (ret < 0)
2103
0
                return ret;
2104
0
            cr->len = 0;
2105
0
        }
2106
0
        break;
2107
0
    default:
2108
0
        return -2;
2109
0
    }
2110
0
    return 0;
2111
0
}
2112
2113
/* build a unicode sequence property */
2114
/* return -2 if not found, -1 if other error. 'cr' is used as temporary memory. */
2115
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
2116
                          CharRange *cr)
2117
0
{
2118
0
    int seq_prop_idx;
2119
0
    seq_prop_idx = unicode_find_name(unicode_sequence_prop_name_table, prop_name);
2120
0
    if (seq_prop_idx < 0)
2121
0
        return -2;
2122
0
    return unicode_sequence_prop1(seq_prop_idx, cb, opaque, cr);
2123
0
}