Coverage Report

Created: 2025-07-18 06:55

/src/quickjs/libunicode.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Unicode utilities
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <string.h>
28
#include <assert.h>
29
30
#include "cutils.h"
31
#include "libunicode.h"
32
#include "libunicode-table.h"
33
34
enum {
35
    RUN_TYPE_U,
36
    RUN_TYPE_L,
37
    RUN_TYPE_UF,
38
    RUN_TYPE_LF,
39
    RUN_TYPE_UL,
40
    RUN_TYPE_LSU,
41
    RUN_TYPE_U2L_399_EXT2,
42
    RUN_TYPE_UF_D20,
43
    RUN_TYPE_UF_D1_EXT,
44
    RUN_TYPE_U_EXT,
45
    RUN_TYPE_LF_EXT,
46
    RUN_TYPE_UF_EXT2,
47
    RUN_TYPE_LF_EXT2,
48
    RUN_TYPE_UF_EXT3,
49
};
50
51
static int lre_case_conv1(uint32_t c, int conv_type)
52
0
{
53
0
    uint32_t res[LRE_CC_RES_LEN_MAX];
54
0
    lre_case_conv(res, c, conv_type);
55
0
    return res[0];
56
0
}
57
58
/* case conversion using the table entry 'idx' with value 'v' */
59
static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
60
0
{
61
0
    uint32_t code, data, type, a, is_lower;
62
0
    is_lower = (conv_type != 0);
63
0
    type = (v >> (32 - 17 - 7 - 4)) & 0xf;
64
0
    data = ((v & 0xf) << 8) | case_conv_table2[idx];
65
0
    code = v >> (32 - 17);
66
0
    switch(type) {
67
0
    case RUN_TYPE_U:
68
0
    case RUN_TYPE_L:
69
0
    case RUN_TYPE_UF:
70
0
    case RUN_TYPE_LF:
71
0
        if (conv_type == (type & 1) ||
72
0
            (type >= RUN_TYPE_UF && conv_type == 2)) {
73
0
            c = c - code + (case_conv_table1[data] >> (32 - 17));
74
0
        }
75
0
        break;
76
0
    case RUN_TYPE_UL:
77
0
        a = c - code;
78
0
        if ((a & 1) != (1 - is_lower))
79
0
            break;
80
0
        c = (a ^ 1) + code;
81
0
        break;
82
0
    case RUN_TYPE_LSU:
83
0
        a = c - code;
84
0
        if (a == 1) {
85
0
            c += 2 * is_lower - 1;
86
0
        } else if (a == (1 - is_lower) * 2) {
87
0
            c += (2 * is_lower - 1) * 2;
88
0
        }
89
0
        break;
90
0
    case RUN_TYPE_U2L_399_EXT2:
91
0
        if (!is_lower) {
92
0
            res[0] = c - code + case_conv_ext[data >> 6];
93
0
            res[1] = 0x399;
94
0
            return 2;
95
0
        } else {
96
0
            c = c - code + case_conv_ext[data & 0x3f];
97
0
        }
98
0
        break;
99
0
    case RUN_TYPE_UF_D20:
100
0
        if (conv_type == 1)
101
0
            break;
102
0
        c = data + (conv_type == 2) * 0x20;
103
0
        break;
104
0
    case RUN_TYPE_UF_D1_EXT:
105
0
        if (conv_type == 1)
106
0
            break;
107
0
        c = case_conv_ext[data] + (conv_type == 2);
108
0
        break;
109
0
    case RUN_TYPE_U_EXT:
110
0
    case RUN_TYPE_LF_EXT:
111
0
        if (is_lower != (type - RUN_TYPE_U_EXT))
112
0
            break;
113
0
        c = case_conv_ext[data];
114
0
        break;
115
0
    case RUN_TYPE_LF_EXT2:
116
0
        if (!is_lower)
117
0
            break;
118
0
        res[0] = c - code + case_conv_ext[data >> 6];
119
0
        res[1] = case_conv_ext[data & 0x3f];
120
0
        return 2;
121
0
    case RUN_TYPE_UF_EXT2:
122
0
        if (conv_type == 1)
123
0
            break;
124
0
        res[0] = c - code + case_conv_ext[data >> 6];
125
0
        res[1] = case_conv_ext[data & 0x3f];
126
0
        if (conv_type == 2) {
127
            /* convert to lower */
128
0
            res[0] = lre_case_conv1(res[0], 1);
129
0
            res[1] = lre_case_conv1(res[1], 1);
130
0
        }
131
0
        return 2;
132
0
    default:
133
0
    case RUN_TYPE_UF_EXT3:
134
0
        if (conv_type == 1)
135
0
            break;
136
0
        res[0] = case_conv_ext[data >> 8];
137
0
        res[1] = case_conv_ext[(data >> 4) & 0xf];
138
0
        res[2] = case_conv_ext[data & 0xf];
139
0
        if (conv_type == 2) {
140
            /* convert to lower */
141
0
            res[0] = lre_case_conv1(res[0], 1);
142
0
            res[1] = lre_case_conv1(res[1], 1);
143
0
            res[2] = lre_case_conv1(res[2], 1);
144
0
        }
145
0
        return 3;
146
0
    }
147
0
    res[0] = c;
148
0
    return 1;
149
0
}
150
151
/* conv_type:
152
   0 = to upper
153
   1 = to lower
154
   2 = case folding (= to lower with modifications)
155
*/
156
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
157
0
{
158
0
    if (c < 128) {
159
0
        if (conv_type) {
160
0
            if (c >= 'A' && c <= 'Z') {
161
0
                c = c - 'A' + 'a';
162
0
            }
163
0
        } else {
164
0
            if (c >= 'a' && c <= 'z') {
165
0
                c = c - 'a' + 'A';
166
0
            }
167
0
        }
168
0
    } else {
169
0
        uint32_t v, code, len;
170
0
        int idx, idx_min, idx_max;
171
172
0
        idx_min = 0;
173
0
        idx_max = countof(case_conv_table1) - 1;
174
0
        while (idx_min <= idx_max) {
175
0
            idx = (unsigned)(idx_max + idx_min) / 2;
176
0
            v = case_conv_table1[idx];
177
0
            code = v >> (32 - 17);
178
0
            len = (v >> (32 - 17 - 7)) & 0x7f;
179
0
            if (c < code) {
180
0
                idx_max = idx - 1;
181
0
            } else if (c >= code + len) {
182
0
                idx_min = idx + 1;
183
0
            } else {
184
0
                return lre_case_conv_entry(res, c, conv_type, idx, v);
185
0
            }
186
0
        }
187
0
    }
188
0
    res[0] = c;
189
0
    return 1;
190
0
}
191
192
static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
193
0
{
194
0
    uint32_t res[LRE_CC_RES_LEN_MAX];
195
0
    int len;
196
197
0
    if (is_unicode) {
198
0
        len = lre_case_conv_entry(res, c, 2, idx, v);
199
0
        if (len == 1) {
200
0
            c = res[0];
201
0
        } else {
202
            /* handle the few specific multi-character cases (see
203
               unicode_gen.c:dump_case_folding_special_cases()) */
204
0
            if (c == 0xfb06) {
205
0
                c = 0xfb05;
206
0
            } else if (c == 0x01fd3) {
207
0
                c = 0x390;
208
0
            } else if (c == 0x01fe3) {
209
0
                c = 0x3b0;
210
0
            }
211
0
        }
212
0
    } else {
213
0
        if (likely(c < 128)) {
214
0
            if (c >= 'a' && c <= 'z')
215
0
                c = c - 'a' + 'A';
216
0
        } else {
217
            /* legacy regexp: to upper case if single char >= 128 */
218
0
            len = lre_case_conv_entry(res, c, FALSE, idx, v);
219
0
            if (len == 1 && res[0] >= 128)
220
0
                c = res[0];
221
0
        }
222
0
    }
223
0
    return c;
224
0
}
225
226
/* JS regexp specific rules for case folding */
227
int lre_canonicalize(uint32_t c, BOOL is_unicode)
228
0
{
229
0
    if (c < 128) {
230
        /* fast case */
231
0
        if (is_unicode) {
232
0
            if (c >= 'A' && c <= 'Z') {
233
0
                c = c - 'A' + 'a';
234
0
            }
235
0
        } else {
236
0
            if (c >= 'a' && c <= 'z') {
237
0
                c = c - 'a' + 'A';
238
0
            }
239
0
        }
240
0
    } else {
241
0
        uint32_t v, code, len;
242
0
        int idx, idx_min, idx_max;
243
244
0
        idx_min = 0;
245
0
        idx_max = countof(case_conv_table1) - 1;
246
0
        while (idx_min <= idx_max) {
247
0
            idx = (unsigned)(idx_max + idx_min) / 2;
248
0
            v = case_conv_table1[idx];
249
0
            code = v >> (32 - 17);
250
0
            len = (v >> (32 - 17 - 7)) & 0x7f;
251
0
            if (c < code) {
252
0
                idx_max = idx - 1;
253
0
            } else if (c >= code + len) {
254
0
                idx_min = idx + 1;
255
0
            } else {
256
0
                return lre_case_folding_entry(c, idx, v, is_unicode);
257
0
            }
258
0
        }
259
0
    }
260
0
    return c;
261
0
}
262
263
static uint32_t get_le24(const uint8_t *ptr)
264
1
{
265
1
    return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
266
1
}
267
268
0
#define UNICODE_INDEX_BLOCK_LEN 32
269
270
/* return -1 if not in table, otherwise the offset in the block */
271
static int get_index_pos(uint32_t *pcode, uint32_t c,
272
                         const uint8_t *index_table, int index_table_len)
273
1
{
274
1
    uint32_t code, v;
275
1
    int idx_min, idx_max, idx;
276
277
1
    idx_min = 0;
278
1
    v = get_le24(index_table);
279
1
    code = v & ((1 << 21) - 1);
280
1
    if (c < code) {
281
1
        *pcode = 0;
282
1
        return 0;
283
1
    }
284
0
    idx_max = index_table_len - 1;
285
0
    code = get_le24(index_table + idx_max * 3);
286
0
    if (c >= code)
287
0
        return -1;
288
    /* invariant: tab[idx_min] <= c < tab2[idx_max] */
289
0
    while ((idx_max - idx_min) > 1) {
290
0
        idx = (idx_max + idx_min) / 2;
291
0
        v = get_le24(index_table + idx * 3);
292
0
        code = v & ((1 << 21) - 1);
293
0
        if (c < code) {
294
0
            idx_max = idx;
295
0
        } else {
296
0
            idx_min = idx;
297
0
        }
298
0
    }
299
0
    v = get_le24(index_table + idx_min * 3);
300
0
    *pcode = v & ((1 << 21) - 1);
301
0
    return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
302
0
}
303
304
static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305
                            const uint8_t *index_table, int index_table_len)
306
1
{
307
1
    uint32_t code, b, bit;
308
1
    int pos;
309
1
    const uint8_t *p;
310
311
1
    pos = get_index_pos(&code, c, index_table, index_table_len);
312
1
    if (pos < 0)
313
0
        return FALSE; /* outside the table */
314
1
    p = table + pos;
315
1
    bit = 0;
316
    /* Compressed run length encoding:
317
       00..3F: 2 packed lengths: 3-bit + 3-bit
318
       40..5F: 5-bits plus extra byte for length
319
       60..7F: 5-bits plus 2 extra bytes for length
320
       80..FF: 7-bit length
321
       lengths must be incremented to get character count
322
       Ranges alternate between false and true return value.
323
     */
324
14
    for(;;) {
325
14
        b = *p++;
326
14
        if (b < 64) {
327
2
            code += (b >> 3) + 1;
328
2
            if (c < code)
329
0
                return bit;
330
2
            bit ^= 1;
331
2
            code += (b & 7) + 1;
332
12
        } else if (b >= 0x80) {
333
11
            code += b - 0x80 + 1;
334
11
        } else if (b < 0x60) {
335
1
            code += (((b - 0x40) << 8) | p[0]) + 1;
336
1
            p++;
337
1
        } else {
338
0
            code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
339
0
            p += 2;
340
0
        }
341
14
        if (c < code)
342
1
            return bit;
343
13
        bit ^= 1;
344
13
    }
345
1
}
346
347
BOOL lre_is_cased(uint32_t c)
348
0
{
349
0
    uint32_t v, code, len;
350
0
    int idx, idx_min, idx_max;
351
352
0
    idx_min = 0;
353
0
    idx_max = countof(case_conv_table1) - 1;
354
0
    while (idx_min <= idx_max) {
355
0
        idx = (unsigned)(idx_max + idx_min) / 2;
356
0
        v = case_conv_table1[idx];
357
0
        code = v >> (32 - 17);
358
0
        len = (v >> (32 - 17 - 7)) & 0x7f;
359
0
        if (c < code) {
360
0
            idx_max = idx - 1;
361
0
        } else if (c >= code + len) {
362
0
            idx_min = idx + 1;
363
0
        } else {
364
0
            return TRUE;
365
0
        }
366
0
    }
367
0
    return lre_is_in_table(c, unicode_prop_Cased1_table,
368
0
                           unicode_prop_Cased1_index,
369
0
                           sizeof(unicode_prop_Cased1_index) / 3);
370
0
}
371
372
BOOL lre_is_case_ignorable(uint32_t c)
373
0
{
374
0
    return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
375
0
                           unicode_prop_Case_Ignorable_index,
376
0
                           sizeof(unicode_prop_Case_Ignorable_index) / 3);
377
0
}
378
379
/* character range */
380
381
static __maybe_unused void cr_dump(CharRange *cr)
382
0
{
383
0
    int i;
384
0
    for(i = 0; i < cr->len; i++)
385
0
        printf("%d: 0x%04x\n", i, cr->points[i]);
386
0
}
387
388
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
389
0
{
390
0
    return realloc(ptr, size);
391
0
}
392
393
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
394
2
{
395
2
    cr->len = cr->size = 0;
396
2
    cr->points = NULL;
397
2
    cr->mem_opaque = mem_opaque;
398
2
    cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
399
2
}
400
401
void cr_free(CharRange *cr)
402
2
{
403
2
    cr->realloc_func(cr->mem_opaque, cr->points, 0);
404
2
}
405
406
int cr_realloc(CharRange *cr, int size)
407
10
{
408
10
    int new_size;
409
10
    uint32_t *new_buf;
410
411
10
    if (size > cr->size) {
412
9
        new_size = max_int(size, cr->size * 3 / 2);
413
9
        new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
414
9
                                   new_size * sizeof(cr->points[0]));
415
9
        if (!new_buf)
416
0
            return -1;
417
9
        cr->points = new_buf;
418
9
        cr->size = new_size;
419
9
    }
420
10
    return 0;
421
10
}
422
423
int cr_copy(CharRange *cr, const CharRange *cr1)
424
0
{
425
0
    if (cr_realloc(cr, cr1->len))
426
0
        return -1;
427
0
    memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
428
0
    cr->len = cr1->len;
429
0
    return 0;
430
0
}
431
432
/* merge consecutive intervals and remove empty intervals */
433
static void cr_compress(CharRange *cr)
434
1
{
435
1
    int i, j, k, len;
436
1
    uint32_t *pt;
437
438
1
    pt = cr->points;
439
1
    len = cr->len;
440
1
    i = 0;
441
1
    j = 0;
442
1
    k = 0;
443
12
    while ((i + 1) < len) {
444
11
        if (pt[i] == pt[i + 1]) {
445
            /* empty interval */
446
0
            i += 2;
447
11
        } else {
448
11
            j = i;
449
11
            while ((j + 3) < len && pt[j + 1] == pt[j + 2])
450
0
                j += 2;
451
            /* just copy */
452
11
            pt[k] = pt[i];
453
11
            pt[k + 1] = pt[j + 1];
454
11
            k += 2;
455
11
            i = j + 2;
456
11
        }
457
11
    }
458
1
    cr->len = k;
459
1
}
460
461
/* union or intersection */
462
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
463
          const uint32_t *b_pt, int b_len, int op)
464
0
{
465
0
    int a_idx, b_idx, is_in;
466
0
    uint32_t v;
467
468
0
    a_idx = 0;
469
0
    b_idx = 0;
470
0
    for(;;) {
471
        /* get one more point from a or b in increasing order */
472
0
        if (a_idx < a_len && b_idx < b_len) {
473
0
            if (a_pt[a_idx] < b_pt[b_idx]) {
474
0
                goto a_add;
475
0
            } else if (a_pt[a_idx] == b_pt[b_idx]) {
476
0
                v = a_pt[a_idx];
477
0
                a_idx++;
478
0
                b_idx++;
479
0
            } else {
480
0
                goto b_add;
481
0
            }
482
0
        } else if (a_idx < a_len) {
483
0
        a_add:
484
0
            v = a_pt[a_idx++];
485
0
        } else if (b_idx < b_len) {
486
0
        b_add:
487
0
            v = b_pt[b_idx++];
488
0
        } else {
489
0
            break;
490
0
        }
491
        /* add the point if the in/out status changes */
492
0
        switch(op) {
493
0
        case CR_OP_UNION:
494
0
            is_in = (a_idx & 1) | (b_idx & 1);
495
0
            break;
496
0
        case CR_OP_INTER:
497
0
            is_in = (a_idx & 1) & (b_idx & 1);
498
0
            break;
499
0
        case CR_OP_XOR:
500
0
            is_in = (a_idx & 1) ^ (b_idx & 1);
501
0
            break;
502
0
        case CR_OP_SUB:
503
0
            is_in = (a_idx & 1) & ((b_idx & 1) ^ 1);
504
0
            break;
505
0
        default:
506
0
            abort();
507
0
        }
508
0
        if (is_in != (cr->len & 1)) {
509
0
            if (cr_add_point(cr, v))
510
0
                return -1;
511
0
        }
512
0
    }
513
0
    cr_compress(cr);
514
0
    return 0;
515
0
}
516
517
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op)
518
0
{
519
0
    CharRange a = *cr;
520
0
    int ret;
521
0
    cr->len = 0;
522
0
    cr->size = 0;
523
0
    cr->points = NULL;
524
0
    ret = cr_op(cr, a.points, a.len, b_pt, b_len, op);
525
0
    cr_free(&a);
526
0
    return ret;
527
0
}
528
529
int cr_invert(CharRange *cr)
530
1
{
531
1
    int len;
532
1
    len = cr->len;
533
1
    if (cr_realloc(cr, len + 2))
534
0
        return -1;
535
1
    memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
536
1
    cr->points[0] = 0;
537
1
    cr->points[len + 1] = UINT32_MAX;
538
1
    cr->len = len + 2;
539
1
    cr_compress(cr);
540
1
    return 0;
541
1
}
542
543
0
#define CASE_U (1 << 0)
544
0
#define CASE_L (1 << 1)
545
0
#define CASE_F (1 << 2)
546
547
/* use the case conversion table to generate range of characters.
548
   CASE_U: set char if modified by uppercasing,
549
   CASE_L: set char if modified by lowercasing,
550
   CASE_F: set char if modified by case folding,
551
 */
552
static int unicode_case1(CharRange *cr, int case_mask)
553
0
{
554
0
#define MR(x) (1 << RUN_TYPE_ ## x)
555
0
    const uint32_t tab_run_mask[3] = {
556
0
        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
557
0
        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
558
559
0
        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
560
561
0
        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
562
0
    };
563
0
#undef MR
564
0
    uint32_t mask, v, code, type, len, i, idx;
565
566
0
    if (case_mask == 0)
567
0
        return 0;
568
0
    mask = 0;
569
0
    for(i = 0; i < 3; i++) {
570
0
        if ((case_mask >> i) & 1)
571
0
            mask |= tab_run_mask[i];
572
0
    }
573
0
    for(idx = 0; idx < countof(case_conv_table1); idx++) {
574
0
        v = case_conv_table1[idx];
575
0
        type = (v >> (32 - 17 - 7 - 4)) & 0xf;
576
0
        code = v >> (32 - 17);
577
0
        len = (v >> (32 - 17 - 7)) & 0x7f;
578
0
        if ((mask >> type) & 1) {
579
            //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
580
0
            switch(type) {
581
0
            case RUN_TYPE_UL:
582
0
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
583
0
                    goto def_case;
584
0
                code += ((case_mask & CASE_U) != 0);
585
0
                for(i = 0; i < len; i += 2) {
586
0
                    if (cr_add_interval(cr, code + i, code + i + 1))
587
0
                        return -1;
588
0
                }
589
0
                break;
590
0
            case RUN_TYPE_LSU:
591
0
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
592
0
                    goto def_case;
593
0
                if (!(case_mask & CASE_U)) {
594
0
                    if (cr_add_interval(cr, code, code + 1))
595
0
                        return -1;
596
0
                }
597
0
                if (cr_add_interval(cr, code + 1, code + 2))
598
0
                    return -1;
599
0
                if (case_mask & CASE_U) {
600
0
                    if (cr_add_interval(cr, code + 2, code + 3))
601
0
                        return -1;
602
0
                }
603
0
                break;
604
0
            default:
605
0
            def_case:
606
0
                if (cr_add_interval(cr, code, code + len))
607
0
                    return -1;
608
0
                break;
609
0
            }
610
0
        }
611
0
    }
612
0
    return 0;
613
0
}
614
615
static int point_cmp(const void *p1, const void *p2, void *arg)
616
0
{
617
0
    uint32_t v1 = *(uint32_t *)p1;
618
0
    uint32_t v2 = *(uint32_t *)p2;
619
0
    return (v1 > v2) - (v1 < v2);
620
0
}
621
622
static void cr_sort_and_remove_overlap(CharRange *cr)
623
0
{
624
0
    uint32_t start, end, start1, end1, i, j;
625
626
    /* the resulting ranges are not necessarily sorted and may overlap */
627
0
    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
628
0
    j = 0;
629
0
    for(i = 0; i < cr->len; ) {
630
0
        start = cr->points[i];
631
0
        end = cr->points[i + 1];
632
0
        i += 2;
633
0
        while (i < cr->len) {
634
0
            start1 = cr->points[i];
635
0
            end1 = cr->points[i + 1];
636
0
            if (start1 > end) {
637
                /* |------|
638
                 *           |-------| */
639
0
                break;
640
0
            } else if (end1 <= end) {
641
                /* |------|
642
                 *    |--| */
643
0
                i += 2;
644
0
            } else {
645
                /* |------|
646
                 *     |-------| */
647
0
                end = end1;
648
0
                i += 2;
649
0
            }
650
0
        }
651
0
        cr->points[j] = start;
652
0
        cr->points[j + 1] = end;
653
0
        j += 2;
654
0
    }
655
0
    cr->len = j;
656
0
}
657
658
/* canonicalize a character set using the JS regex case folding rules
659
   (see lre_canonicalize()) */
660
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
661
0
{
662
0
    CharRange cr_inter, cr_mask, cr_result, cr_sub;
663
0
    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
664
665
0
    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
666
0
    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
667
0
    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
668
0
    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
669
670
0
    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
671
0
        goto fail;
672
0
    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
673
0
        goto fail;
674
675
0
    if (cr_invert(&cr_mask))
676
0
        goto fail;
677
0
    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
678
0
        goto fail;
679
680
    /* cr_inter = cr & cr_mask */
681
    /* cr_sub = cr & ~cr_mask */
682
683
    /* use the case conversion table to compute the result */
684
0
    d_start = -1;
685
0
    d_end = -1;
686
0
    idx = 0;
687
0
    v = case_conv_table1[idx];
688
0
    code = v >> (32 - 17);
689
0
    len = (v >> (32 - 17 - 7)) & 0x7f;
690
0
    for(i = 0; i < cr_inter.len; i += 2) {
691
0
        start = cr_inter.points[i];
692
0
        end = cr_inter.points[i + 1];
693
694
0
        for(c = start; c < end; c++) {
695
0
            for(;;) {
696
0
                if (c >= code && c < code + len)
697
0
                    break;
698
0
                idx++;
699
0
                assert(idx < countof(case_conv_table1));
700
0
                v = case_conv_table1[idx];
701
0
                code = v >> (32 - 17);
702
0
                len = (v >> (32 - 17 - 7)) & 0x7f;
703
0
            }
704
0
            d = lre_case_folding_entry(c, idx, v, is_unicode);
705
            /* try to merge with the current interval */
706
0
            if (d_start == -1) {
707
0
                d_start = d;
708
0
                d_end = d + 1;
709
0
            } else if (d_end == d) {
710
0
                d_end++;
711
0
            } else {
712
0
                cr_add_interval(&cr_result, d_start, d_end);
713
0
                d_start = d;
714
0
                d_end = d + 1;
715
0
            }
716
0
        }
717
0
    }
718
0
    if (d_start != -1) {
719
0
        if (cr_add_interval(&cr_result, d_start, d_end))
720
0
            goto fail;
721
0
    }
722
723
    /* the resulting ranges are not necessarily sorted and may overlap */
724
0
    cr_sort_and_remove_overlap(&cr_result);
725
726
    /* or with the character not affected by the case folding */
727
0
    cr->len = 0;
728
0
    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
729
0
        goto fail;
730
731
0
    cr_free(&cr_inter);
732
0
    cr_free(&cr_mask);
733
0
    cr_free(&cr_result);
734
0
    cr_free(&cr_sub);
735
0
    return 0;
736
0
 fail:
737
0
    cr_free(&cr_inter);
738
0
    cr_free(&cr_mask);
739
0
    cr_free(&cr_result);
740
0
    cr_free(&cr_sub);
741
0
    return -1;
742
0
}
743
744
#ifdef CONFIG_ALL_UNICODE
745
746
BOOL lre_is_id_start(uint32_t c)
747
1
{
748
1
    return lre_is_in_table(c, unicode_prop_ID_Start_table,
749
1
                           unicode_prop_ID_Start_index,
750
1
                           sizeof(unicode_prop_ID_Start_index) / 3);
751
1
}
752
753
BOOL lre_is_id_continue(uint32_t c)
754
1
{
755
1
    return lre_is_id_start(c) ||
756
1
        lre_is_in_table(c, unicode_prop_ID_Continue1_table,
757
0
                        unicode_prop_ID_Continue1_index,
758
0
                        sizeof(unicode_prop_ID_Continue1_index) / 3);
759
1
}
760
761
#define UNICODE_DECOMP_LEN_MAX 18
762
763
typedef enum {
764
    DECOMP_TYPE_C1, /* 16 bit char */
765
    DECOMP_TYPE_L1, /* 16 bit char table */
766
    DECOMP_TYPE_L2,
767
    DECOMP_TYPE_L3,
768
    DECOMP_TYPE_L4,
769
    DECOMP_TYPE_L5, /* XXX: not used */
770
    DECOMP_TYPE_L6, /* XXX: could remove */
771
    DECOMP_TYPE_L7, /* XXX: could remove */
772
    DECOMP_TYPE_LL1, /* 18 bit char table */
773
    DECOMP_TYPE_LL2,
774
    DECOMP_TYPE_S1, /* 8 bit char table */
775
    DECOMP_TYPE_S2,
776
    DECOMP_TYPE_S3,
777
    DECOMP_TYPE_S4,
778
    DECOMP_TYPE_S5,
779
    DECOMP_TYPE_I1, /* increment 16 bit char value */
780
    DECOMP_TYPE_I2_0,
781
    DECOMP_TYPE_I2_1,
782
    DECOMP_TYPE_I3_1,
783
    DECOMP_TYPE_I3_2,
784
    DECOMP_TYPE_I4_1,
785
    DECOMP_TYPE_I4_2,
786
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
787
    DECOMP_TYPE_B2,
788
    DECOMP_TYPE_B3,
789
    DECOMP_TYPE_B4,
790
    DECOMP_TYPE_B5,
791
    DECOMP_TYPE_B6,
792
    DECOMP_TYPE_B7,
793
    DECOMP_TYPE_B8,
794
    DECOMP_TYPE_B18,
795
    DECOMP_TYPE_LS2,
796
    DECOMP_TYPE_PAT3,
797
    DECOMP_TYPE_S2_UL,
798
    DECOMP_TYPE_LS2_UL,
799
} DecompTypeEnum;
800
801
static uint32_t unicode_get_short_code(uint32_t c)
802
0
{
803
0
    static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
804
805
0
    if (c < 0x80)
806
0
        return c;
807
0
    else if (c < 0x80 + 0x50)
808
0
        return c - 0x80 + 0x300;
809
0
    else
810
0
        return unicode_short_table[c - 0x80 - 0x50];
811
0
}
812
813
static uint32_t unicode_get_lower_simple(uint32_t c)
814
0
{
815
0
    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
816
0
        c += 0x20;
817
0
    else
818
0
        c++;
819
0
    return c;
820
0
}
821
822
static uint16_t unicode_get16(const uint8_t *p)
823
0
{
824
0
    return p[0] | (p[1] << 8);
825
0
}
826
827
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
828
                                int idx, uint32_t code, uint32_t len,
829
                                uint32_t type)
830
0
{
831
0
    uint32_t c1;
832
0
    int l, i, p;
833
0
    const uint8_t *d;
834
835
0
    if (type == DECOMP_TYPE_C1) {
836
0
        res[0] = unicode_decomp_table2[idx];
837
0
        return 1;
838
0
    } else {
839
0
        d = unicode_decomp_data + unicode_decomp_table2[idx];
840
0
        switch(type) {
841
0
        case DECOMP_TYPE_L1:
842
0
        case DECOMP_TYPE_L2:
843
0
        case DECOMP_TYPE_L3:
844
0
        case DECOMP_TYPE_L4:
845
0
        case DECOMP_TYPE_L5:
846
0
        case DECOMP_TYPE_L6:
847
0
        case DECOMP_TYPE_L7:
848
0
            l = type - DECOMP_TYPE_L1 + 1;
849
0
            d += (c - code) * l * 2;
850
0
            for(i = 0; i < l; i++) {
851
0
                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
852
0
                    return 0;
853
0
            }
854
0
            return l;
855
0
        case DECOMP_TYPE_LL1:
856
0
        case DECOMP_TYPE_LL2:
857
0
            {
858
0
                uint32_t k, p;
859
0
                l = type - DECOMP_TYPE_LL1 + 1;
860
0
                k = (c - code) * l;
861
0
                p = len * l * 2;
862
0
                for(i = 0; i < l; i++) {
863
0
                    c1 = unicode_get16(d + 2 * k) |
864
0
                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
865
0
                    if (!c1)
866
0
                        return 0;
867
0
                    res[i] = c1;
868
0
                    k++;
869
0
                }
870
0
            }
871
0
            return l;
872
0
        case DECOMP_TYPE_S1:
873
0
        case DECOMP_TYPE_S2:
874
0
        case DECOMP_TYPE_S3:
875
0
        case DECOMP_TYPE_S4:
876
0
        case DECOMP_TYPE_S5:
877
0
            l = type - DECOMP_TYPE_S1 + 1;
878
0
            d += (c - code) * l;
879
0
            for(i = 0; i < l; i++) {
880
0
                if ((res[i] = unicode_get_short_code(d[i])) == 0)
881
0
                    return 0;
882
0
            }
883
0
            return l;
884
0
        case DECOMP_TYPE_I1:
885
0
            l = 1;
886
0
            p = 0;
887
0
            goto decomp_type_i;
888
0
        case DECOMP_TYPE_I2_0:
889
0
        case DECOMP_TYPE_I2_1:
890
0
        case DECOMP_TYPE_I3_1:
891
0
        case DECOMP_TYPE_I3_2:
892
0
        case DECOMP_TYPE_I4_1:
893
0
        case DECOMP_TYPE_I4_2:
894
0
            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
895
0
            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
896
0
        decomp_type_i:
897
0
            for(i = 0; i < l; i++) {
898
0
                c1 = unicode_get16(d + 2 * i);
899
0
                if (i == p)
900
0
                    c1 += c - code;
901
0
                res[i] = c1;
902
0
            }
903
0
            return l;
904
0
        case DECOMP_TYPE_B18:
905
0
            l = 18;
906
0
            goto decomp_type_b;
907
0
        case DECOMP_TYPE_B1:
908
0
        case DECOMP_TYPE_B2:
909
0
        case DECOMP_TYPE_B3:
910
0
        case DECOMP_TYPE_B4:
911
0
        case DECOMP_TYPE_B5:
912
0
        case DECOMP_TYPE_B6:
913
0
        case DECOMP_TYPE_B7:
914
0
        case DECOMP_TYPE_B8:
915
0
            l = type - DECOMP_TYPE_B1 + 1;
916
0
        decomp_type_b:
917
0
            {
918
0
                uint32_t c_min;
919
0
                c_min = unicode_get16(d);
920
0
                d += 2 + (c - code) * l;
921
0
                for(i = 0; i < l; i++) {
922
0
                    c1 = d[i];
923
0
                    if (c1 == 0xff)
924
0
                        c1 = 0x20;
925
0
                    else
926
0
                        c1 += c_min;
927
0
                    res[i] = c1;
928
0
                }
929
0
            }
930
0
            return l;
931
0
        case DECOMP_TYPE_LS2:
932
0
            d += (c - code) * 3;
933
0
            if (!(res[0] = unicode_get16(d)))
934
0
                return 0;
935
0
            res[1] = unicode_get_short_code(d[2]);
936
0
            return 2;
937
0
        case DECOMP_TYPE_PAT3:
938
0
            res[0] = unicode_get16(d);
939
0
            res[2] = unicode_get16(d + 2);
940
0
            d += 4 + (c - code) * 2;
941
0
            res[1] = unicode_get16(d);
942
0
            return 3;
943
0
        case DECOMP_TYPE_S2_UL:
944
0
        case DECOMP_TYPE_LS2_UL:
945
0
            c1 = c - code;
946
0
            if (type == DECOMP_TYPE_S2_UL) {
947
0
                d += c1 & ~1;
948
0
                c = unicode_get_short_code(*d);
949
0
                d++;
950
0
            } else {
951
0
                d += (c1 >> 1) * 3;
952
0
                c = unicode_get16(d);
953
0
                d += 2;
954
0
            }
955
0
            if (c1 & 1)
956
0
                c = unicode_get_lower_simple(c);
957
0
            res[0] = c;
958
0
            res[1] = unicode_get_short_code(*d);
959
0
            return 2;
960
0
        }
961
0
    }
962
0
    return 0;
963
0
}
964
965
966
/* return the length of the decomposition (length <=
967
   UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
968
static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
969
0
{
970
0
    uint32_t v, type, is_compat, code, len;
971
0
    int idx_min, idx_max, idx;
972
973
0
    idx_min = 0;
974
0
    idx_max = countof(unicode_decomp_table1) - 1;
975
0
    while (idx_min <= idx_max) {
976
0
        idx = (idx_max + idx_min) / 2;
977
0
        v = unicode_decomp_table1[idx];
978
0
        code = v >> (32 - 18);
979
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
980
        //        printf("idx=%d code=%05x len=%d\n", idx, code, len);
981
0
        if (c < code) {
982
0
            idx_max = idx - 1;
983
0
        } else if (c >= code + len) {
984
0
            idx_min = idx + 1;
985
0
        } else {
986
0
            is_compat = v & 1;
987
0
            if (is_compat1 < is_compat)
988
0
                break;
989
0
            type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
990
0
            return unicode_decomp_entry(res, c, idx, code, len, type);
991
0
        }
992
0
    }
993
0
    return 0;
994
0
}
995
996
/* return 0 if no pair found */
997
static int unicode_compose_pair(uint32_t c0, uint32_t c1)
998
0
{
999
0
    uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
1000
0
    int idx_min, idx_max, idx, d;
1001
0
    uint32_t pair[2];
1002
1003
0
    idx_min = 0;
1004
0
    idx_max = countof(unicode_comp_table) - 1;
1005
0
    while (idx_min <= idx_max) {
1006
0
        idx = (idx_max + idx_min) / 2;
1007
0
        idx1 = unicode_comp_table[idx];
1008
1009
        /* idx1 represent an entry of the decomposition table */
1010
0
        d_idx = idx1 >> 6;
1011
0
        d_offset = idx1 & 0x3f;
1012
0
        v = unicode_decomp_table1[d_idx];
1013
0
        code = v >> (32 - 18);
1014
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
1015
0
        type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
1016
0
        ch = code + d_offset;
1017
0
        unicode_decomp_entry(pair, ch, d_idx, code, len, type);
1018
0
        d = c0 - pair[0];
1019
0
        if (d == 0)
1020
0
            d = c1 - pair[1];
1021
0
        if (d < 0) {
1022
0
            idx_max = idx - 1;
1023
0
        } else if (d > 0) {
1024
0
            idx_min = idx + 1;
1025
0
        } else {
1026
0
            return ch;
1027
0
        }
1028
0
    }
1029
0
    return 0;
1030
0
}
1031
1032
/* return the combining class of character c (between 0 and 255) */
1033
static int unicode_get_cc(uint32_t c)
1034
0
{
1035
0
    uint32_t code, n, type, cc, c1, b;
1036
0
    int pos;
1037
0
    const uint8_t *p;
1038
1039
0
    pos = get_index_pos(&code, c,
1040
0
                        unicode_cc_index, sizeof(unicode_cc_index) / 3);
1041
0
    if (pos < 0)
1042
0
        return 0;
1043
0
    p = unicode_cc_table + pos;
1044
    /* Compressed run length encoding:
1045
       - 2 high order bits are combining class type
1046
       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
1047
       - 00..2F: range length (add 1)
1048
       - 30..37: 3-bit range-length + 1 extra byte
1049
       - 38..3F: 3-bit range-length + 2 extra byte
1050
     */
1051
0
    for(;;) {
1052
0
        b = *p++;
1053
0
        type = b >> 6;
1054
0
        n = b & 0x3f;
1055
0
        if (n < 48) {
1056
0
        } else if (n < 56) {
1057
0
            n = (n - 48) << 8;
1058
0
            n |= *p++;
1059
0
            n += 48;
1060
0
        } else {
1061
0
            n = (n - 56) << 8;
1062
0
            n |= *p++ << 8;
1063
0
            n |= *p++;
1064
0
            n += 48 + (1 << 11);
1065
0
        }
1066
0
        if (type <= 1)
1067
0
            p++;
1068
0
        c1 = code + n + 1;
1069
0
        if (c < c1) {
1070
0
            switch(type) {
1071
0
            case 0:
1072
0
                cc = p[-1];
1073
0
                break;
1074
0
            case 1:
1075
0
                cc = p[-1] + c - code;
1076
0
                break;
1077
0
            case 2:
1078
0
                cc = 0;
1079
0
                break;
1080
0
            default:
1081
0
            case 3:
1082
0
                cc = 230;
1083
0
                break;
1084
0
            }
1085
0
            return cc;
1086
0
        }
1087
0
        code = c1;
1088
0
    }
1089
0
}
1090
1091
static void sort_cc(int *buf, int len)
1092
0
{
1093
0
    int i, j, k, cc, cc1, start, ch1;
1094
1095
0
    for(i = 0; i < len; i++) {
1096
0
        cc = unicode_get_cc(buf[i]);
1097
0
        if (cc != 0) {
1098
0
            start = i;
1099
0
            j = i + 1;
1100
0
            while (j < len) {
1101
0
                ch1 = buf[j];
1102
0
                cc1 = unicode_get_cc(ch1);
1103
0
                if (cc1 == 0)
1104
0
                    break;
1105
0
                k = j - 1;
1106
0
                while (k >= start) {
1107
0
                    if (unicode_get_cc(buf[k]) <= cc1)
1108
0
                        break;
1109
0
                    buf[k + 1] = buf[k];
1110
0
                    k--;
1111
0
                }
1112
0
                buf[k + 1] = ch1;
1113
0
                j++;
1114
0
            }
1115
#if 0
1116
            printf("cc:");
1117
            for(k = start; k < j; k++) {
1118
                printf(" %3d", unicode_get_cc(buf[k]));
1119
            }
1120
            printf("\n");
1121
#endif
1122
0
            i = j;
1123
0
        }
1124
0
    }
1125
0
}
1126
1127
static void to_nfd_rec(DynBuf *dbuf,
1128
                       const int *src, int src_len, int is_compat)
1129
0
{
1130
0
    uint32_t c, v;
1131
0
    int i, l;
1132
0
    uint32_t res[UNICODE_DECOMP_LEN_MAX];
1133
1134
0
    for(i = 0; i < src_len; i++) {
1135
0
        c = src[i];
1136
0
        if (c >= 0xac00 && c < 0xd7a4) {
1137
            /* Hangul decomposition */
1138
0
            c -= 0xac00;
1139
0
            dbuf_put_u32(dbuf, 0x1100 + c / 588);
1140
0
            dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
1141
0
            v = c % 28;
1142
0
            if (v != 0)
1143
0
                dbuf_put_u32(dbuf, 0x11a7 + v);
1144
0
        } else {
1145
0
            l = unicode_decomp_char(res, c, is_compat);
1146
0
            if (l) {
1147
0
                to_nfd_rec(dbuf, (int *)res, l, is_compat);
1148
0
            } else {
1149
0
                dbuf_put_u32(dbuf, c);
1150
0
            }
1151
0
        }
1152
0
    }
1153
0
}
1154
1155
/* return 0 if not found */
1156
static int compose_pair(uint32_t c0, uint32_t c1)
1157
0
{
1158
    /* Hangul composition */
1159
0
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
1160
0
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
1161
0
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
1162
0
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
1163
0
               (c0 - 0xac00) % 28 == 0 &&
1164
0
               c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
1165
0
        return c0 + c1 - 0x11a7;
1166
0
    } else {
1167
0
        return unicode_compose_pair(c0, c1);
1168
0
    }
1169
0
}
1170
1171
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
1172
                      UnicodeNormalizationEnum n_type,
1173
                      void *opaque, DynBufReallocFunc *realloc_func)
1174
0
{
1175
0
    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
1176
0
    BOOL is_compat;
1177
0
    DynBuf dbuf_s, *dbuf = &dbuf_s;
1178
1179
0
    is_compat = n_type >> 1;
1180
1181
0
    dbuf_init2(dbuf, opaque, realloc_func);
1182
0
    if (dbuf_realloc(dbuf, sizeof(int) * src_len))
1183
0
        goto fail;
1184
1185
    /* common case: latin1 is unaffected by NFC */
1186
0
    if (n_type == UNICODE_NFC) {
1187
0
        for(i = 0; i < src_len; i++) {
1188
0
            if (src[i] >= 0x100)
1189
0
                goto not_latin1;
1190
0
        }
1191
0
        buf = (int *)dbuf->buf;
1192
0
        memcpy(buf, src, src_len * sizeof(int));
1193
0
        *pdst = (uint32_t *)buf;
1194
0
        return src_len;
1195
0
    not_latin1: ;
1196
0
    }
1197
1198
0
    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
1199
0
    if (dbuf_error(dbuf)) {
1200
0
    fail:
1201
0
        *pdst = NULL;
1202
0
        return -1;
1203
0
    }
1204
0
    buf = (int *)dbuf->buf;
1205
0
    buf_len = dbuf->size / sizeof(int);
1206
1207
0
    sort_cc(buf, buf_len);
1208
1209
0
    if (buf_len <= 1 || (n_type & 1) != 0) {
1210
        /* NFD / NFKD */
1211
0
        *pdst = (uint32_t *)buf;
1212
0
        return buf_len;
1213
0
    }
1214
1215
0
    i = 1;
1216
0
    out_len = 1;
1217
0
    while (i < buf_len) {
1218
        /* find the starter character and test if it is blocked from
1219
           the character at 'i' */
1220
0
        last_cc = unicode_get_cc(buf[i]);
1221
0
        starter_pos = out_len - 1;
1222
0
        while (starter_pos >= 0) {
1223
0
            cc = unicode_get_cc(buf[starter_pos]);
1224
0
            if (cc == 0)
1225
0
                break;
1226
0
            if (cc >= last_cc)
1227
0
                goto next;
1228
0
            last_cc = 256;
1229
0
            starter_pos--;
1230
0
        }
1231
0
        if (starter_pos >= 0 &&
1232
0
            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
1233
0
            buf[starter_pos] = p;
1234
0
            i++;
1235
0
        } else {
1236
0
        next:
1237
0
            buf[out_len++] = buf[i++];
1238
0
        }
1239
0
    }
1240
0
    *pdst = (uint32_t *)buf;
1241
0
    return out_len;
1242
0
}
1243
1244
/* char ranges for various unicode properties */
1245
1246
static int unicode_find_name(const char *name_table, const char *name)
1247
0
{
1248
0
    const char *p, *r;
1249
0
    int pos;
1250
0
    size_t name_len, len;
1251
1252
0
    p = name_table;
1253
0
    pos = 0;
1254
0
    name_len = strlen(name);
1255
0
    while (*p) {
1256
0
        for(;;) {
1257
0
            r = strchr(p, ',');
1258
0
            if (!r)
1259
0
                len = strlen(p);
1260
0
            else
1261
0
                len = r - p;
1262
0
            if (len == name_len && !memcmp(p, name, name_len))
1263
0
                return pos;
1264
0
            p += len + 1;
1265
0
            if (!r)
1266
0
                break;
1267
0
        }
1268
0
        pos++;
1269
0
    }
1270
0
    return -1;
1271
0
}
1272
1273
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1274
   if not found */
1275
int unicode_script(CharRange *cr,
1276
                   const char *script_name, BOOL is_ext)
1277
0
{
1278
0
    int script_idx;
1279
0
    const uint8_t *p, *p_end;
1280
0
    uint32_t c, c1, b, n, v, v_len, i, type;
1281
0
    CharRange cr1_s, *cr1;
1282
0
    CharRange cr2_s, *cr2 = &cr2_s;
1283
0
    BOOL is_common;
1284
1285
0
    script_idx = unicode_find_name(unicode_script_name_table, script_name);
1286
0
    if (script_idx < 0)
1287
0
        return -2;
1288
1289
0
    is_common = (script_idx == UNICODE_SCRIPT_Common ||
1290
0
                 script_idx == UNICODE_SCRIPT_Inherited);
1291
0
    if (is_ext) {
1292
0
        cr1 = &cr1_s;
1293
0
        cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1294
0
        cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1295
0
    } else {
1296
0
        cr1 = cr;
1297
0
    }
1298
1299
0
    p = unicode_script_table;
1300
0
    p_end = unicode_script_table + countof(unicode_script_table);
1301
0
    c = 0;
1302
0
    while (p < p_end) {
1303
0
        b = *p++;
1304
0
        type = b >> 7;
1305
0
        n = b & 0x7f;
1306
0
        if (n < 96) {
1307
0
        } else if (n < 112) {
1308
0
            n = (n - 96) << 8;
1309
0
            n |= *p++;
1310
0
            n += 96;
1311
0
        } else {
1312
0
            n = (n - 112) << 16;
1313
0
            n |= *p++ << 8;
1314
0
            n |= *p++;
1315
0
            n += 96 + (1 << 12);
1316
0
        }
1317
0
        c1 = c + n + 1;
1318
0
        if (type != 0) {
1319
0
            v = *p++;
1320
0
            if (v == script_idx || script_idx == UNICODE_SCRIPT_Unknown) {
1321
0
                if (cr_add_interval(cr1, c, c1))
1322
0
                    goto fail;
1323
0
            }
1324
0
        }
1325
0
        c = c1;
1326
0
    }
1327
0
    if (script_idx == UNICODE_SCRIPT_Unknown) {
1328
        /* Unknown is all the characters outside scripts */
1329
0
        if (cr_invert(cr1))
1330
0
            goto fail;
1331
0
    }
1332
1333
0
    if (is_ext) {
1334
        /* add the script extensions */
1335
0
        p = unicode_script_ext_table;
1336
0
        p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1337
0
        c = 0;
1338
0
        while (p < p_end) {
1339
0
            b = *p++;
1340
0
            if (b < 128) {
1341
0
                n = b;
1342
0
            } else if (b < 128 + 64) {
1343
0
                n = (b - 128) << 8;
1344
0
                n |= *p++;
1345
0
                n += 128;
1346
0
            } else {
1347
0
                n = (b - 128 - 64) << 16;
1348
0
                n |= *p++ << 8;
1349
0
                n |= *p++;
1350
0
                n += 128 + (1 << 14);
1351
0
            }
1352
0
            c1 = c + n + 1;
1353
0
            v_len = *p++;
1354
0
            if (is_common) {
1355
0
                if (v_len != 0) {
1356
0
                    if (cr_add_interval(cr2, c, c1))
1357
0
                        goto fail;
1358
0
                }
1359
0
            } else {
1360
0
                for(i = 0; i < v_len; i++) {
1361
0
                    if (p[i] == script_idx) {
1362
0
                        if (cr_add_interval(cr2, c, c1))
1363
0
                            goto fail;
1364
0
                        break;
1365
0
                    }
1366
0
                }
1367
0
            }
1368
0
            p += v_len;
1369
0
            c = c1;
1370
0
        }
1371
0
        if (is_common) {
1372
            /* remove all the characters with script extensions */
1373
0
            if (cr_invert(cr2))
1374
0
                goto fail;
1375
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1376
0
                      CR_OP_INTER))
1377
0
                goto fail;
1378
0
        } else {
1379
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1380
0
                      CR_OP_UNION))
1381
0
                goto fail;
1382
0
        }
1383
0
        cr_free(cr1);
1384
0
        cr_free(cr2);
1385
0
    }
1386
0
    return 0;
1387
0
 fail:
1388
0
    if (is_ext) {
1389
0
        cr_free(cr1);
1390
0
        cr_free(cr2);
1391
0
    }
1392
0
    goto fail;
1393
0
}
1394
1395
0
#define M(id) (1U << UNICODE_GC_ ## id)
1396
1397
static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1398
0
{
1399
0
    const uint8_t *p, *p_end;
1400
0
    uint32_t c, c0, b, n, v;
1401
1402
0
    p = unicode_gc_table;
1403
0
    p_end = unicode_gc_table + countof(unicode_gc_table);
1404
0
    c = 0;
1405
    /* Compressed range encoding:
1406
       initial byte:
1407
       bits 0..4: category number (special case 31)
1408
       bits 5..7: range length (add 1)
1409
       special case bits 5..7 == 7: read an extra byte
1410
       - 00..7F: range length (add 7 + 1)
1411
       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1412
       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1413
     */
1414
0
    while (p < p_end) {
1415
0
        b = *p++;
1416
0
        n = b >> 5;
1417
0
        v = b & 0x1f;
1418
0
        if (n == 7) {
1419
0
            n = *p++;
1420
0
            if (n < 128) {
1421
0
                n += 7;
1422
0
            } else if (n < 128 + 64) {
1423
0
                n = (n - 128) << 8;
1424
0
                n |= *p++;
1425
0
                n += 7 + 128;
1426
0
            } else {
1427
0
                n = (n - 128 - 64) << 16;
1428
0
                n |= *p++ << 8;
1429
0
                n |= *p++;
1430
0
                n += 7 + 128 + (1 << 14);
1431
0
            }
1432
0
        }
1433
0
        c0 = c;
1434
0
        c += n + 1;
1435
0
        if (v == 31) {
1436
            /* run of Lu / Ll */
1437
0
            b = gc_mask & (M(Lu) | M(Ll));
1438
0
            if (b != 0) {
1439
0
                if (b == (M(Lu) | M(Ll))) {
1440
0
                    goto add_range;
1441
0
                } else {
1442
0
                    c0 += ((gc_mask & M(Ll)) != 0);
1443
0
                    for(; c0 < c; c0 += 2) {
1444
0
                        if (cr_add_interval(cr, c0, c0 + 1))
1445
0
                            return -1;
1446
0
                    }
1447
0
                }
1448
0
            }
1449
0
        } else if ((gc_mask >> v) & 1) {
1450
0
        add_range:
1451
0
            if (cr_add_interval(cr, c0, c))
1452
0
                return -1;
1453
0
        }
1454
0
    }
1455
0
    return 0;
1456
0
}
1457
1458
static int unicode_prop1(CharRange *cr, int prop_idx)
1459
0
{
1460
0
    const uint8_t *p, *p_end;
1461
0
    uint32_t c, c0, b, bit;
1462
1463
0
    p = unicode_prop_table[prop_idx];
1464
0
    p_end = p + unicode_prop_len_table[prop_idx];
1465
0
    c = 0;
1466
0
    bit = 0;
1467
    /* Compressed range encoding:
1468
       00..3F: 2 packed lengths: 3-bit + 3-bit
1469
       40..5F: 5-bits plus extra byte for length
1470
       60..7F: 5-bits plus 2 extra bytes for length
1471
       80..FF: 7-bit length
1472
       lengths must be incremented to get character count
1473
       Ranges alternate between false and true return value.
1474
     */
1475
0
    while (p < p_end) {
1476
0
        c0 = c;
1477
0
        b = *p++;
1478
0
        if (b < 64) {
1479
0
            c += (b >> 3) + 1;
1480
0
            if (bit)  {
1481
0
                if (cr_add_interval(cr, c0, c))
1482
0
                    return -1;
1483
0
            }
1484
0
            bit ^= 1;
1485
0
            c0 = c;
1486
0
            c += (b & 7) + 1;
1487
0
        } else if (b >= 0x80) {
1488
0
            c += b - 0x80 + 1;
1489
0
        } else if (b < 0x60) {
1490
0
            c += (((b - 0x40) << 8) | p[0]) + 1;
1491
0
            p++;
1492
0
        } else {
1493
0
            c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
1494
0
            p += 2;
1495
0
        }
1496
0
        if (bit)  {
1497
0
            if (cr_add_interval(cr, c0, c))
1498
0
                return -1;
1499
0
        }
1500
0
        bit ^= 1;
1501
0
    }
1502
0
    return 0;
1503
0
}
1504
1505
typedef enum {
1506
    POP_GC,
1507
    POP_PROP,
1508
    POP_CASE,
1509
    POP_UNION,
1510
    POP_INTER,
1511
    POP_XOR,
1512
    POP_INVERT,
1513
    POP_END,
1514
} PropOPEnum;
1515
1516
#define POP_STACK_LEN_MAX 4
1517
1518
static int unicode_prop_ops(CharRange *cr, ...)
1519
0
{
1520
0
    va_list ap;
1521
0
    CharRange stack[POP_STACK_LEN_MAX];
1522
0
    int stack_len, op, ret, i;
1523
0
    uint32_t a;
1524
1525
0
    va_start(ap, cr);
1526
0
    stack_len = 0;
1527
0
    for(;;) {
1528
0
        op = va_arg(ap, int);
1529
0
        switch(op) {
1530
0
        case POP_GC:
1531
0
            assert(stack_len < POP_STACK_LEN_MAX);
1532
0
            a = va_arg(ap, int);
1533
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1534
0
            if (unicode_general_category1(&stack[stack_len - 1], a))
1535
0
                goto fail;
1536
0
            break;
1537
0
        case POP_PROP:
1538
0
            assert(stack_len < POP_STACK_LEN_MAX);
1539
0
            a = va_arg(ap, int);
1540
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1541
0
            if (unicode_prop1(&stack[stack_len - 1], a))
1542
0
                goto fail;
1543
0
            break;
1544
0
        case POP_CASE:
1545
0
            assert(stack_len < POP_STACK_LEN_MAX);
1546
0
            a = va_arg(ap, int);
1547
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1548
0
            if (unicode_case1(&stack[stack_len - 1], a))
1549
0
                goto fail;
1550
0
            break;
1551
0
        case POP_UNION:
1552
0
        case POP_INTER:
1553
0
        case POP_XOR:
1554
0
            {
1555
0
                CharRange *cr1, *cr2, *cr3;
1556
0
                assert(stack_len >= 2);
1557
0
                assert(stack_len < POP_STACK_LEN_MAX);
1558
0
                cr1 = &stack[stack_len - 2];
1559
0
                cr2 = &stack[stack_len - 1];
1560
0
                cr3 = &stack[stack_len++];
1561
0
                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1562
                /* CR_OP_XOR may be used here */
1563
0
                if (cr_op(cr3, cr1->points, cr1->len,
1564
0
                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
1565
0
                    goto fail;
1566
0
                cr_free(cr1);
1567
0
                cr_free(cr2);
1568
0
                *cr1 = *cr3;
1569
0
                stack_len -= 2;
1570
0
            }
1571
0
            break;
1572
0
        case POP_INVERT:
1573
0
            assert(stack_len >= 1);
1574
0
            if (cr_invert(&stack[stack_len - 1]))
1575
0
                goto fail;
1576
0
            break;
1577
0
        case POP_END:
1578
0
            goto done;
1579
0
        default:
1580
0
            abort();
1581
0
        }
1582
0
    }
1583
0
 done:
1584
0
    assert(stack_len == 1);
1585
0
    ret = cr_copy(cr, &stack[0]);
1586
0
    cr_free(&stack[0]);
1587
0
    return ret;
1588
0
 fail:
1589
0
    for(i = 0; i < stack_len; i++)
1590
0
        cr_free(&stack[i]);
1591
0
    return -1;
1592
0
}
1593
1594
static const uint32_t unicode_gc_mask_table[] = {
1595
    M(Lu) | M(Ll) | M(Lt), /* LC */
1596
    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
1597
    M(Mn) | M(Mc) | M(Me), /* M */
1598
    M(Nd) | M(Nl) | M(No), /* N */
1599
    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
1600
    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
1601
    M(Zs) | M(Zl) | M(Zp), /* Z */
1602
    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
1603
};
1604
1605
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1606
   if not found */
1607
int unicode_general_category(CharRange *cr, const char *gc_name)
1608
0
{
1609
0
    int gc_idx;
1610
0
    uint32_t gc_mask;
1611
1612
0
    gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1613
0
    if (gc_idx < 0)
1614
0
        return -2;
1615
0
    if (gc_idx <= UNICODE_GC_Co) {
1616
0
        gc_mask = (uint64_t)1 << gc_idx;
1617
0
    } else {
1618
0
        gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1619
0
    }
1620
0
    return unicode_general_category1(cr, gc_mask);
1621
0
}
1622
1623
1624
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1625
   if not found */
1626
int unicode_prop(CharRange *cr, const char *prop_name)
1627
0
{
1628
0
    int prop_idx, ret;
1629
1630
0
    prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1631
0
    if (prop_idx < 0)
1632
0
        return -2;
1633
0
    prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1634
1635
0
    ret = 0;
1636
0
    switch(prop_idx) {
1637
0
    case UNICODE_PROP_ASCII:
1638
0
        if (cr_add_interval(cr, 0x00, 0x7f + 1))
1639
0
            return -1;
1640
0
        break;
1641
0
    case UNICODE_PROP_Any:
1642
0
        if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1643
0
            return -1;
1644
0
        break;
1645
0
    case UNICODE_PROP_Assigned:
1646
0
        ret = unicode_prop_ops(cr,
1647
0
                               POP_GC, M(Cn),
1648
0
                               POP_INVERT,
1649
0
                               POP_END);
1650
0
        break;
1651
0
    case UNICODE_PROP_Math:
1652
0
        ret = unicode_prop_ops(cr,
1653
0
                               POP_GC, M(Sm),
1654
0
                               POP_PROP, UNICODE_PROP_Other_Math,
1655
0
                               POP_UNION,
1656
0
                               POP_END);
1657
0
        break;
1658
0
    case UNICODE_PROP_Lowercase:
1659
0
        ret = unicode_prop_ops(cr,
1660
0
                               POP_GC, M(Ll),
1661
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1662
0
                               POP_UNION,
1663
0
                               POP_END);
1664
0
        break;
1665
0
    case UNICODE_PROP_Uppercase:
1666
0
        ret = unicode_prop_ops(cr,
1667
0
                               POP_GC, M(Lu),
1668
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1669
0
                               POP_UNION,
1670
0
                               POP_END);
1671
0
        break;
1672
0
    case UNICODE_PROP_Cased:
1673
0
        ret = unicode_prop_ops(cr,
1674
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt),
1675
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1676
0
                               POP_UNION,
1677
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1678
0
                               POP_UNION,
1679
0
                               POP_END);
1680
0
        break;
1681
0
    case UNICODE_PROP_Alphabetic:
1682
0
        ret = unicode_prop_ops(cr,
1683
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1684
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1685
0
                               POP_UNION,
1686
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1687
0
                               POP_UNION,
1688
0
                               POP_PROP, UNICODE_PROP_Other_Alphabetic,
1689
0
                               POP_UNION,
1690
0
                               POP_END);
1691
0
        break;
1692
0
    case UNICODE_PROP_Grapheme_Base:
1693
0
        ret = unicode_prop_ops(cr,
1694
0
                               POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1695
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1696
0
                               POP_UNION,
1697
0
                               POP_INVERT,
1698
0
                               POP_END);
1699
0
        break;
1700
0
    case UNICODE_PROP_Grapheme_Extend:
1701
0
        ret = unicode_prop_ops(cr,
1702
0
                               POP_GC, M(Me) | M(Mn),
1703
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1704
0
                               POP_UNION,
1705
0
                               POP_END);
1706
0
        break;
1707
0
    case UNICODE_PROP_XID_Start:
1708
0
        ret = unicode_prop_ops(cr,
1709
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1710
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1711
0
                               POP_UNION,
1712
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1713
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1714
0
                               POP_UNION,
1715
0
                               POP_PROP, UNICODE_PROP_XID_Start1,
1716
0
                               POP_UNION,
1717
0
                               POP_INVERT,
1718
0
                               POP_INTER,
1719
0
                               POP_END);
1720
0
        break;
1721
0
    case UNICODE_PROP_XID_Continue:
1722
0
        ret = unicode_prop_ops(cr,
1723
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1724
0
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1725
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1726
0
                               POP_UNION,
1727
0
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1728
0
                               POP_UNION,
1729
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1730
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1731
0
                               POP_UNION,
1732
0
                               POP_PROP, UNICODE_PROP_XID_Continue1,
1733
0
                               POP_UNION,
1734
0
                               POP_INVERT,
1735
0
                               POP_INTER,
1736
0
                               POP_END);
1737
0
        break;
1738
0
    case UNICODE_PROP_Changes_When_Uppercased:
1739
0
        ret = unicode_case1(cr, CASE_U);
1740
0
        break;
1741
0
    case UNICODE_PROP_Changes_When_Lowercased:
1742
0
        ret = unicode_case1(cr, CASE_L);
1743
0
        break;
1744
0
    case UNICODE_PROP_Changes_When_Casemapped:
1745
0
        ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1746
0
        break;
1747
0
    case UNICODE_PROP_Changes_When_Titlecased:
1748
0
        ret = unicode_prop_ops(cr,
1749
0
                               POP_CASE, CASE_U,
1750
0
                               POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1751
0
                               POP_XOR,
1752
0
                               POP_END);
1753
0
        break;
1754
0
    case UNICODE_PROP_Changes_When_Casefolded:
1755
0
        ret = unicode_prop_ops(cr,
1756
0
                               POP_CASE, CASE_F,
1757
0
                               POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1758
0
                               POP_XOR,
1759
0
                               POP_END);
1760
0
        break;
1761
0
    case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1762
0
        ret = unicode_prop_ops(cr,
1763
0
                               POP_CASE, CASE_F,
1764
0
                               POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1765
0
                               POP_XOR,
1766
0
                               POP_END);
1767
0
        break;
1768
#if 0
1769
    case UNICODE_PROP_ID_Start:
1770
        ret = unicode_prop_ops(cr,
1771
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1772
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1773
                               POP_UNION,
1774
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1775
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1776
                               POP_UNION,
1777
                               POP_INVERT,
1778
                               POP_INTER,
1779
                               POP_END);
1780
        break;
1781
    case UNICODE_PROP_ID_Continue:
1782
        ret = unicode_prop_ops(cr,
1783
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1784
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1785
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1786
                               POP_UNION,
1787
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1788
                               POP_UNION,
1789
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1790
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1791
                               POP_UNION,
1792
                               POP_INVERT,
1793
                               POP_INTER,
1794
                               POP_END);
1795
        break;
1796
    case UNICODE_PROP_Case_Ignorable:
1797
        ret = unicode_prop_ops(cr,
1798
                               POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1799
                               POP_PROP, UNICODE_PROP_Case_Ignorable1,
1800
                               POP_XOR,
1801
                               POP_END);
1802
        break;
1803
#else
1804
        /* we use the existing tables */
1805
0
    case UNICODE_PROP_ID_Continue:
1806
0
        ret = unicode_prop_ops(cr,
1807
0
                               POP_PROP, UNICODE_PROP_ID_Start,
1808
0
                               POP_PROP, UNICODE_PROP_ID_Continue1,
1809
0
                               POP_XOR,
1810
0
                               POP_END);
1811
0
        break;
1812
0
#endif
1813
0
    default:
1814
0
        if (prop_idx >= countof(unicode_prop_table))
1815
0
            return -2;
1816
0
        ret = unicode_prop1(cr, prop_idx);
1817
0
        break;
1818
0
    }
1819
0
    return ret;
1820
0
}
1821
1822
#endif /* CONFIG_ALL_UNICODE */
1823
1824
/*---- lre codepoint categorizing functions ----*/
1825
1826
#define S  UNICODE_C_SPACE
1827
#define D  UNICODE_C_DIGIT
1828
#define X  UNICODE_C_XDIGIT
1829
#define U  UNICODE_C_UPPER
1830
#define L  UNICODE_C_LOWER
1831
#define _  UNICODE_C_UNDER
1832
#define d  UNICODE_C_DOLLAR
1833
1834
uint8_t const lre_ctype_bits[256] = {
1835
    0, 0, 0, 0, 0, 0, 0, 0,
1836
    0, S, S, S, S, S, 0, 0,
1837
    0, 0, 0, 0, 0, 0, 0, 0,
1838
    0, 0, 0, 0, 0, 0, 0, 0,
1839
1840
    S, 0, 0, 0, d, 0, 0, 0,
1841
    0, 0, 0, 0, 0, 0, 0, 0,
1842
    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1843
    X|D, X|D, 0, 0, 0, 0, 0, 0,
1844
1845
    0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1846
    U, U, U, U, U, U, U, U,
1847
    U, U, U, U, U, U, U, U,
1848
    U, U, U, 0, 0, 0, 0, _,
1849
1850
    0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1851
    L, L, L, L, L, L, L, L,
1852
    L, L, L, L, L, L, L, L,
1853
    L, L, L, 0, 0, 0, 0, 0,
1854
1855
    0, 0, 0, 0, 0, 0, 0, 0,
1856
    0, 0, 0, 0, 0, 0, 0, 0,
1857
    0, 0, 0, 0, 0, 0, 0, 0,
1858
    0, 0, 0, 0, 0, 0, 0, 0,
1859
1860
    S, 0, 0, 0, 0, 0, 0, 0,
1861
    0, 0, 0, 0, 0, 0, 0, 0,
1862
    0, 0, 0, 0, 0, 0, 0, 0,
1863
    0, 0, 0, 0, 0, 0, 0, 0,
1864
1865
    0, 0, 0, 0, 0, 0, 0, 0,
1866
    0, 0, 0, 0, 0, 0, 0, 0,
1867
    0, 0, 0, 0, 0, 0, 0, 0,
1868
    0, 0, 0, 0, 0, 0, 0, 0,
1869
1870
    0, 0, 0, 0, 0, 0, 0, 0,
1871
    0, 0, 0, 0, 0, 0, 0, 0,
1872
    0, 0, 0, 0, 0, 0, 0, 0,
1873
    0, 0, 0, 0, 0, 0, 0, 0,
1874
};
1875
1876
#undef S
1877
#undef D
1878
#undef X
1879
#undef U
1880
#undef L
1881
#undef _
1882
#undef d
1883
1884
/* code point ranges for Zs,Zl or Zp property */
1885
static const uint16_t char_range_s[] = {
1886
    10,
1887
    0x0009, 0x000D + 1,
1888
    0x0020, 0x0020 + 1,
1889
    0x00A0, 0x00A0 + 1,
1890
    0x1680, 0x1680 + 1,
1891
    0x2000, 0x200A + 1,
1892
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
1893
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
1894
    0x2028, 0x2029 + 1,
1895
    0x202F, 0x202F + 1,
1896
    0x205F, 0x205F + 1,
1897
    0x3000, 0x3000 + 1,
1898
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
1899
    0xFEFF, 0xFEFF + 1,
1900
};
1901
1902
BOOL lre_is_space_non_ascii(uint32_t c)
1903
0
{
1904
0
    size_t i, n;
1905
1906
0
    n = countof(char_range_s);
1907
0
    for(i = 5; i < n; i += 2) {
1908
0
        uint32_t low = char_range_s[i];
1909
0
        uint32_t high = char_range_s[i + 1];
1910
0
        if (c < low)
1911
0
            return FALSE;
1912
0
        if (c < high)
1913
0
            return TRUE;
1914
0
    }
1915
0
    return FALSE;
1916
0
}
1917
1918
#define SEQ_MAX_LEN 16
1919
1920
static int unicode_sequence_prop1(int seq_prop_idx, UnicodeSequencePropCB *cb, void *opaque,
1921
                                  CharRange *cr)
1922
0
{
1923
0
    int i, c, j;
1924
0
    uint32_t seq[SEQ_MAX_LEN];
1925
    
1926
0
    switch(seq_prop_idx) {
1927
0
    case UNICODE_SEQUENCE_PROP_Basic_Emoji:
1928
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji1) < 0)
1929
0
            return -1;
1930
0
        for(i = 0; i < cr->len; i += 2) {
1931
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1932
0
                seq[0] = c;
1933
0
                cb(opaque, seq, 1);
1934
0
            }
1935
0
        }
1936
1937
0
        cr->len = 0;
1938
1939
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji2) < 0)
1940
0
            return -1;
1941
0
        for(i = 0; i < cr->len; i += 2) {
1942
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1943
0
                seq[0] = c;
1944
0
                seq[1] = 0xfe0f;
1945
0
                cb(opaque, seq, 2);
1946
0
            }
1947
0
        }
1948
1949
0
        break;
1950
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
1951
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Modifier_Base) < 0)
1952
0
            return -1;
1953
0
        for(i = 0; i < cr->len; i += 2) {
1954
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1955
0
                for(j = 0; j < 5; j++) {
1956
0
                    seq[0] = c;
1957
0
                    seq[1] = 0x1f3fb + j;
1958
0
                    cb(opaque, seq, 2);
1959
0
                }
1960
0
            }
1961
0
        }
1962
0
        break;
1963
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
1964
0
        if (unicode_prop1(cr, UNICODE_PROP_RGI_Emoji_Flag_Sequence) < 0)
1965
0
            return -1;
1966
0
        for(i = 0; i < cr->len; i += 2) {
1967
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1968
0
                int c0, c1;
1969
0
                c0 = c / 26;
1970
0
                c1 = c % 26;
1971
0
                seq[0] = 0x1F1E6 + c0;
1972
0
                seq[1] = 0x1F1E6 + c1;
1973
0
                cb(opaque, seq, 2);
1974
0
            }
1975
0
        }
1976
0
        break;
1977
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
1978
0
        {
1979
0
            int len, code, pres, k, mod, mod_count, mod_pos[2], hc_pos, n_mod, n_hc, mod1;
1980
0
            int mod_idx, hc_idx, i0, i1;
1981
0
            const uint8_t *tab = unicode_rgi_emoji_zwj_sequence;
1982
            
1983
0
            for(i = 0; i < countof(unicode_rgi_emoji_zwj_sequence);) {
1984
0
                len = tab[i++];
1985
0
                k = 0;
1986
0
                mod = 0;
1987
0
                mod_count = 0;
1988
0
                hc_pos = -1;
1989
0
                for(j = 0; j < len; j++) {
1990
0
                    code = tab[i++];
1991
0
                    code |= tab[i++] << 8;
1992
0
                    pres = code >> 15;
1993
0
                    mod1 = (code >> 13) & 3;
1994
0
                    code &= 0x1fff;
1995
0
                    if (code < 0x1000) {
1996
0
                        c = code + 0x2000;
1997
0
                    } else {
1998
0
                        c = 0x1f000 + (code - 0x1000);
1999
0
                    }
2000
0
                    if (c == 0x1f9b0)
2001
0
                        hc_pos = k;
2002
0
                    seq[k++] = c;
2003
0
                    if (mod1 != 0) {
2004
0
                        assert(mod_count < 2);
2005
0
                        mod = mod1;
2006
0
                        mod_pos[mod_count++] = k;
2007
0
                        seq[k++] = 0; /* will be filled later */
2008
0
                    }
2009
0
                    if (pres) {
2010
0
                        seq[k++] = 0xfe0f;
2011
0
                    }
2012
0
                    if (j < len - 1) {
2013
0
                        seq[k++] = 0x200d;
2014
0
                    }
2015
0
                }
2016
2017
                /* genrate all the variants */
2018
0
                switch(mod) {
2019
0
                case 1:
2020
0
                    n_mod = 5;
2021
0
                    break;
2022
0
                case 2:
2023
0
                    n_mod = 25;
2024
0
                    break;
2025
0
                case 3:
2026
0
                    n_mod = 20;
2027
0
                    break;
2028
0
                default:
2029
0
                    n_mod = 1;
2030
0
                    break;
2031
0
                }
2032
0
                if (hc_pos >= 0)
2033
0
                    n_hc = 4;
2034
0
                else
2035
0
                    n_hc = 1;
2036
0
                for(hc_idx = 0; hc_idx < n_hc; hc_idx++) {
2037
0
                    for(mod_idx = 0; mod_idx < n_mod; mod_idx++) {
2038
0
                        if (hc_pos >= 0)
2039
0
                            seq[hc_pos] = 0x1f9b0 + hc_idx;
2040
                        
2041
0
                        switch(mod) {
2042
0
                        case 1:
2043
0
                            seq[mod_pos[0]] = 0x1f3fb + mod_idx;
2044
0
                            break;
2045
0
                        case 2:
2046
0
                        case 3:
2047
0
                            i0 = mod_idx / 5;
2048
0
                            i1 = mod_idx % 5;
2049
                            /* avoid identical values */
2050
0
                            if (mod == 3 && i0 >= i1)
2051
0
                                i0++;
2052
0
                            seq[mod_pos[0]] = 0x1f3fb + i0;
2053
0
                            seq[mod_pos[1]] = 0x1f3fb + i1;
2054
0
                            break;
2055
0
                        default:
2056
0
                            break;
2057
0
                        }
2058
#if 0
2059
                        for(j = 0; j < k; j++)
2060
                            printf(" %04x", seq[j]);
2061
                        printf("\n");
2062
#endif                
2063
0
                        cb(opaque, seq, k);
2064
0
                    }
2065
0
                }
2066
0
            }
2067
0
        }
2068
0
        break;
2069
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
2070
0
        {
2071
0
            for(i = 0; i < countof(unicode_rgi_emoji_tag_sequence);) {
2072
0
                j = 0;
2073
0
                seq[j++] = 0x1F3F4;
2074
0
                for(;;) {
2075
0
                    c = unicode_rgi_emoji_tag_sequence[i++];
2076
0
                    if (c == 0x00)
2077
0
                        break;
2078
0
                    seq[j++] = 0xe0000 + c;
2079
0
                }
2080
0
                seq[j++] = 0xe007f;
2081
0
                cb(opaque, seq, j);
2082
0
            }
2083
0
        }
2084
0
        break;
2085
0
    case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence:
2086
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Keycap_Sequence) < 0)
2087
0
            return -1;
2088
0
        for(i = 0; i < cr->len; i += 2) {
2089
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
2090
0
                seq[0] = c;
2091
0
                seq[1] = 0xfe0f;
2092
0
                seq[2] = 0x20e3;
2093
0
                cb(opaque, seq, 3);
2094
0
            }
2095
0
        }
2096
0
        break;
2097
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji:
2098
        /* all prevous sequences */
2099
0
        for(i = UNICODE_SEQUENCE_PROP_Basic_Emoji; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence; i++) {
2100
0
            int ret;
2101
0
            ret = unicode_sequence_prop1(i, cb, opaque, cr);
2102
0
            if (ret < 0)
2103
0
                return ret;
2104
0
            cr->len = 0;
2105
0
        }
2106
0
        break;
2107
0
    default:
2108
0
        return -2;
2109
0
    }
2110
0
    return 0;
2111
0
}
2112
2113
/* build a unicode sequence property */
2114
/* return -2 if not found, -1 if other error. 'cr' is used as temporary memory. */
2115
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
2116
                          CharRange *cr)
2117
0
{
2118
0
    int seq_prop_idx;
2119
0
    seq_prop_idx = unicode_find_name(unicode_sequence_prop_name_table, prop_name);
2120
0
    if (seq_prop_idx < 0)
2121
0
        return -2;
2122
0
    return unicode_sequence_prop1(seq_prop_idx, cb, opaque, cr);
2123
0
}