Coverage Report

Created: 2026-03-31 06:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/quickjs/libunicode.c
Line
Count
Source
1
/*
2
 * Unicode utilities
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <string.h>
28
#include <assert.h>
29
30
#include "cutils.h"
31
#include "libunicode.h"
32
#include "libunicode-table.h"
33
34
enum {
35
    RUN_TYPE_U,
36
    RUN_TYPE_L,
37
    RUN_TYPE_UF,
38
    RUN_TYPE_LF,
39
    RUN_TYPE_UL,
40
    RUN_TYPE_LSU,
41
    RUN_TYPE_U2L_399_EXT2,
42
    RUN_TYPE_UF_D20,
43
    RUN_TYPE_UF_D1_EXT,
44
    RUN_TYPE_U_EXT,
45
    RUN_TYPE_LF_EXT,
46
    RUN_TYPE_UF_EXT2,
47
    RUN_TYPE_LF_EXT2,
48
    RUN_TYPE_UF_EXT3,
49
};
50
51
static int lre_case_conv1(uint32_t c, int conv_type)
52
0
{
53
0
    uint32_t res[LRE_CC_RES_LEN_MAX];
54
0
    lre_case_conv(res, c, conv_type);
55
0
    return res[0];
56
0
}
57
58
/* case conversion using the table entry 'idx' with value 'v' */
59
static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
60
94.2M
{
61
94.2M
    uint32_t code, data, type, a, is_lower;
62
94.2M
    is_lower = (conv_type != 0);
63
94.2M
    type = (v >> (32 - 17 - 7 - 4)) & 0xf;
64
94.2M
    data = ((v & 0xf) << 8) | case_conv_table2[idx];
65
94.2M
    code = v >> (32 - 17);
66
94.2M
    switch(type) {
67
36.6M
    case RUN_TYPE_U:
68
36.6M
    case RUN_TYPE_L:
69
43.2M
    case RUN_TYPE_UF:
70
43.2M
    case RUN_TYPE_LF:
71
43.2M
        if (conv_type == (type & 1) ||
72
43.2M
            (type >= RUN_TYPE_UF && conv_type == 2)) {
73
43.2M
            c = c - code + (case_conv_table1[data] >> (32 - 17));
74
43.2M
        }
75
43.2M
        break;
76
40.8M
    case RUN_TYPE_UL:
77
40.8M
        a = c - code;
78
40.8M
        if ((a & 1) != (1 - is_lower))
79
1.89k
            break;
80
40.8M
        c = (a ^ 1) + code;
81
40.8M
        break;
82
550k
    case RUN_TYPE_LSU:
83
550k
        a = c - code;
84
550k
        if (a == 1) {
85
275k
            c += 2 * is_lower - 1;
86
275k
        } else if (a == (1 - is_lower) * 2) {
87
275k
            c += (2 * is_lower - 1) * 2;
88
275k
        }
89
550k
        break;
90
2.06M
    case RUN_TYPE_U2L_399_EXT2:
91
2.06M
        if (!is_lower) {
92
2.06M
            res[0] = c - code + case_conv_ext[data >> 6];
93
2.06M
            res[1] = 0x399;
94
2.06M
            return 2;
95
2.06M
        } else {
96
0
            c = c - code + case_conv_ext[data & 0x3f];
97
0
        }
98
0
        break;
99
1.44M
    case RUN_TYPE_UF_D20:
100
1.44M
        if (conv_type == 1)
101
0
            break;
102
1.44M
        c = data + (conv_type == 2) * 0x20;
103
1.44M
        break;
104
229k
    case RUN_TYPE_UF_D1_EXT:
105
229k
        if (conv_type == 1)
106
0
            break;
107
229k
        c = case_conv_ext[data] + (conv_type == 2);
108
229k
        break;
109
68.7k
    case RUN_TYPE_U_EXT:
110
68.7k
    case RUN_TYPE_LF_EXT:
111
68.7k
        if (is_lower != (type - RUN_TYPE_U_EXT))
112
0
            break;
113
68.7k
        c = case_conv_ext[data];
114
68.7k
        break;
115
0
    case RUN_TYPE_LF_EXT2:
116
0
        if (!is_lower)
117
0
            break;
118
0
        res[0] = c - code + case_conv_ext[data >> 6];
119
0
        res[1] = case_conv_ext[data & 0x3f];
120
0
        return 2;
121
4.48M
    case RUN_TYPE_UF_EXT2:
122
4.48M
        if (conv_type == 1)
123
0
            break;
124
4.48M
        res[0] = c - code + case_conv_ext[data >> 6];
125
4.48M
        res[1] = case_conv_ext[data & 0x3f];
126
4.48M
        if (conv_type == 2) {
127
            /* convert to lower */
128
0
            res[0] = lre_case_conv1(res[0], 1);
129
0
            res[1] = lre_case_conv1(res[1], 1);
130
0
        }
131
4.48M
        return 2;
132
0
    default:
133
1.22M
    case RUN_TYPE_UF_EXT3:
134
1.22M
        if (conv_type == 1)
135
0
            break;
136
1.22M
        res[0] = case_conv_ext[data >> 8];
137
1.22M
        res[1] = case_conv_ext[(data >> 4) & 0xf];
138
1.22M
        res[2] = case_conv_ext[data & 0xf];
139
1.22M
        if (conv_type == 2) {
140
            /* convert to lower */
141
0
            res[0] = lre_case_conv1(res[0], 1);
142
0
            res[1] = lre_case_conv1(res[1], 1);
143
0
            res[2] = lre_case_conv1(res[2], 1);
144
0
        }
145
1.22M
        return 3;
146
94.2M
    }
147
86.4M
    res[0] = c;
148
86.4M
    return 1;
149
94.2M
}
150
151
/* conv_type:
152
   0 = to upper
153
   1 = to lower
154
   2 = case folding (= to lower with modifications)
155
*/
156
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
157
0
{
158
0
    if (c < 128) {
159
0
        if (conv_type) {
160
0
            if (c >= 'A' && c <= 'Z') {
161
0
                c = c - 'A' + 'a';
162
0
            }
163
0
        } else {
164
0
            if (c >= 'a' && c <= 'z') {
165
0
                c = c - 'a' + 'A';
166
0
            }
167
0
        }
168
0
    } else {
169
0
        uint32_t v, code, len;
170
0
        int idx, idx_min, idx_max;
171
172
0
        idx_min = 0;
173
0
        idx_max = countof(case_conv_table1) - 1;
174
0
        while (idx_min <= idx_max) {
175
0
            idx = (unsigned)(idx_max + idx_min) / 2;
176
0
            v = case_conv_table1[idx];
177
0
            code = v >> (32 - 17);
178
0
            len = (v >> (32 - 17 - 7)) & 0x7f;
179
0
            if (c < code) {
180
0
                idx_max = idx - 1;
181
0
            } else if (c >= code + len) {
182
0
                idx_min = idx + 1;
183
0
            } else {
184
0
                return lre_case_conv_entry(res, c, conv_type, idx, v);
185
0
            }
186
0
        }
187
0
    }
188
0
    res[0] = c;
189
0
    return 1;
190
0
}
191
192
static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
193
96.0M
{
194
96.0M
    uint32_t res[LRE_CC_RES_LEN_MAX];
195
96.0M
    int len;
196
197
96.0M
    if (is_unicode) {
198
0
        len = lre_case_conv_entry(res, c, 2, idx, v);
199
0
        if (len == 1) {
200
0
            c = res[0];
201
0
        } else {
202
            /* handle the few specific multi-character cases (see
203
               unicode_gen.c:dump_case_folding_special_cases()) */
204
0
            if (c == 0xfb06) {
205
0
                c = 0xfb05;
206
0
            } else if (c == 0x01fd3) {
207
0
                c = 0x390;
208
0
            } else if (c == 0x01fe3) {
209
0
                c = 0x3b0;
210
0
            }
211
0
        }
212
96.0M
    } else {
213
96.0M
        if (likely(c < 128)) {
214
1.78M
            if (c >= 'a' && c <= 'z')
215
1.78M
                c = c - 'a' + 'A';
216
94.2M
        } else {
217
            /* legacy regexp: to upper case if single char >= 128 */
218
94.2M
            len = lre_case_conv_entry(res, c, FALSE, idx, v);
219
94.2M
            if (len == 1 && res[0] >= 128)
220
86.3M
                c = res[0];
221
94.2M
        }
222
96.0M
    }
223
96.0M
    return c;
224
96.0M
}
225
226
/* JS regexp specific rules for case folding */
227
int lre_canonicalize(uint32_t c, BOOL is_unicode)
228
1.42M
{
229
1.42M
    if (c < 128) {
230
        /* fast case */
231
1.38M
        if (is_unicode) {
232
0
            if (c >= 'A' && c <= 'Z') {
233
0
                c = c - 'A' + 'a';
234
0
            }
235
1.38M
        } else {
236
1.38M
            if (c >= 'a' && c <= 'z') {
237
718k
                c = c - 'a' + 'A';
238
718k
            }
239
1.38M
        }
240
1.38M
    } else {
241
33.2k
        uint32_t v, code, len;
242
33.2k
        int idx, idx_min, idx_max;
243
244
33.2k
        idx_min = 0;
245
33.2k
        idx_max = countof(case_conv_table1) - 1;
246
330k
        while (idx_min <= idx_max) {
247
299k
            idx = (unsigned)(idx_max + idx_min) / 2;
248
299k
            v = case_conv_table1[idx];
249
299k
            code = v >> (32 - 17);
250
299k
            len = (v >> (32 - 17 - 7)) & 0x7f;
251
299k
            if (c < code) {
252
37.3k
                idx_max = idx - 1;
253
261k
            } else if (c >= code + len) {
254
259k
                idx_min = idx + 1;
255
259k
            } else {
256
1.90k
                return lre_case_folding_entry(c, idx, v, is_unicode);
257
1.90k
            }
258
299k
        }
259
33.2k
    }
260
1.41M
    return c;
261
1.42M
}
262
263
static uint32_t get_le24(const uint8_t *ptr)
264
148
{
265
148
    return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
266
148
}
267
268
0
#define UNICODE_INDEX_BLOCK_LEN 32
269
270
/* return -1 if not in table, otherwise the offset in the block */
271
static int get_index_pos(uint32_t *pcode, uint32_t c,
272
                         const uint8_t *index_table, int index_table_len)
273
148
{
274
148
    uint32_t code, v;
275
148
    int idx_min, idx_max, idx;
276
277
148
    idx_min = 0;
278
148
    v = get_le24(index_table);
279
148
    code = v & ((1 << 21) - 1);
280
148
    if (c < code) {
281
148
        *pcode = 0;
282
148
        return 0;
283
148
    }
284
0
    idx_max = index_table_len - 1;
285
0
    code = get_le24(index_table + idx_max * 3);
286
0
    if (c >= code)
287
0
        return -1;
288
    /* invariant: tab[idx_min] <= c < tab2[idx_max] */
289
0
    while ((idx_max - idx_min) > 1) {
290
0
        idx = (idx_max + idx_min) / 2;
291
0
        v = get_le24(index_table + idx * 3);
292
0
        code = v & ((1 << 21) - 1);
293
0
        if (c < code) {
294
0
            idx_max = idx;
295
0
        } else {
296
0
            idx_min = idx;
297
0
        }
298
0
    }
299
0
    v = get_le24(index_table + idx_min * 3);
300
0
    *pcode = v & ((1 << 21) - 1);
301
0
    return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
302
0
}
303
304
static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305
                            const uint8_t *index_table, int index_table_len)
306
148
{
307
148
    uint32_t code, b, bit;
308
148
    int pos;
309
148
    const uint8_t *p;
310
311
148
    pos = get_index_pos(&code, c, index_table, index_table_len);
312
148
    if (pos < 0)
313
0
        return FALSE; /* outside the table */
314
148
    p = table + pos;
315
148
    bit = 0;
316
    /* Compressed run length encoding:
317
       00..3F: 2 packed lengths: 3-bit + 3-bit
318
       40..5F: 5-bits plus extra byte for length
319
       60..7F: 5-bits plus 2 extra bytes for length
320
       80..FF: 7-bit length
321
       lengths must be incremented to get character count
322
       Ranges alternate between false and true return value.
323
     */
324
2.07k
    for(;;) {
325
2.07k
        b = *p++;
326
2.07k
        if (b < 64) {
327
296
            code += (b >> 3) + 1;
328
296
            if (c < code)
329
0
                return bit;
330
296
            bit ^= 1;
331
296
            code += (b & 7) + 1;
332
1.77k
        } else if (b >= 0x80) {
333
1.62k
            code += b - 0x80 + 1;
334
1.62k
        } else if (b < 0x60) {
335
148
            code += (((b - 0x40) << 8) | p[0]) + 1;
336
148
            p++;
337
148
        } else {
338
0
            code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
339
0
            p += 2;
340
0
        }
341
2.07k
        if (c < code)
342
148
            return bit;
343
1.92k
        bit ^= 1;
344
1.92k
    }
345
148
}
346
347
BOOL lre_is_cased(uint32_t c)
348
0
{
349
0
    uint32_t v, code, len;
350
0
    int idx, idx_min, idx_max;
351
352
0
    idx_min = 0;
353
0
    idx_max = countof(case_conv_table1) - 1;
354
0
    while (idx_min <= idx_max) {
355
0
        idx = (unsigned)(idx_max + idx_min) / 2;
356
0
        v = case_conv_table1[idx];
357
0
        code = v >> (32 - 17);
358
0
        len = (v >> (32 - 17 - 7)) & 0x7f;
359
0
        if (c < code) {
360
0
            idx_max = idx - 1;
361
0
        } else if (c >= code + len) {
362
0
            idx_min = idx + 1;
363
0
        } else {
364
0
            return TRUE;
365
0
        }
366
0
    }
367
0
    return lre_is_in_table(c, unicode_prop_Cased1_table,
368
0
                           unicode_prop_Cased1_index,
369
0
                           sizeof(unicode_prop_Cased1_index) / 3);
370
0
}
371
372
BOOL lre_is_case_ignorable(uint32_t c)
373
0
{
374
0
    return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
375
0
                           unicode_prop_Case_Ignorable_index,
376
0
                           sizeof(unicode_prop_Case_Ignorable_index) / 3);
377
0
}
378
379
/* character range */
380
381
static __maybe_unused void cr_dump(CharRange *cr)
382
0
{
383
0
    int i;
384
0
    for(i = 0; i < cr->len; i++)
385
0
        printf("%d: 0x%04x\n", i, cr->points[i]);
386
0
}
387
388
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
389
0
{
390
0
    return realloc(ptr, size);
391
0
}
392
393
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
394
763k
{
395
763k
    cr->len = cr->size = 0;
396
763k
    cr->points = NULL;
397
763k
    cr->mem_opaque = mem_opaque;
398
763k
    cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
399
763k
}
400
401
void cr_free(CharRange *cr)
402
2.06M
{
403
2.06M
    cr->realloc_func(cr->mem_opaque, cr->points, 0);
404
2.06M
}
405
406
int cr_realloc(CharRange *cr, int size)
407
20.0M
{
408
20.0M
    int new_size;
409
20.0M
    uint32_t *new_buf;
410
411
20.0M
    if (size > cr->size) {
412
19.8M
        new_size = max_int(size, cr->size * 3 / 2);
413
19.8M
        new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
414
19.8M
                                   new_size * sizeof(cr->points[0]));
415
19.8M
        if (!new_buf)
416
0
            return -1;
417
19.8M
        cr->points = new_buf;
418
19.8M
        cr->size = new_size;
419
19.8M
    }
420
20.0M
    return 0;
421
20.0M
}
422
423
int cr_copy(CharRange *cr, const CharRange *cr1)
424
0
{
425
0
    if (cr_realloc(cr, cr1->len))
426
0
        return -1;
427
0
    memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
428
0
    cr->len = cr1->len;
429
0
    return 0;
430
0
}
431
432
/* merge consecutive intervals and remove empty intervals */
433
static void cr_compress(CharRange *cr)
434
1.80M
{
435
1.80M
    int i, j, k, len;
436
1.80M
    uint32_t *pt;
437
438
1.80M
    pt = cr->points;
439
1.80M
    len = cr->len;
440
1.80M
    i = 0;
441
1.80M
    j = 0;
442
1.80M
    k = 0;
443
344M
    while ((i + 1) < len) {
444
342M
        if (pt[i] == pt[i + 1]) {
445
            /* empty interval */
446
12.3M
            i += 2;
447
330M
        } else {
448
330M
            j = i;
449
338M
            while ((j + 3) < len && pt[j + 1] == pt[j + 2])
450
8.70M
                j += 2;
451
            /* just copy */
452
330M
            pt[k] = pt[i];
453
330M
            pt[k + 1] = pt[j + 1];
454
330M
            k += 2;
455
330M
            i = j + 2;
456
330M
        }
457
342M
    }
458
1.80M
    cr->len = k;
459
1.80M
}
460
461
/* union or intersection */
462
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
463
          const uint32_t *b_pt, int b_len, int op)
464
1.62M
{
465
1.62M
    int a_idx, b_idx, is_in;
466
1.62M
    uint32_t v;
467
468
1.62M
    a_idx = 0;
469
1.62M
    b_idx = 0;
470
644M
    for(;;) {
471
        /* get one more point from a or b in increasing order */
472
644M
        if (a_idx < a_len && b_idx < b_len) {
473
340M
            if (a_pt[a_idx] < b_pt[b_idx]) {
474
242M
                goto a_add;
475
242M
            } else if (a_pt[a_idx] == b_pt[b_idx]) {
476
86.1M
                v = a_pt[a_idx];
477
86.1M
                a_idx++;
478
86.1M
                b_idx++;
479
86.1M
            } else {
480
12.3M
                goto b_add;
481
12.3M
            }
482
340M
        } else if (a_idx < a_len) {
483
453M
        a_add:
484
453M
            v = a_pt[a_idx++];
485
453M
        } else if (b_idx < b_len) {
486
103M
        b_add:
487
103M
            v = b_pt[b_idx++];
488
103M
        } else {
489
1.62M
            break;
490
1.62M
        }
491
        /* add the point if the in/out status changes */
492
643M
        switch(op) {
493
344M
        case CR_OP_UNION:
494
344M
            is_in = (a_idx & 1) | (b_idx & 1);
495
344M
            break;
496
298M
        case CR_OP_INTER:
497
298M
            is_in = (a_idx & 1) & (b_idx & 1);
498
298M
            break;
499
0
        case CR_OP_XOR:
500
0
            is_in = (a_idx & 1) ^ (b_idx & 1);
501
0
            break;
502
0
        case CR_OP_SUB:
503
0
            is_in = (a_idx & 1) & ((b_idx & 1) ^ 1);
504
0
            break;
505
0
        default:
506
0
            abort();
507
643M
        }
508
643M
        if (is_in != (cr->len & 1)) {
509
539M
            if (cr_add_point(cr, v))
510
0
                return -1;
511
539M
        }
512
643M
    }
513
1.62M
    cr_compress(cr);
514
1.62M
    return 0;
515
1.62M
}
516
517
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op)
518
1.30M
{
519
1.30M
    CharRange a = *cr;
520
1.30M
    int ret;
521
1.30M
    cr->len = 0;
522
1.30M
    cr->size = 0;
523
1.30M
    cr->points = NULL;
524
1.30M
    ret = cr_op(cr, a.points, a.len, b_pt, b_len, op);
525
1.30M
    cr_free(&a);
526
1.30M
    return ret;
527
1.30M
}
528
529
int cr_invert(CharRange *cr)
530
179k
{
531
179k
    int len;
532
179k
    len = cr->len;
533
179k
    if (cr_realloc(cr, len + 2))
534
0
        return -1;
535
179k
    memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
536
179k
    cr->points[0] = 0;
537
179k
    cr->points[len + 1] = UINT32_MAX;
538
179k
    cr->len = len + 2;
539
179k
    cr_compress(cr);
540
179k
    return 0;
541
179k
}
542
543
14.1M
#define CASE_U (1 << 0)
544
6.78M
#define CASE_L (1 << 1)
545
6.78M
#define CASE_F (1 << 2)
546
547
/* use the case conversion table to generate range of characters.
548
   CASE_U: set char if modified by uppercasing,
549
   CASE_L: set char if modified by lowercasing,
550
   CASE_F: set char if modified by case folding,
551
 */
552
static int unicode_case1(CharRange *cr, int case_mask)
553
107k
{
554
3.12M
#define MR(x) (1 << RUN_TYPE_ ## x)
555
107k
    const uint32_t tab_run_mask[3] = {
556
107k
        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
557
107k
        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
558
559
107k
        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
560
561
107k
        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
562
107k
    };
563
107k
#undef MR
564
107k
    uint32_t mask, v, code, type, len, i, idx;
565
566
107k
    if (case_mask == 0)
567
0
        return 0;
568
107k
    mask = 0;
569
430k
    for(i = 0; i < 3; i++) {
570
322k
        if ((case_mask >> i) & 1)
571
107k
            mask |= tab_run_mask[i];
572
322k
    }
573
40.7M
    for(idx = 0; idx < countof(case_conv_table1); idx++) {
574
40.6M
        v = case_conv_table1[idx];
575
40.6M
        type = (v >> (32 - 17 - 7 - 4)) & 0xf;
576
40.6M
        code = v >> (32 - 17);
577
40.6M
        len = (v >> (32 - 17 - 7)) & 0x7f;
578
40.6M
        if ((mask >> type) & 1) {
579
            //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
580
27.8M
            switch(type) {
581
6.35M
            case RUN_TYPE_UL:
582
6.35M
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
583
0
                    goto def_case;
584
6.35M
                code += ((case_mask & CASE_U) != 0);
585
65.0M
                for(i = 0; i < len; i += 2) {
586
58.6M
                    if (cr_add_interval(cr, code + i, code + i + 1))
587
0
                        return -1;
588
58.6M
                }
589
6.35M
                break;
590
6.35M
            case RUN_TYPE_LSU:
591
430k
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
592
0
                    goto def_case;
593
430k
                if (!(case_mask & CASE_U)) {
594
0
                    if (cr_add_interval(cr, code, code + 1))
595
0
                        return -1;
596
0
                }
597
430k
                if (cr_add_interval(cr, code + 1, code + 2))
598
0
                    return -1;
599
430k
                if (case_mask & CASE_U) {
600
430k
                    if (cr_add_interval(cr, code + 2, code + 3))
601
0
                        return -1;
602
430k
                }
603
430k
                break;
604
21.0M
            default:
605
21.0M
            def_case:
606
21.0M
                if (cr_add_interval(cr, code, code + len))
607
0
                    return -1;
608
21.0M
                break;
609
27.8M
            }
610
27.8M
        }
611
40.6M
    }
612
107k
    return 0;
613
107k
}
614
615
static int point_cmp(const void *p1, const void *p2, void *arg)
616
440M
{
617
440M
    uint32_t v1 = *(uint32_t *)p1;
618
440M
    uint32_t v2 = *(uint32_t *)p2;
619
440M
    return (v1 > v2) - (v1 < v2);
620
440M
}
621
622
static void cr_sort_and_remove_overlap(CharRange *cr)
623
107k
{
624
107k
    uint32_t start, end, start1, end1, i, j;
625
626
    /* the resulting ranges are not necessarily sorted and may overlap */
627
107k
    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
628
107k
    j = 0;
629
46.5M
    for(i = 0; i < cr->len; ) {
630
46.3M
        start = cr->points[i];
631
46.3M
        end = cr->points[i + 1];
632
46.3M
        i += 2;
633
52.4M
        while (i < cr->len) {
634
52.3M
            start1 = cr->points[i];
635
52.3M
            end1 = cr->points[i + 1];
636
52.3M
            if (start1 > end) {
637
                /* |------|
638
                 *           |-------| */
639
46.2M
                break;
640
46.2M
            } else if (end1 <= end) {
641
                /* |------|
642
                 *    |--| */
643
1.74M
                i += 2;
644
4.35M
            } else {
645
                /* |------|
646
                 *     |-------| */
647
4.35M
                end = end1;
648
4.35M
                i += 2;
649
4.35M
            }
650
52.3M
        }
651
46.3M
        cr->points[j] = start;
652
46.3M
        cr->points[j + 1] = end;
653
46.3M
        j += 2;
654
46.3M
    }
655
107k
    cr->len = j;
656
107k
}
657
658
/* canonicalize a character set using the JS regex case folding rules
659
   (see lre_canonicalize()) */
660
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
661
107k
{
662
107k
    CharRange cr_inter, cr_mask, cr_result, cr_sub;
663
107k
    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
664
665
107k
    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
666
107k
    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
667
107k
    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
668
107k
    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
669
670
107k
    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
671
0
        goto fail;
672
107k
    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
673
0
        goto fail;
674
675
107k
    if (cr_invert(&cr_mask))
676
0
        goto fail;
677
107k
    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
678
0
        goto fail;
679
680
    /* cr_inter = cr & cr_mask */
681
    /* cr_sub = cr & ~cr_mask */
682
683
    /* use the case conversion table to compute the result */
684
107k
    d_start = -1;
685
107k
    d_end = -1;
686
107k
    idx = 0;
687
107k
    v = case_conv_table1[idx];
688
107k
    code = v >> (32 - 17);
689
107k
    len = (v >> (32 - 17 - 7)) & 0x7f;
690
46.7M
    for(i = 0; i < cr_inter.len; i += 2) {
691
46.6M
        start = cr_inter.points[i];
692
46.6M
        end = cr_inter.points[i + 1];
693
694
142M
        for(c = start; c < end; c++) {
695
124M
            for(;;) {
696
124M
                if (c >= code && c < code + len)
697
96.0M
                    break;
698
28.5M
                idx++;
699
28.5M
                assert(idx < countof(case_conv_table1));
700
28.5M
                v = case_conv_table1[idx];
701
28.5M
                code = v >> (32 - 17);
702
28.5M
                len = (v >> (32 - 17 - 7)) & 0x7f;
703
28.5M
            }
704
96.0M
            d = lre_case_folding_entry(c, idx, v, is_unicode);
705
            /* try to merge with the current interval */
706
96.0M
            if (d_start == -1) {
707
101k
                d_start = d;
708
101k
                d_end = d + 1;
709
95.9M
            } else if (d_end == d) {
710
43.5M
                d_end++;
711
52.3M
            } else {
712
52.3M
                cr_add_interval(&cr_result, d_start, d_end);
713
52.3M
                d_start = d;
714
52.3M
                d_end = d + 1;
715
52.3M
            }
716
96.0M
        }
717
46.6M
    }
718
107k
    if (d_start != -1) {
719
101k
        if (cr_add_interval(&cr_result, d_start, d_end))
720
0
            goto fail;
721
101k
    }
722
723
    /* the resulting ranges are not necessarily sorted and may overlap */
724
107k
    cr_sort_and_remove_overlap(&cr_result);
725
726
    /* or with the character not affected by the case folding */
727
107k
    cr->len = 0;
728
107k
    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
729
0
        goto fail;
730
731
107k
    cr_free(&cr_inter);
732
107k
    cr_free(&cr_mask);
733
107k
    cr_free(&cr_result);
734
107k
    cr_free(&cr_sub);
735
107k
    return 0;
736
0
 fail:
737
0
    cr_free(&cr_inter);
738
0
    cr_free(&cr_mask);
739
0
    cr_free(&cr_result);
740
0
    cr_free(&cr_sub);
741
0
    return -1;
742
107k
}
743
744
#ifdef CONFIG_ALL_UNICODE
745
746
BOOL lre_is_id_start(uint32_t c)
747
148
{
748
148
    return lre_is_in_table(c, unicode_prop_ID_Start_table,
749
148
                           unicode_prop_ID_Start_index,
750
148
                           sizeof(unicode_prop_ID_Start_index) / 3);
751
148
}
752
753
BOOL lre_is_id_continue(uint32_t c)
754
0
{
755
0
    return lre_is_id_start(c) ||
756
0
        lre_is_in_table(c, unicode_prop_ID_Continue1_table,
757
0
                        unicode_prop_ID_Continue1_index,
758
0
                        sizeof(unicode_prop_ID_Continue1_index) / 3);
759
0
}
760
761
#define UNICODE_DECOMP_LEN_MAX 18
762
763
typedef enum {
764
    DECOMP_TYPE_C1, /* 16 bit char */
765
    DECOMP_TYPE_L1, /* 16 bit char table */
766
    DECOMP_TYPE_L2,
767
    DECOMP_TYPE_L3,
768
    DECOMP_TYPE_L4,
769
    DECOMP_TYPE_L5, /* XXX: not used */
770
    DECOMP_TYPE_L6, /* XXX: could remove */
771
    DECOMP_TYPE_L7, /* XXX: could remove */
772
    DECOMP_TYPE_LL1, /* 18 bit char table */
773
    DECOMP_TYPE_LL2,
774
    DECOMP_TYPE_S1, /* 8 bit char table */
775
    DECOMP_TYPE_S2,
776
    DECOMP_TYPE_S3,
777
    DECOMP_TYPE_S4,
778
    DECOMP_TYPE_S5,
779
    DECOMP_TYPE_I1, /* increment 16 bit char value */
780
    DECOMP_TYPE_I2_0,
781
    DECOMP_TYPE_I2_1,
782
    DECOMP_TYPE_I3_1,
783
    DECOMP_TYPE_I3_2,
784
    DECOMP_TYPE_I4_1,
785
    DECOMP_TYPE_I4_2,
786
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
787
    DECOMP_TYPE_B2,
788
    DECOMP_TYPE_B3,
789
    DECOMP_TYPE_B4,
790
    DECOMP_TYPE_B5,
791
    DECOMP_TYPE_B6,
792
    DECOMP_TYPE_B7,
793
    DECOMP_TYPE_B8,
794
    DECOMP_TYPE_B18,
795
    DECOMP_TYPE_LS2,
796
    DECOMP_TYPE_PAT3,
797
    DECOMP_TYPE_S2_UL,
798
    DECOMP_TYPE_LS2_UL,
799
} DecompTypeEnum;
800
801
static uint32_t unicode_get_short_code(uint32_t c)
802
0
{
803
0
    static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
804
805
0
    if (c < 0x80)
806
0
        return c;
807
0
    else if (c < 0x80 + 0x50)
808
0
        return c - 0x80 + 0x300;
809
0
    else
810
0
        return unicode_short_table[c - 0x80 - 0x50];
811
0
}
812
813
static uint32_t unicode_get_lower_simple(uint32_t c)
814
0
{
815
0
    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
816
0
        c += 0x20;
817
0
    else
818
0
        c++;
819
0
    return c;
820
0
}
821
822
static uint16_t unicode_get16(const uint8_t *p)
823
0
{
824
0
    return p[0] | (p[1] << 8);
825
0
}
826
827
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
828
                                int idx, uint32_t code, uint32_t len,
829
                                uint32_t type)
830
0
{
831
0
    uint32_t c1;
832
0
    int l, i, p;
833
0
    const uint8_t *d;
834
835
0
    if (type == DECOMP_TYPE_C1) {
836
0
        res[0] = unicode_decomp_table2[idx];
837
0
        return 1;
838
0
    } else {
839
0
        d = unicode_decomp_data + unicode_decomp_table2[idx];
840
0
        switch(type) {
841
0
        case DECOMP_TYPE_L1:
842
0
        case DECOMP_TYPE_L2:
843
0
        case DECOMP_TYPE_L3:
844
0
        case DECOMP_TYPE_L4:
845
0
        case DECOMP_TYPE_L5:
846
0
        case DECOMP_TYPE_L6:
847
0
        case DECOMP_TYPE_L7:
848
0
            l = type - DECOMP_TYPE_L1 + 1;
849
0
            d += (c - code) * l * 2;
850
0
            for(i = 0; i < l; i++) {
851
0
                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
852
0
                    return 0;
853
0
            }
854
0
            return l;
855
0
        case DECOMP_TYPE_LL1:
856
0
        case DECOMP_TYPE_LL2:
857
0
            {
858
0
                uint32_t k, p;
859
0
                l = type - DECOMP_TYPE_LL1 + 1;
860
0
                k = (c - code) * l;
861
0
                p = len * l * 2;
862
0
                for(i = 0; i < l; i++) {
863
0
                    c1 = unicode_get16(d + 2 * k) |
864
0
                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
865
0
                    if (!c1)
866
0
                        return 0;
867
0
                    res[i] = c1;
868
0
                    k++;
869
0
                }
870
0
            }
871
0
            return l;
872
0
        case DECOMP_TYPE_S1:
873
0
        case DECOMP_TYPE_S2:
874
0
        case DECOMP_TYPE_S3:
875
0
        case DECOMP_TYPE_S4:
876
0
        case DECOMP_TYPE_S5:
877
0
            l = type - DECOMP_TYPE_S1 + 1;
878
0
            d += (c - code) * l;
879
0
            for(i = 0; i < l; i++) {
880
0
                if ((res[i] = unicode_get_short_code(d[i])) == 0)
881
0
                    return 0;
882
0
            }
883
0
            return l;
884
0
        case DECOMP_TYPE_I1:
885
0
            l = 1;
886
0
            p = 0;
887
0
            goto decomp_type_i;
888
0
        case DECOMP_TYPE_I2_0:
889
0
        case DECOMP_TYPE_I2_1:
890
0
        case DECOMP_TYPE_I3_1:
891
0
        case DECOMP_TYPE_I3_2:
892
0
        case DECOMP_TYPE_I4_1:
893
0
        case DECOMP_TYPE_I4_2:
894
0
            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
895
0
            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
896
0
        decomp_type_i:
897
0
            for(i = 0; i < l; i++) {
898
0
                c1 = unicode_get16(d + 2 * i);
899
0
                if (i == p)
900
0
                    c1 += c - code;
901
0
                res[i] = c1;
902
0
            }
903
0
            return l;
904
0
        case DECOMP_TYPE_B18:
905
0
            l = 18;
906
0
            goto decomp_type_b;
907
0
        case DECOMP_TYPE_B1:
908
0
        case DECOMP_TYPE_B2:
909
0
        case DECOMP_TYPE_B3:
910
0
        case DECOMP_TYPE_B4:
911
0
        case DECOMP_TYPE_B5:
912
0
        case DECOMP_TYPE_B6:
913
0
        case DECOMP_TYPE_B7:
914
0
        case DECOMP_TYPE_B8:
915
0
            l = type - DECOMP_TYPE_B1 + 1;
916
0
        decomp_type_b:
917
0
            {
918
0
                uint32_t c_min;
919
0
                c_min = unicode_get16(d);
920
0
                d += 2 + (c - code) * l;
921
0
                for(i = 0; i < l; i++) {
922
0
                    c1 = d[i];
923
0
                    if (c1 == 0xff)
924
0
                        c1 = 0x20;
925
0
                    else
926
0
                        c1 += c_min;
927
0
                    res[i] = c1;
928
0
                }
929
0
            }
930
0
            return l;
931
0
        case DECOMP_TYPE_LS2:
932
0
            d += (c - code) * 3;
933
0
            if (!(res[0] = unicode_get16(d)))
934
0
                return 0;
935
0
            res[1] = unicode_get_short_code(d[2]);
936
0
            return 2;
937
0
        case DECOMP_TYPE_PAT3:
938
0
            res[0] = unicode_get16(d);
939
0
            res[2] = unicode_get16(d + 2);
940
0
            d += 4 + (c - code) * 2;
941
0
            res[1] = unicode_get16(d);
942
0
            return 3;
943
0
        case DECOMP_TYPE_S2_UL:
944
0
        case DECOMP_TYPE_LS2_UL:
945
0
            c1 = c - code;
946
0
            if (type == DECOMP_TYPE_S2_UL) {
947
0
                d += c1 & ~1;
948
0
                c = unicode_get_short_code(*d);
949
0
                d++;
950
0
            } else {
951
0
                d += (c1 >> 1) * 3;
952
0
                c = unicode_get16(d);
953
0
                d += 2;
954
0
            }
955
0
            if (c1 & 1)
956
0
                c = unicode_get_lower_simple(c);
957
0
            res[0] = c;
958
0
            res[1] = unicode_get_short_code(*d);
959
0
            return 2;
960
0
        }
961
0
    }
962
0
    return 0;
963
0
}
964
965
966
/* return the length of the decomposition (length <=
967
   UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
968
static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
969
0
{
970
0
    uint32_t v, type, is_compat, code, len;
971
0
    int idx_min, idx_max, idx;
972
973
0
    idx_min = 0;
974
0
    idx_max = countof(unicode_decomp_table1) - 1;
975
0
    while (idx_min <= idx_max) {
976
0
        idx = (idx_max + idx_min) / 2;
977
0
        v = unicode_decomp_table1[idx];
978
0
        code = v >> (32 - 18);
979
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
980
        //        printf("idx=%d code=%05x len=%d\n", idx, code, len);
981
0
        if (c < code) {
982
0
            idx_max = idx - 1;
983
0
        } else if (c >= code + len) {
984
0
            idx_min = idx + 1;
985
0
        } else {
986
0
            is_compat = v & 1;
987
0
            if (is_compat1 < is_compat)
988
0
                break;
989
0
            type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
990
0
            return unicode_decomp_entry(res, c, idx, code, len, type);
991
0
        }
992
0
    }
993
0
    return 0;
994
0
}
995
996
/* return 0 if no pair found */
997
static int unicode_compose_pair(uint32_t c0, uint32_t c1)
998
0
{
999
0
    uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
1000
0
    int idx_min, idx_max, idx, d;
1001
0
    uint32_t pair[2];
1002
1003
0
    idx_min = 0;
1004
0
    idx_max = countof(unicode_comp_table) - 1;
1005
0
    while (idx_min <= idx_max) {
1006
0
        idx = (idx_max + idx_min) / 2;
1007
0
        idx1 = unicode_comp_table[idx];
1008
1009
        /* idx1 represent an entry of the decomposition table */
1010
0
        d_idx = idx1 >> 6;
1011
0
        d_offset = idx1 & 0x3f;
1012
0
        v = unicode_decomp_table1[d_idx];
1013
0
        code = v >> (32 - 18);
1014
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
1015
0
        type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
1016
0
        ch = code + d_offset;
1017
0
        unicode_decomp_entry(pair, ch, d_idx, code, len, type);
1018
0
        d = c0 - pair[0];
1019
0
        if (d == 0)
1020
0
            d = c1 - pair[1];
1021
0
        if (d < 0) {
1022
0
            idx_max = idx - 1;
1023
0
        } else if (d > 0) {
1024
0
            idx_min = idx + 1;
1025
0
        } else {
1026
0
            return ch;
1027
0
        }
1028
0
    }
1029
0
    return 0;
1030
0
}
1031
1032
/* return the combining class of character c (between 0 and 255) */
1033
static int unicode_get_cc(uint32_t c)
1034
0
{
1035
0
    uint32_t code, n, type, cc, c1, b;
1036
0
    int pos;
1037
0
    const uint8_t *p;
1038
1039
0
    pos = get_index_pos(&code, c,
1040
0
                        unicode_cc_index, sizeof(unicode_cc_index) / 3);
1041
0
    if (pos < 0)
1042
0
        return 0;
1043
0
    p = unicode_cc_table + pos;
1044
    /* Compressed run length encoding:
1045
       - 2 high order bits are combining class type
1046
       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
1047
       - 00..2F: range length (add 1)
1048
       - 30..37: 3-bit range-length + 1 extra byte
1049
       - 38..3F: 3-bit range-length + 2 extra byte
1050
     */
1051
0
    for(;;) {
1052
0
        b = *p++;
1053
0
        type = b >> 6;
1054
0
        n = b & 0x3f;
1055
0
        if (n < 48) {
1056
0
        } else if (n < 56) {
1057
0
            n = (n - 48) << 8;
1058
0
            n |= *p++;
1059
0
            n += 48;
1060
0
        } else {
1061
0
            n = (n - 56) << 8;
1062
0
            n |= *p++ << 8;
1063
0
            n |= *p++;
1064
0
            n += 48 + (1 << 11);
1065
0
        }
1066
0
        if (type <= 1)
1067
0
            p++;
1068
0
        c1 = code + n + 1;
1069
0
        if (c < c1) {
1070
0
            switch(type) {
1071
0
            case 0:
1072
0
                cc = p[-1];
1073
0
                break;
1074
0
            case 1:
1075
0
                cc = p[-1] + c - code;
1076
0
                break;
1077
0
            case 2:
1078
0
                cc = 0;
1079
0
                break;
1080
0
            default:
1081
0
            case 3:
1082
0
                cc = 230;
1083
0
                break;
1084
0
            }
1085
0
            return cc;
1086
0
        }
1087
0
        code = c1;
1088
0
    }
1089
0
}
1090
1091
static void sort_cc(int *buf, int len)
1092
0
{
1093
0
    int i, j, k, cc, cc1, start, ch1;
1094
1095
0
    for(i = 0; i < len; i++) {
1096
0
        cc = unicode_get_cc(buf[i]);
1097
0
        if (cc != 0) {
1098
0
            start = i;
1099
0
            j = i + 1;
1100
0
            while (j < len) {
1101
0
                ch1 = buf[j];
1102
0
                cc1 = unicode_get_cc(ch1);
1103
0
                if (cc1 == 0)
1104
0
                    break;
1105
0
                k = j - 1;
1106
0
                while (k >= start) {
1107
0
                    if (unicode_get_cc(buf[k]) <= cc1)
1108
0
                        break;
1109
0
                    buf[k + 1] = buf[k];
1110
0
                    k--;
1111
0
                }
1112
0
                buf[k + 1] = ch1;
1113
0
                j++;
1114
0
            }
1115
#if 0
1116
            printf("cc:");
1117
            for(k = start; k < j; k++) {
1118
                printf(" %3d", unicode_get_cc(buf[k]));
1119
            }
1120
            printf("\n");
1121
#endif
1122
0
            i = j;
1123
0
        }
1124
0
    }
1125
0
}
1126
1127
static void to_nfd_rec(DynBuf *dbuf,
1128
                       const int *src, int src_len, int is_compat)
1129
0
{
1130
0
    uint32_t c, v;
1131
0
    int i, l;
1132
0
    uint32_t res[UNICODE_DECOMP_LEN_MAX];
1133
1134
0
    for(i = 0; i < src_len; i++) {
1135
0
        c = src[i];
1136
0
        if (c >= 0xac00 && c < 0xd7a4) {
1137
            /* Hangul decomposition */
1138
0
            c -= 0xac00;
1139
0
            dbuf_put_u32(dbuf, 0x1100 + c / 588);
1140
0
            dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
1141
0
            v = c % 28;
1142
0
            if (v != 0)
1143
0
                dbuf_put_u32(dbuf, 0x11a7 + v);
1144
0
        } else {
1145
0
            l = unicode_decomp_char(res, c, is_compat);
1146
0
            if (l) {
1147
0
                to_nfd_rec(dbuf, (int *)res, l, is_compat);
1148
0
            } else {
1149
0
                dbuf_put_u32(dbuf, c);
1150
0
            }
1151
0
        }
1152
0
    }
1153
0
}
1154
1155
/* return 0 if not found */
1156
static int compose_pair(uint32_t c0, uint32_t c1)
1157
0
{
1158
    /* Hangul composition */
1159
0
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
1160
0
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
1161
0
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
1162
0
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
1163
0
               (c0 - 0xac00) % 28 == 0 &&
1164
0
               c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
1165
0
        return c0 + c1 - 0x11a7;
1166
0
    } else {
1167
0
        return unicode_compose_pair(c0, c1);
1168
0
    }
1169
0
}
1170
1171
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
1172
                      UnicodeNormalizationEnum n_type,
1173
                      void *opaque, DynBufReallocFunc *realloc_func)
1174
0
{
1175
0
    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
1176
0
    BOOL is_compat;
1177
0
    DynBuf dbuf_s, *dbuf = &dbuf_s;
1178
1179
0
    is_compat = n_type >> 1;
1180
1181
0
    dbuf_init2(dbuf, opaque, realloc_func);
1182
0
    if (dbuf_claim(dbuf, sizeof(int) * src_len))
1183
0
        goto fail;
1184
1185
    /* common case: latin1 is unaffected by NFC */
1186
0
    if (n_type == UNICODE_NFC) {
1187
0
        for(i = 0; i < src_len; i++) {
1188
0
            if (src[i] >= 0x100)
1189
0
                goto not_latin1;
1190
0
        }
1191
0
        buf = (int *)dbuf->buf;
1192
0
        memcpy(buf, src, src_len * sizeof(int));
1193
0
        *pdst = (uint32_t *)buf;
1194
0
        return src_len;
1195
0
    not_latin1: ;
1196
0
    }
1197
1198
0
    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
1199
0
    if (dbuf_error(dbuf)) {
1200
0
    fail:
1201
0
        *pdst = NULL;
1202
0
        return -1;
1203
0
    }
1204
0
    buf = (int *)dbuf->buf;
1205
0
    buf_len = dbuf->size / sizeof(int);
1206
1207
0
    sort_cc(buf, buf_len);
1208
1209
0
    if (buf_len <= 1 || (n_type & 1) != 0) {
1210
        /* NFD / NFKD */
1211
0
        *pdst = (uint32_t *)buf;
1212
0
        return buf_len;
1213
0
    }
1214
1215
0
    i = 1;
1216
0
    out_len = 1;
1217
0
    while (i < buf_len) {
1218
        /* find the starter character and test if it is blocked from
1219
           the character at 'i' */
1220
0
        last_cc = unicode_get_cc(buf[i]);
1221
0
        starter_pos = out_len - 1;
1222
0
        while (starter_pos >= 0) {
1223
0
            cc = unicode_get_cc(buf[starter_pos]);
1224
0
            if (cc == 0)
1225
0
                break;
1226
0
            if (cc >= last_cc)
1227
0
                goto next;
1228
0
            last_cc = 256;
1229
0
            starter_pos--;
1230
0
        }
1231
0
        if (starter_pos >= 0 &&
1232
0
            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
1233
0
            buf[starter_pos] = p;
1234
0
            i++;
1235
0
        } else {
1236
0
        next:
1237
0
            buf[out_len++] = buf[i++];
1238
0
        }
1239
0
    }
1240
0
    *pdst = (uint32_t *)buf;
1241
0
    return out_len;
1242
0
}
1243
1244
/* char ranges for various unicode properties */
1245
1246
static int unicode_find_name(const char *name_table, const char *name)
1247
0
{
1248
0
    const char *p, *r;
1249
0
    int pos;
1250
0
    size_t name_len, len;
1251
1252
0
    p = name_table;
1253
0
    pos = 0;
1254
0
    name_len = strlen(name);
1255
0
    while (*p) {
1256
0
        for(;;) {
1257
0
            r = strchr(p, ',');
1258
0
            if (!r)
1259
0
                len = strlen(p);
1260
0
            else
1261
0
                len = r - p;
1262
0
            if (len == name_len && !memcmp(p, name, name_len))
1263
0
                return pos;
1264
0
            p += len + 1;
1265
0
            if (!r)
1266
0
                break;
1267
0
        }
1268
0
        pos++;
1269
0
    }
1270
0
    return -1;
1271
0
}
1272
1273
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1274
   if not found */
1275
int unicode_script(CharRange *cr,
1276
                   const char *script_name, BOOL is_ext)
1277
0
{
1278
0
    int script_idx;
1279
0
    const uint8_t *p, *p_end;
1280
0
    uint32_t c, c1, b, n, v, v_len, i, type;
1281
0
    CharRange cr1_s, *cr1;
1282
0
    CharRange cr2_s, *cr2 = &cr2_s;
1283
0
    BOOL is_common;
1284
1285
0
    script_idx = unicode_find_name(unicode_script_name_table, script_name);
1286
0
    if (script_idx < 0)
1287
0
        return -2;
1288
1289
0
    is_common = (script_idx == UNICODE_SCRIPT_Common ||
1290
0
                 script_idx == UNICODE_SCRIPT_Inherited);
1291
0
    if (is_ext) {
1292
0
        cr1 = &cr1_s;
1293
0
        cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1294
0
        cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1295
0
    } else {
1296
0
        cr1 = cr;
1297
0
    }
1298
1299
0
    p = unicode_script_table;
1300
0
    p_end = unicode_script_table + countof(unicode_script_table);
1301
0
    c = 0;
1302
0
    while (p < p_end) {
1303
0
        b = *p++;
1304
0
        type = b >> 7;
1305
0
        n = b & 0x7f;
1306
0
        if (n < 96) {
1307
0
        } else if (n < 112) {
1308
0
            n = (n - 96) << 8;
1309
0
            n |= *p++;
1310
0
            n += 96;
1311
0
        } else {
1312
0
            n = (n - 112) << 16;
1313
0
            n |= *p++ << 8;
1314
0
            n |= *p++;
1315
0
            n += 96 + (1 << 12);
1316
0
        }
1317
0
        c1 = c + n + 1;
1318
0
        if (type != 0) {
1319
0
            v = *p++;
1320
0
            if (v == script_idx || script_idx == UNICODE_SCRIPT_Unknown) {
1321
0
                if (cr_add_interval(cr1, c, c1))
1322
0
                    goto fail;
1323
0
            }
1324
0
        }
1325
0
        c = c1;
1326
0
    }
1327
0
    if (script_idx == UNICODE_SCRIPT_Unknown) {
1328
        /* Unknown is all the characters outside scripts */
1329
0
        if (cr_invert(cr1))
1330
0
            goto fail;
1331
0
    }
1332
1333
0
    if (is_ext) {
1334
        /* add the script extensions */
1335
0
        p = unicode_script_ext_table;
1336
0
        p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1337
0
        c = 0;
1338
0
        while (p < p_end) {
1339
0
            b = *p++;
1340
0
            if (b < 128) {
1341
0
                n = b;
1342
0
            } else if (b < 128 + 64) {
1343
0
                n = (b - 128) << 8;
1344
0
                n |= *p++;
1345
0
                n += 128;
1346
0
            } else {
1347
0
                n = (b - 128 - 64) << 16;
1348
0
                n |= *p++ << 8;
1349
0
                n |= *p++;
1350
0
                n += 128 + (1 << 14);
1351
0
            }
1352
0
            c1 = c + n + 1;
1353
0
            v_len = *p++;
1354
0
            if (is_common) {
1355
0
                if (v_len != 0) {
1356
0
                    if (cr_add_interval(cr2, c, c1))
1357
0
                        goto fail;
1358
0
                }
1359
0
            } else {
1360
0
                for(i = 0; i < v_len; i++) {
1361
0
                    if (p[i] == script_idx) {
1362
0
                        if (cr_add_interval(cr2, c, c1))
1363
0
                            goto fail;
1364
0
                        break;
1365
0
                    }
1366
0
                }
1367
0
            }
1368
0
            p += v_len;
1369
0
            c = c1;
1370
0
        }
1371
0
        if (is_common) {
1372
            /* remove all the characters with script extensions */
1373
0
            if (cr_invert(cr2))
1374
0
                goto fail;
1375
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1376
0
                      CR_OP_INTER))
1377
0
                goto fail;
1378
0
        } else {
1379
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1380
0
                      CR_OP_UNION))
1381
0
                goto fail;
1382
0
        }
1383
0
        cr_free(cr1);
1384
0
        cr_free(cr2);
1385
0
    }
1386
0
    return 0;
1387
0
 fail:
1388
0
    if (is_ext) {
1389
0
        cr_free(cr1);
1390
0
        cr_free(cr2);
1391
0
    }
1392
0
    goto fail;
1393
0
}
1394
1395
0
#define M(id) (1U << UNICODE_GC_ ## id)
1396
1397
static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1398
0
{
1399
0
    const uint8_t *p, *p_end;
1400
0
    uint32_t c, c0, b, n, v;
1401
1402
0
    p = unicode_gc_table;
1403
0
    p_end = unicode_gc_table + countof(unicode_gc_table);
1404
0
    c = 0;
1405
    /* Compressed range encoding:
1406
       initial byte:
1407
       bits 0..4: category number (special case 31)
1408
       bits 5..7: range length (add 1)
1409
       special case bits 5..7 == 7: read an extra byte
1410
       - 00..7F: range length (add 7 + 1)
1411
       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1412
       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1413
     */
1414
0
    while (p < p_end) {
1415
0
        b = *p++;
1416
0
        n = b >> 5;
1417
0
        v = b & 0x1f;
1418
0
        if (n == 7) {
1419
0
            n = *p++;
1420
0
            if (n < 128) {
1421
0
                n += 7;
1422
0
            } else if (n < 128 + 64) {
1423
0
                n = (n - 128) << 8;
1424
0
                n |= *p++;
1425
0
                n += 7 + 128;
1426
0
            } else {
1427
0
                n = (n - 128 - 64) << 16;
1428
0
                n |= *p++ << 8;
1429
0
                n |= *p++;
1430
0
                n += 7 + 128 + (1 << 14);
1431
0
            }
1432
0
        }
1433
0
        c0 = c;
1434
0
        c += n + 1;
1435
0
        if (v == 31) {
1436
            /* run of Lu / Ll */
1437
0
            b = gc_mask & (M(Lu) | M(Ll));
1438
0
            if (b != 0) {
1439
0
                if (b == (M(Lu) | M(Ll))) {
1440
0
                    goto add_range;
1441
0
                } else {
1442
0
                    c0 += ((gc_mask & M(Ll)) != 0);
1443
0
                    for(; c0 < c; c0 += 2) {
1444
0
                        if (cr_add_interval(cr, c0, c0 + 1))
1445
0
                            return -1;
1446
0
                    }
1447
0
                }
1448
0
            }
1449
0
        } else if ((gc_mask >> v) & 1) {
1450
0
        add_range:
1451
0
            if (cr_add_interval(cr, c0, c))
1452
0
                return -1;
1453
0
        }
1454
0
    }
1455
0
    return 0;
1456
0
}
1457
1458
static int unicode_prop1(CharRange *cr, int prop_idx)
1459
0
{
1460
0
    const uint8_t *p, *p_end;
1461
0
    uint32_t c, c0, b, bit;
1462
1463
0
    p = unicode_prop_table[prop_idx];
1464
0
    p_end = p + unicode_prop_len_table[prop_idx];
1465
0
    c = 0;
1466
0
    bit = 0;
1467
    /* Compressed range encoding:
1468
       00..3F: 2 packed lengths: 3-bit + 3-bit
1469
       40..5F: 5-bits plus extra byte for length
1470
       60..7F: 5-bits plus 2 extra bytes for length
1471
       80..FF: 7-bit length
1472
       lengths must be incremented to get character count
1473
       Ranges alternate between false and true return value.
1474
     */
1475
0
    while (p < p_end) {
1476
0
        c0 = c;
1477
0
        b = *p++;
1478
0
        if (b < 64) {
1479
0
            c += (b >> 3) + 1;
1480
0
            if (bit)  {
1481
0
                if (cr_add_interval(cr, c0, c))
1482
0
                    return -1;
1483
0
            }
1484
0
            bit ^= 1;
1485
0
            c0 = c;
1486
0
            c += (b & 7) + 1;
1487
0
        } else if (b >= 0x80) {
1488
0
            c += b - 0x80 + 1;
1489
0
        } else if (b < 0x60) {
1490
0
            c += (((b - 0x40) << 8) | p[0]) + 1;
1491
0
            p++;
1492
0
        } else {
1493
0
            c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
1494
0
            p += 2;
1495
0
        }
1496
0
        if (bit)  {
1497
0
            if (cr_add_interval(cr, c0, c))
1498
0
                return -1;
1499
0
        }
1500
0
        bit ^= 1;
1501
0
    }
1502
0
    return 0;
1503
0
}
1504
1505
typedef enum {
1506
    POP_GC,
1507
    POP_PROP,
1508
    POP_CASE,
1509
    POP_UNION,
1510
    POP_INTER,
1511
    POP_XOR,
1512
    POP_INVERT,
1513
    POP_END,
1514
} PropOPEnum;
1515
1516
#define POP_STACK_LEN_MAX 4
1517
1518
static int unicode_prop_ops(CharRange *cr, ...)
1519
0
{
1520
0
    va_list ap;
1521
0
    CharRange stack[POP_STACK_LEN_MAX];
1522
0
    int stack_len, op, ret, i;
1523
0
    uint32_t a;
1524
1525
0
    va_start(ap, cr);
1526
0
    stack_len = 0;
1527
0
    for(;;) {
1528
0
        op = va_arg(ap, int);
1529
0
        switch(op) {
1530
0
        case POP_GC:
1531
0
            assert(stack_len < POP_STACK_LEN_MAX);
1532
0
            a = va_arg(ap, int);
1533
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1534
0
            if (unicode_general_category1(&stack[stack_len - 1], a))
1535
0
                goto fail;
1536
0
            break;
1537
0
        case POP_PROP:
1538
0
            assert(stack_len < POP_STACK_LEN_MAX);
1539
0
            a = va_arg(ap, int);
1540
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1541
0
            if (unicode_prop1(&stack[stack_len - 1], a))
1542
0
                goto fail;
1543
0
            break;
1544
0
        case POP_CASE:
1545
0
            assert(stack_len < POP_STACK_LEN_MAX);
1546
0
            a = va_arg(ap, int);
1547
0
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1548
0
            if (unicode_case1(&stack[stack_len - 1], a))
1549
0
                goto fail;
1550
0
            break;
1551
0
        case POP_UNION:
1552
0
        case POP_INTER:
1553
0
        case POP_XOR:
1554
0
            {
1555
0
                CharRange *cr1, *cr2, *cr3;
1556
0
                assert(stack_len >= 2);
1557
0
                assert(stack_len < POP_STACK_LEN_MAX);
1558
0
                cr1 = &stack[stack_len - 2];
1559
0
                cr2 = &stack[stack_len - 1];
1560
0
                cr3 = &stack[stack_len++];
1561
0
                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1562
                /* CR_OP_XOR may be used here */
1563
0
                if (cr_op(cr3, cr1->points, cr1->len,
1564
0
                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
1565
0
                    goto fail;
1566
0
                cr_free(cr1);
1567
0
                cr_free(cr2);
1568
0
                *cr1 = *cr3;
1569
0
                stack_len -= 2;
1570
0
            }
1571
0
            break;
1572
0
        case POP_INVERT:
1573
0
            assert(stack_len >= 1);
1574
0
            if (cr_invert(&stack[stack_len - 1]))
1575
0
                goto fail;
1576
0
            break;
1577
0
        case POP_END:
1578
0
            goto done;
1579
0
        default:
1580
0
            abort();
1581
0
        }
1582
0
    }
1583
0
 done:
1584
0
    assert(stack_len == 1);
1585
0
    ret = cr_copy(cr, &stack[0]);
1586
0
    cr_free(&stack[0]);
1587
0
    return ret;
1588
0
 fail:
1589
0
    for(i = 0; i < stack_len; i++)
1590
0
        cr_free(&stack[i]);
1591
0
    return -1;
1592
0
}
1593
1594
static const uint32_t unicode_gc_mask_table[] = {
1595
    M(Lu) | M(Ll) | M(Lt), /* LC */
1596
    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
1597
    M(Mn) | M(Mc) | M(Me), /* M */
1598
    M(Nd) | M(Nl) | M(No), /* N */
1599
    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
1600
    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
1601
    M(Zs) | M(Zl) | M(Zp), /* Z */
1602
    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
1603
};
1604
1605
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1606
   if not found */
1607
int unicode_general_category(CharRange *cr, const char *gc_name)
1608
0
{
1609
0
    int gc_idx;
1610
0
    uint32_t gc_mask;
1611
1612
0
    gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1613
0
    if (gc_idx < 0)
1614
0
        return -2;
1615
0
    if (gc_idx <= UNICODE_GC_Co) {
1616
0
        gc_mask = (uint64_t)1 << gc_idx;
1617
0
    } else {
1618
0
        gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1619
0
    }
1620
0
    return unicode_general_category1(cr, gc_mask);
1621
0
}
1622
1623
1624
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1625
   if not found */
1626
int unicode_prop(CharRange *cr, const char *prop_name)
1627
0
{
1628
0
    int prop_idx, ret;
1629
1630
0
    prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1631
0
    if (prop_idx < 0)
1632
0
        return -2;
1633
0
    prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1634
1635
0
    ret = 0;
1636
0
    switch(prop_idx) {
1637
0
    case UNICODE_PROP_ASCII:
1638
0
        if (cr_add_interval(cr, 0x00, 0x7f + 1))
1639
0
            return -1;
1640
0
        break;
1641
0
    case UNICODE_PROP_Any:
1642
0
        if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1643
0
            return -1;
1644
0
        break;
1645
0
    case UNICODE_PROP_Assigned:
1646
0
        ret = unicode_prop_ops(cr,
1647
0
                               POP_GC, M(Cn),
1648
0
                               POP_INVERT,
1649
0
                               POP_END);
1650
0
        break;
1651
0
    case UNICODE_PROP_Math:
1652
0
        ret = unicode_prop_ops(cr,
1653
0
                               POP_GC, M(Sm),
1654
0
                               POP_PROP, UNICODE_PROP_Other_Math,
1655
0
                               POP_UNION,
1656
0
                               POP_END);
1657
0
        break;
1658
0
    case UNICODE_PROP_Lowercase:
1659
0
        ret = unicode_prop_ops(cr,
1660
0
                               POP_GC, M(Ll),
1661
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1662
0
                               POP_UNION,
1663
0
                               POP_END);
1664
0
        break;
1665
0
    case UNICODE_PROP_Uppercase:
1666
0
        ret = unicode_prop_ops(cr,
1667
0
                               POP_GC, M(Lu),
1668
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1669
0
                               POP_UNION,
1670
0
                               POP_END);
1671
0
        break;
1672
0
    case UNICODE_PROP_Cased:
1673
0
        ret = unicode_prop_ops(cr,
1674
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt),
1675
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1676
0
                               POP_UNION,
1677
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1678
0
                               POP_UNION,
1679
0
                               POP_END);
1680
0
        break;
1681
0
    case UNICODE_PROP_Alphabetic:
1682
0
        ret = unicode_prop_ops(cr,
1683
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1684
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1685
0
                               POP_UNION,
1686
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1687
0
                               POP_UNION,
1688
0
                               POP_PROP, UNICODE_PROP_Other_Alphabetic,
1689
0
                               POP_UNION,
1690
0
                               POP_END);
1691
0
        break;
1692
0
    case UNICODE_PROP_Grapheme_Base:
1693
0
        ret = unicode_prop_ops(cr,
1694
0
                               POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1695
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1696
0
                               POP_UNION,
1697
0
                               POP_INVERT,
1698
0
                               POP_END);
1699
0
        break;
1700
0
    case UNICODE_PROP_Grapheme_Extend:
1701
0
        ret = unicode_prop_ops(cr,
1702
0
                               POP_GC, M(Me) | M(Mn),
1703
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1704
0
                               POP_UNION,
1705
0
                               POP_END);
1706
0
        break;
1707
0
    case UNICODE_PROP_XID_Start:
1708
0
        ret = unicode_prop_ops(cr,
1709
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1710
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1711
0
                               POP_UNION,
1712
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1713
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1714
0
                               POP_UNION,
1715
0
                               POP_PROP, UNICODE_PROP_XID_Start1,
1716
0
                               POP_UNION,
1717
0
                               POP_INVERT,
1718
0
                               POP_INTER,
1719
0
                               POP_END);
1720
0
        break;
1721
0
    case UNICODE_PROP_XID_Continue:
1722
0
        ret = unicode_prop_ops(cr,
1723
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1724
0
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1725
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1726
0
                               POP_UNION,
1727
0
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1728
0
                               POP_UNION,
1729
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1730
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1731
0
                               POP_UNION,
1732
0
                               POP_PROP, UNICODE_PROP_XID_Continue1,
1733
0
                               POP_UNION,
1734
0
                               POP_INVERT,
1735
0
                               POP_INTER,
1736
0
                               POP_END);
1737
0
        break;
1738
0
    case UNICODE_PROP_Changes_When_Uppercased:
1739
0
        ret = unicode_case1(cr, CASE_U);
1740
0
        break;
1741
0
    case UNICODE_PROP_Changes_When_Lowercased:
1742
0
        ret = unicode_case1(cr, CASE_L);
1743
0
        break;
1744
0
    case UNICODE_PROP_Changes_When_Casemapped:
1745
0
        ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1746
0
        break;
1747
0
    case UNICODE_PROP_Changes_When_Titlecased:
1748
0
        ret = unicode_prop_ops(cr,
1749
0
                               POP_CASE, CASE_U,
1750
0
                               POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1751
0
                               POP_XOR,
1752
0
                               POP_END);
1753
0
        break;
1754
0
    case UNICODE_PROP_Changes_When_Casefolded:
1755
0
        ret = unicode_prop_ops(cr,
1756
0
                               POP_CASE, CASE_F,
1757
0
                               POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1758
0
                               POP_XOR,
1759
0
                               POP_END);
1760
0
        break;
1761
0
    case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1762
0
        ret = unicode_prop_ops(cr,
1763
0
                               POP_CASE, CASE_F,
1764
0
                               POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1765
0
                               POP_XOR,
1766
0
                               POP_END);
1767
0
        break;
1768
#if 0
1769
    case UNICODE_PROP_ID_Start:
1770
        ret = unicode_prop_ops(cr,
1771
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1772
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1773
                               POP_UNION,
1774
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1775
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1776
                               POP_UNION,
1777
                               POP_INVERT,
1778
                               POP_INTER,
1779
                               POP_END);
1780
        break;
1781
    case UNICODE_PROP_ID_Continue:
1782
        ret = unicode_prop_ops(cr,
1783
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1784
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1785
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1786
                               POP_UNION,
1787
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1788
                               POP_UNION,
1789
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1790
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1791
                               POP_UNION,
1792
                               POP_INVERT,
1793
                               POP_INTER,
1794
                               POP_END);
1795
        break;
1796
    case UNICODE_PROP_Case_Ignorable:
1797
        ret = unicode_prop_ops(cr,
1798
                               POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1799
                               POP_PROP, UNICODE_PROP_Case_Ignorable1,
1800
                               POP_XOR,
1801
                               POP_END);
1802
        break;
1803
#else
1804
        /* we use the existing tables */
1805
0
    case UNICODE_PROP_ID_Continue:
1806
0
        ret = unicode_prop_ops(cr,
1807
0
                               POP_PROP, UNICODE_PROP_ID_Start,
1808
0
                               POP_PROP, UNICODE_PROP_ID_Continue1,
1809
0
                               POP_XOR,
1810
0
                               POP_END);
1811
0
        break;
1812
0
#endif
1813
0
    default:
1814
0
        if (prop_idx >= countof(unicode_prop_table))
1815
0
            return -2;
1816
0
        ret = unicode_prop1(cr, prop_idx);
1817
0
        break;
1818
0
    }
1819
0
    return ret;
1820
0
}
1821
1822
#endif /* CONFIG_ALL_UNICODE */
1823
1824
/*---- lre codepoint categorizing functions ----*/
1825
1826
#define S  UNICODE_C_SPACE
1827
#define D  UNICODE_C_DIGIT
1828
#define X  UNICODE_C_XDIGIT
1829
#define U  UNICODE_C_UPPER
1830
#define L  UNICODE_C_LOWER
1831
#define _  UNICODE_C_UNDER
1832
#define d  UNICODE_C_DOLLAR
1833
1834
uint8_t const lre_ctype_bits[256] = {
1835
    0, 0, 0, 0, 0, 0, 0, 0,
1836
    0, S, S, S, S, S, 0, 0,
1837
    0, 0, 0, 0, 0, 0, 0, 0,
1838
    0, 0, 0, 0, 0, 0, 0, 0,
1839
1840
    S, 0, 0, 0, d, 0, 0, 0,
1841
    0, 0, 0, 0, 0, 0, 0, 0,
1842
    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1843
    X|D, X|D, 0, 0, 0, 0, 0, 0,
1844
1845
    0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1846
    U, U, U, U, U, U, U, U,
1847
    U, U, U, U, U, U, U, U,
1848
    U, U, U, 0, 0, 0, 0, _,
1849
1850
    0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1851
    L, L, L, L, L, L, L, L,
1852
    L, L, L, L, L, L, L, L,
1853
    L, L, L, 0, 0, 0, 0, 0,
1854
1855
    0, 0, 0, 0, 0, 0, 0, 0,
1856
    0, 0, 0, 0, 0, 0, 0, 0,
1857
    0, 0, 0, 0, 0, 0, 0, 0,
1858
    0, 0, 0, 0, 0, 0, 0, 0,
1859
1860
    S, 0, 0, 0, 0, 0, 0, 0,
1861
    0, 0, 0, 0, 0, 0, 0, 0,
1862
    0, 0, 0, 0, 0, 0, 0, 0,
1863
    0, 0, 0, 0, 0, 0, 0, 0,
1864
1865
    0, 0, 0, 0, 0, 0, 0, 0,
1866
    0, 0, 0, 0, 0, 0, 0, 0,
1867
    0, 0, 0, 0, 0, 0, 0, 0,
1868
    0, 0, 0, 0, 0, 0, 0, 0,
1869
1870
    0, 0, 0, 0, 0, 0, 0, 0,
1871
    0, 0, 0, 0, 0, 0, 0, 0,
1872
    0, 0, 0, 0, 0, 0, 0, 0,
1873
    0, 0, 0, 0, 0, 0, 0, 0,
1874
};
1875
1876
#undef S
1877
#undef D
1878
#undef X
1879
#undef U
1880
#undef L
1881
#undef _
1882
#undef d
1883
1884
/* code point ranges for Zs,Zl or Zp property */
1885
static const uint16_t char_range_s[] = {
1886
    10,
1887
    0x0009, 0x000D + 1,
1888
    0x0020, 0x0020 + 1,
1889
    0x00A0, 0x00A0 + 1,
1890
    0x1680, 0x1680 + 1,
1891
    0x2000, 0x200A + 1,
1892
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
1893
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
1894
    0x2028, 0x2029 + 1,
1895
    0x202F, 0x202F + 1,
1896
    0x205F, 0x205F + 1,
1897
    0x3000, 0x3000 + 1,
1898
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
1899
    0xFEFF, 0xFEFF + 1,
1900
};
1901
1902
BOOL lre_is_space_non_ascii(uint32_t c)
1903
0
{
1904
0
    size_t i, n;
1905
1906
0
    n = countof(char_range_s);
1907
0
    for(i = 5; i < n; i += 2) {
1908
0
        uint32_t low = char_range_s[i];
1909
0
        uint32_t high = char_range_s[i + 1];
1910
0
        if (c < low)
1911
0
            return FALSE;
1912
0
        if (c < high)
1913
0
            return TRUE;
1914
0
    }
1915
0
    return FALSE;
1916
0
}
1917
1918
#define SEQ_MAX_LEN 16
1919
1920
static int unicode_sequence_prop1(int seq_prop_idx, UnicodeSequencePropCB *cb, void *opaque,
1921
                                  CharRange *cr)
1922
0
{
1923
0
    int i, c, j;
1924
0
    uint32_t seq[SEQ_MAX_LEN];
1925
    
1926
0
    switch(seq_prop_idx) {
1927
0
    case UNICODE_SEQUENCE_PROP_Basic_Emoji:
1928
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji1) < 0)
1929
0
            return -1;
1930
0
        for(i = 0; i < cr->len; i += 2) {
1931
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1932
0
                seq[0] = c;
1933
0
                cb(opaque, seq, 1);
1934
0
            }
1935
0
        }
1936
1937
0
        cr->len = 0;
1938
1939
0
        if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji2) < 0)
1940
0
            return -1;
1941
0
        for(i = 0; i < cr->len; i += 2) {
1942
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1943
0
                seq[0] = c;
1944
0
                seq[1] = 0xfe0f;
1945
0
                cb(opaque, seq, 2);
1946
0
            }
1947
0
        }
1948
1949
0
        break;
1950
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
1951
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Modifier_Base) < 0)
1952
0
            return -1;
1953
0
        for(i = 0; i < cr->len; i += 2) {
1954
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1955
0
                for(j = 0; j < 5; j++) {
1956
0
                    seq[0] = c;
1957
0
                    seq[1] = 0x1f3fb + j;
1958
0
                    cb(opaque, seq, 2);
1959
0
                }
1960
0
            }
1961
0
        }
1962
0
        break;
1963
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
1964
0
        if (unicode_prop1(cr, UNICODE_PROP_RGI_Emoji_Flag_Sequence) < 0)
1965
0
            return -1;
1966
0
        for(i = 0; i < cr->len; i += 2) {
1967
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
1968
0
                int c0, c1;
1969
0
                c0 = c / 26;
1970
0
                c1 = c % 26;
1971
0
                seq[0] = 0x1F1E6 + c0;
1972
0
                seq[1] = 0x1F1E6 + c1;
1973
0
                cb(opaque, seq, 2);
1974
0
            }
1975
0
        }
1976
0
        break;
1977
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
1978
0
        {
1979
0
            int len, code, pres, k, mod, mod_count, mod_pos[2], hc_pos, n_mod, n_hc, mod1;
1980
0
            int mod_idx, hc_idx, i0, i1;
1981
0
            const uint8_t *tab = unicode_rgi_emoji_zwj_sequence;
1982
            
1983
0
            for(i = 0; i < countof(unicode_rgi_emoji_zwj_sequence);) {
1984
0
                len = tab[i++];
1985
0
                k = 0;
1986
0
                mod = 0;
1987
0
                mod_count = 0;
1988
0
                hc_pos = -1;
1989
0
                for(j = 0; j < len; j++) {
1990
0
                    code = tab[i++];
1991
0
                    code |= tab[i++] << 8;
1992
0
                    pres = code >> 15;
1993
0
                    mod1 = (code >> 13) & 3;
1994
0
                    code &= 0x1fff;
1995
0
                    if (code < 0x1000) {
1996
0
                        c = code + 0x2000;
1997
0
                    } else {
1998
0
                        c = 0x1f000 + (code - 0x1000);
1999
0
                    }
2000
0
                    if (c == 0x1f9b0)
2001
0
                        hc_pos = k;
2002
0
                    seq[k++] = c;
2003
0
                    if (mod1 != 0) {
2004
0
                        assert(mod_count < 2);
2005
0
                        mod = mod1;
2006
0
                        mod_pos[mod_count++] = k;
2007
0
                        seq[k++] = 0; /* will be filled later */
2008
0
                    }
2009
0
                    if (pres) {
2010
0
                        seq[k++] = 0xfe0f;
2011
0
                    }
2012
0
                    if (j < len - 1) {
2013
0
                        seq[k++] = 0x200d;
2014
0
                    }
2015
0
                }
2016
2017
                /* genrate all the variants */
2018
0
                switch(mod) {
2019
0
                case 1:
2020
0
                    n_mod = 5;
2021
0
                    break;
2022
0
                case 2:
2023
0
                    n_mod = 25;
2024
0
                    break;
2025
0
                case 3:
2026
0
                    n_mod = 20;
2027
0
                    break;
2028
0
                default:
2029
0
                    n_mod = 1;
2030
0
                    break;
2031
0
                }
2032
0
                if (hc_pos >= 0)
2033
0
                    n_hc = 4;
2034
0
                else
2035
0
                    n_hc = 1;
2036
0
                for(hc_idx = 0; hc_idx < n_hc; hc_idx++) {
2037
0
                    for(mod_idx = 0; mod_idx < n_mod; mod_idx++) {
2038
0
                        if (hc_pos >= 0)
2039
0
                            seq[hc_pos] = 0x1f9b0 + hc_idx;
2040
                        
2041
0
                        switch(mod) {
2042
0
                        case 1:
2043
0
                            seq[mod_pos[0]] = 0x1f3fb + mod_idx;
2044
0
                            break;
2045
0
                        case 2:
2046
0
                        case 3:
2047
0
                            i0 = mod_idx / 5;
2048
0
                            i1 = mod_idx % 5;
2049
                            /* avoid identical values */
2050
0
                            if (mod == 3 && i0 >= i1)
2051
0
                                i0++;
2052
0
                            seq[mod_pos[0]] = 0x1f3fb + i0;
2053
0
                            seq[mod_pos[1]] = 0x1f3fb + i1;
2054
0
                            break;
2055
0
                        default:
2056
0
                            break;
2057
0
                        }
2058
#if 0
2059
                        for(j = 0; j < k; j++)
2060
                            printf(" %04x", seq[j]);
2061
                        printf("\n");
2062
#endif                
2063
0
                        cb(opaque, seq, k);
2064
0
                    }
2065
0
                }
2066
0
            }
2067
0
        }
2068
0
        break;
2069
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
2070
0
        {
2071
0
            for(i = 0; i < countof(unicode_rgi_emoji_tag_sequence);) {
2072
0
                j = 0;
2073
0
                seq[j++] = 0x1F3F4;
2074
0
                for(;;) {
2075
0
                    c = unicode_rgi_emoji_tag_sequence[i++];
2076
0
                    if (c == 0x00)
2077
0
                        break;
2078
0
                    seq[j++] = 0xe0000 + c;
2079
0
                }
2080
0
                seq[j++] = 0xe007f;
2081
0
                cb(opaque, seq, j);
2082
0
            }
2083
0
        }
2084
0
        break;
2085
0
    case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence:
2086
0
        if (unicode_prop1(cr, UNICODE_PROP_Emoji_Keycap_Sequence) < 0)
2087
0
            return -1;
2088
0
        for(i = 0; i < cr->len; i += 2) {
2089
0
            for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
2090
0
                seq[0] = c;
2091
0
                seq[1] = 0xfe0f;
2092
0
                seq[2] = 0x20e3;
2093
0
                cb(opaque, seq, 3);
2094
0
            }
2095
0
        }
2096
0
        break;
2097
0
    case UNICODE_SEQUENCE_PROP_RGI_Emoji:
2098
        /* all prevous sequences */
2099
0
        for(i = UNICODE_SEQUENCE_PROP_Basic_Emoji; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence; i++) {
2100
0
            int ret;
2101
0
            ret = unicode_sequence_prop1(i, cb, opaque, cr);
2102
0
            if (ret < 0)
2103
0
                return ret;
2104
0
            cr->len = 0;
2105
0
        }
2106
0
        break;
2107
0
    default:
2108
0
        return -2;
2109
0
    }
2110
0
    return 0;
2111
0
}
2112
2113
/* build a unicode sequence property */
2114
/* return -2 if not found, -1 if other error. 'cr' is used as temporary memory. */
2115
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
2116
                          CharRange *cr)
2117
0
{
2118
0
    int seq_prop_idx;
2119
0
    seq_prop_idx = unicode_find_name(unicode_sequence_prop_name_table, prop_name);
2120
0
    if (seq_prop_idx < 0)
2121
0
        return -2;
2122
0
    return unicode_sequence_prop1(seq_prop_idx, cb, opaque, cr);
2123
0
}