Coverage Report

Created: 2024-10-03 06:52

/src/quickjs/libunicode.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Unicode utilities
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <string.h>
28
#include <assert.h>
29
30
#include "cutils.h"
31
#include "libunicode.h"
32
#include "libunicode-table.h"
33
34
/* Run types of the case conversion table. The type is read from a
   4 bit field of each case_conv_table1 entry and selects the
   conversion rule applied by lre_case_conv_entry(). */
enum {
35
    RUN_TYPE_U,
36
    RUN_TYPE_L,
37
    RUN_TYPE_UF,
38
    RUN_TYPE_LF,
39
    RUN_TYPE_UL,
40
    RUN_TYPE_LSU,
41
    RUN_TYPE_U2L_399_EXT2,
42
    RUN_TYPE_UF_D20,
43
    RUN_TYPE_UF_D1_EXT,
44
    RUN_TYPE_U_EXT,
45
    RUN_TYPE_LF_EXT,
46
    RUN_TYPE_UF_EXT2,
47
    RUN_TYPE_LF_EXT2,
48
    RUN_TYPE_UF_EXT3,
49
};
50
51
/* Convert 'c' with lre_case_conv() and return only the first code
   point of the result ('conv_type' is passed through unchanged). */
static int lre_case_conv1(uint32_t c, int conv_type)
{
    uint32_t out[LRE_CC_RES_LEN_MAX];

    (void)lre_case_conv(out, c, conv_type);
    return out[0];
}
57
58
/* case conversion using the table entry 'idx' with value 'v' */
59
static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
60
0
{
61
0
    uint32_t code, data, type, a, is_lower;
62
0
    is_lower = (conv_type != 0);
63
0
    type = (v >> (32 - 17 - 7 - 4)) & 0xf;
64
0
    data = ((v & 0xf) << 8) | case_conv_table2[idx];
65
0
    code = v >> (32 - 17);
66
0
    switch(type) {
67
0
    case RUN_TYPE_U:
68
0
    case RUN_TYPE_L:
69
0
    case RUN_TYPE_UF:
70
0
    case RUN_TYPE_LF:
71
0
        if (conv_type == (type & 1) ||
72
0
            (type >= RUN_TYPE_UF && conv_type == 2)) {
73
0
            c = c - code + (case_conv_table1[data] >> (32 - 17));
74
0
        }
75
0
        break;
76
0
    case RUN_TYPE_UL:
77
0
        a = c - code;
78
0
        if ((a & 1) != (1 - is_lower))
79
0
            break;
80
0
        c = (a ^ 1) + code;
81
0
        break;
82
0
    case RUN_TYPE_LSU:
83
0
        a = c - code;
84
0
        if (a == 1) {
85
0
            c += 2 * is_lower - 1;
86
0
        } else if (a == (1 - is_lower) * 2) {
87
0
            c += (2 * is_lower - 1) * 2;
88
0
        }
89
0
        break;
90
0
    case RUN_TYPE_U2L_399_EXT2:
91
0
        if (!is_lower) {
92
0
            res[0] = c - code + case_conv_ext[data >> 6];
93
0
            res[1] = 0x399;
94
0
            return 2;
95
0
        } else {
96
0
            c = c - code + case_conv_ext[data & 0x3f];
97
0
        }
98
0
        break;
99
0
    case RUN_TYPE_UF_D20:
100
0
        if (conv_type == 1)
101
0
            break;
102
0
        c = data + (conv_type == 2) * 0x20;
103
0
        break;
104
0
    case RUN_TYPE_UF_D1_EXT:
105
0
        if (conv_type == 1)
106
0
            break;
107
0
        c = case_conv_ext[data] + (conv_type == 2);
108
0
        break;
109
0
    case RUN_TYPE_U_EXT:
110
0
    case RUN_TYPE_LF_EXT:
111
0
        if (is_lower != (type - RUN_TYPE_U_EXT))
112
0
            break;
113
0
        c = case_conv_ext[data];
114
0
        break;
115
0
    case RUN_TYPE_LF_EXT2:
116
0
        if (!is_lower)
117
0
            break;
118
0
        res[0] = c - code + case_conv_ext[data >> 6];
119
0
        res[1] = case_conv_ext[data & 0x3f];
120
0
        return 2;
121
0
    case RUN_TYPE_UF_EXT2:
122
0
        if (conv_type == 1)
123
0
            break;
124
0
        res[0] = c - code + case_conv_ext[data >> 6];
125
0
        res[1] = case_conv_ext[data & 0x3f];
126
0
        if (conv_type == 2) {
127
            /* convert to lower */
128
0
            res[0] = lre_case_conv1(res[0], 1);
129
0
            res[1] = lre_case_conv1(res[1], 1);
130
0
        }
131
0
        return 2;
132
0
    default:
133
0
    case RUN_TYPE_UF_EXT3:
134
0
        if (conv_type == 1)
135
0
            break;
136
0
        res[0] = case_conv_ext[data >> 8];
137
0
        res[1] = case_conv_ext[(data >> 4) & 0xf];
138
0
        res[2] = case_conv_ext[data & 0xf];
139
0
        if (conv_type == 2) {
140
            /* convert to lower */
141
0
            res[0] = lre_case_conv1(res[0], 1);
142
0
            res[1] = lre_case_conv1(res[1], 1);
143
0
            res[2] = lre_case_conv1(res[2], 1);
144
0
        }
145
0
        return 3;
146
0
    }
147
0
    res[0] = c;
148
0
    return 1;
149
0
}
150
151
/* conv_type:
152
   0 = to upper
153
   1 = to lower
154
   2 = case folding (= to lower with modifications)
155
*/
156
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
157
0
{
158
0
    if (c < 128) {
159
0
        if (conv_type) {
160
0
            if (c >= 'A' && c <= 'Z') {
161
0
                c = c - 'A' + 'a';
162
0
            }
163
0
        } else {
164
0
            if (c >= 'a' && c <= 'z') {
165
0
                c = c - 'a' + 'A';
166
0
            }
167
0
        }
168
0
    } else {
169
0
        uint32_t v, code, len;
170
0
        int idx, idx_min, idx_max;
171
172
0
        idx_min = 0;
173
0
        idx_max = countof(case_conv_table1) - 1;
174
0
        while (idx_min <= idx_max) {
175
0
            idx = (unsigned)(idx_max + idx_min) / 2;
176
0
            v = case_conv_table1[idx];
177
0
            code = v >> (32 - 17);
178
0
            len = (v >> (32 - 17 - 7)) & 0x7f;
179
0
            if (c < code) {
180
0
                idx_max = idx - 1;
181
0
            } else if (c >= code + len) {
182
0
                idx_min = idx + 1;
183
0
            } else {
184
0
                return lre_case_conv_entry(res, c, conv_type, idx, v);
185
0
            }
186
0
        }
187
0
    }
188
0
    res[0] = c;
189
0
    return 1;
190
0
}
191
192
/* Canonicalize 'c' with the case conversion table entry 'idx' of
   value 'v'. With 'is_unicode' the case folding is used, otherwise the
   upper case conversion. The result is always a single code point. */
static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
{
    uint32_t out[LRE_CC_RES_LEN_MAX];
    int n;

    if (!is_unicode) {
        if (likely(c < 128)) {
            if (c >= 'a' && c <= 'z')
                c = c - 'a' + 'A';
            return c;
        }
        /* legacy regexp: to upper case if single char >= 128 */
        n = lre_case_conv_entry(out, c, FALSE, idx, v);
        if (n == 1 && out[0] >= 128)
            c = out[0];
        return c;
    }

    n = lre_case_conv_entry(out, c, 2, idx, v);
    if (n == 1)
        return out[0];

    /* handle the few specific multi-character cases (see
       unicode_gen.c:dump_case_folding_special_cases()) */
    switch(c) {
    case 0xfb06:
        c = 0xfb05;
        break;
    case 0x01fd3:
        c = 0x390;
        break;
    case 0x01fe3:
        c = 0x3b0;
        break;
    default:
        break;
    }
    return c;
}
225
226
/* JS regexp specific rules for case folding */
227
int lre_canonicalize(uint32_t c, BOOL is_unicode)
228
0
{
229
0
    if (c < 128) {
230
        /* fast case */
231
0
        if (is_unicode) {
232
0
            if (c >= 'A' && c <= 'Z') {
233
0
                c = c - 'A' + 'a';
234
0
            }
235
0
        } else {
236
0
            if (c >= 'a' && c <= 'z') {
237
0
                c = c - 'a' + 'A';
238
0
            }
239
0
        }
240
0
    } else {
241
0
        uint32_t v, code, len;
242
0
        int idx, idx_min, idx_max;
243
244
0
        idx_min = 0;
245
0
        idx_max = countof(case_conv_table1) - 1;
246
0
        while (idx_min <= idx_max) {
247
0
            idx = (unsigned)(idx_max + idx_min) / 2;
248
0
            v = case_conv_table1[idx];
249
0
            code = v >> (32 - 17);
250
0
            len = (v >> (32 - 17 - 7)) & 0x7f;
251
0
            if (c < code) {
252
0
                idx_max = idx - 1;
253
0
            } else if (c >= code + len) {
254
0
                idx_min = idx + 1;
255
0
            } else {
256
0
                return lre_case_folding_entry(c, idx, v, is_unicode);
257
0
            }
258
0
        }
259
0
    }
260
0
    return c;
261
0
}
262
263
/* read a 24 bit little endian value */
static uint32_t get_le24(const uint8_t *ptr)
{
    uint32_t b0 = ptr[0];
    uint32_t b1 = ptr[1];
    uint32_t b2 = ptr[2];

    return b0 | (b1 << 8) | (b2 << 16);
}
267
268
0
#define UNICODE_INDEX_BLOCK_LEN 32
269
270
/* return -1 if not in table, otherwise the offset in the block */
271
static int get_index_pos(uint32_t *pcode, uint32_t c,
272
                         const uint8_t *index_table, int index_table_len)
273
0
{
274
0
    uint32_t code, v;
275
0
    int idx_min, idx_max, idx;
276
277
0
    idx_min = 0;
278
0
    v = get_le24(index_table);
279
0
    code = v & ((1 << 21) - 1);
280
0
    if (c < code) {
281
0
        *pcode = 0;
282
0
        return 0;
283
0
    }
284
0
    idx_max = index_table_len - 1;
285
0
    code = get_le24(index_table + idx_max * 3);
286
0
    if (c >= code)
287
0
        return -1;
288
    /* invariant: tab[idx_min] <= c < tab2[idx_max] */
289
0
    while ((idx_max - idx_min) > 1) {
290
0
        idx = (idx_max + idx_min) / 2;
291
0
        v = get_le24(index_table + idx * 3);
292
0
        code = v & ((1 << 21) - 1);
293
0
        if (c < code) {
294
0
            idx_max = idx;
295
0
        } else {
296
0
            idx_min = idx;
297
0
        }
298
0
    }
299
0
    v = get_le24(index_table + idx_min * 3);
300
0
    *pcode = v & ((1 << 21) - 1);
301
0
    return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
302
0
}
303
304
static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305
                            const uint8_t *index_table, int index_table_len)
306
0
{
307
0
    uint32_t code, b, bit;
308
0
    int pos;
309
0
    const uint8_t *p;
310
311
0
    pos = get_index_pos(&code, c, index_table, index_table_len);
312
0
    if (pos < 0)
313
0
        return FALSE; /* outside the table */
314
0
    p = table + pos;
315
0
    bit = 0;
316
    /* Compressed run length encoding:
317
       00..3F: 2 packed lengths: 3-bit + 3-bit
318
       40..5F: 5-bits plus extra byte for length
319
       60..7F: 5-bits plus 2 extra bytes for length
320
       80..FF: 7-bit length
321
       lengths must be incremented to get character count
322
       Ranges alternate between false and true return value.
323
     */
324
0
    for(;;) {
325
0
        b = *p++;
326
0
        if (b < 64) {
327
0
            code += (b >> 3) + 1;
328
0
            if (c < code)
329
0
                return bit;
330
0
            bit ^= 1;
331
0
            code += (b & 7) + 1;
332
0
        } else if (b >= 0x80) {
333
0
            code += b - 0x80 + 1;
334
0
        } else if (b < 0x60) {
335
0
            code += (((b - 0x40) << 8) | p[0]) + 1;
336
0
            p++;
337
0
        } else {
338
0
            code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
339
0
            p += 2;
340
0
        }
341
0
        if (c < code)
342
0
            return bit;
343
0
        bit ^= 1;
344
0
    }
345
0
}
346
347
/* Return TRUE if 'c' is inside a run of the case conversion table or
   belongs to the Cased1 property table. */
BOOL lre_is_cased(uint32_t c)
{
    uint32_t entry, first, count;
    int lo, hi, mid;

    for(lo = 0, hi = countof(case_conv_table1) - 1; lo <= hi; ) {
        mid = (unsigned)(hi + lo) / 2;
        entry = case_conv_table1[mid];
        first = entry >> (32 - 17);
        count = (entry >> (32 - 17 - 7)) & 0x7f;
        if (c < first)
            hi = mid - 1;
        else if (c >= first + count)
            lo = mid + 1;
        else
            return TRUE;
    }
    /* not in the conversion table: use the additional property table */
    return lre_is_in_table(c, unicode_prop_Cased1_table,
                           unicode_prop_Cased1_index,
                           sizeof(unicode_prop_Cased1_index) / 3);
}
371
372
/* Return TRUE if 'c' belongs to the Case_Ignorable property table. */
BOOL lre_is_case_ignorable(uint32_t c)
{
    /* each index entry is 3 bytes long */
    const int index_len = sizeof(unicode_prop_Case_Ignorable_index) / 3;

    return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
                           unicode_prop_Case_Ignorable_index, index_len);
}
378
379
/* character range */
380
381
/* debug helper: print every point of 'cr' with its index */
static __maybe_unused void cr_dump(CharRange *cr)
{
    int idx = 0;

    while (idx < cr->len) {
        printf("%d: 0x%04x\n", idx, cr->points[idx]);
        idx++;
    }
}
387
388
/* Default allocation function of a CharRange ('opaque' is unused).
   A zero 'size' releases 'ptr' and returns NULL: cr_free() relies on
   it, and the result of realloc(ptr, 0) is not portable. */
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
{
    (void)opaque;
    if (size == 0) {
        free(ptr);
        return NULL;
    }
    return realloc(ptr, size);
}
392
393
/* Initialize 'cr' as an empty range without buffer. 'realloc_func'
   may be NULL, in which case cr_default_realloc() is used. */
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
{
    if (!realloc_func)
        realloc_func = cr_default_realloc;
    cr->realloc_func = realloc_func;
    cr->mem_opaque = mem_opaque;
    cr->points = NULL;
    cr->size = 0;
    cr->len = 0;
}
400
401
/* Release the buffer of 'cr' by asking the allocation function for a
   zero size. The fields of 'cr' are not reset. */
void cr_free(CharRange *cr)
{
    void *buf = cr->points;

    cr->realloc_func(cr->mem_opaque, buf, 0);
}
405
406
/* Ensure that the buffer of 'cr' can hold at least 'size' points.
   The buffer grows by at least 50%. Return 0 if OK, -1 if the
   allocation failed ('cr' is then unchanged). */
int cr_realloc(CharRange *cr, int size)
{
    uint32_t *grown;
    int capacity;

    if (size <= cr->size)
        return 0; /* already large enough */

    capacity = max_int(size, cr->size * 3 / 2);
    grown = cr->realloc_func(cr->mem_opaque, cr->points,
                             capacity * sizeof(cr->points[0]));
    if (grown == NULL)
        return -1;
    cr->size = capacity;
    cr->points = grown;
    return 0;
}
422
423
/* Copy the points of 'cr1' into 'cr'. Return 0 if OK, -1 if the
   buffer of 'cr' could not be enlarged ('cr' is then unchanged). */
int cr_copy(CharRange *cr, const CharRange *cr1)
{
    if (cr_realloc(cr, cr1->len))
        return -1;
    /* when 'cr1' is empty both buffers may still be NULL, and passing
       a null pointer to memcpy() is undefined even for a zero length */
    if (cr1->len > 0)
        memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
    cr->len = cr1->len;
    return 0;
}
431
432
/* merge consecutive intervals and remove empty intervals */
433
static void cr_compress(CharRange *cr)
434
1
{
435
1
    int i, j, k, len;
436
1
    uint32_t *pt;
437
438
1
    pt = cr->points;
439
1
    len = cr->len;
440
1
    i = 0;
441
1
    j = 0;
442
1
    k = 0;
443
12
    while ((i + 1) < len) {
444
11
        if (pt[i] == pt[i + 1]) {
445
            /* empty interval */
446
0
            i += 2;
447
11
        } else {
448
11
            j = i;
449
11
            while ((j + 3) < len && pt[j + 1] == pt[j + 2])
450
0
                j += 2;
451
            /* just copy */
452
11
            pt[k] = pt[i];
453
11
            pt[k + 1] = pt[j + 1];
454
11
            k += 2;
455
11
            i = j + 2;
456
11
        }
457
11
    }
458
1
    cr->len = k;
459
1
}
460
461
/* union or intersection */
462
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
463
          const uint32_t *b_pt, int b_len, int op)
464
0
{
465
0
    int a_idx, b_idx, is_in;
466
0
    uint32_t v;
467
468
0
    a_idx = 0;
469
0
    b_idx = 0;
470
0
    for(;;) {
471
        /* get one more point from a or b in increasing order */
472
0
        if (a_idx < a_len && b_idx < b_len) {
473
0
            if (a_pt[a_idx] < b_pt[b_idx]) {
474
0
                goto a_add;
475
0
            } else if (a_pt[a_idx] == b_pt[b_idx]) {
476
0
                v = a_pt[a_idx];
477
0
                a_idx++;
478
0
                b_idx++;
479
0
            } else {
480
0
                goto b_add;
481
0
            }
482
0
        } else if (a_idx < a_len) {
483
0
        a_add:
484
0
            v = a_pt[a_idx++];
485
0
        } else if (b_idx < b_len) {
486
0
        b_add:
487
0
            v = b_pt[b_idx++];
488
0
        } else {
489
0
            break;
490
0
        }
491
        /* add the point if the in/out status changes */
492
0
        switch(op) {
493
0
        case CR_OP_UNION:
494
0
            is_in = (a_idx & 1) | (b_idx & 1);
495
0
            break;
496
0
        case CR_OP_INTER:
497
0
            is_in = (a_idx & 1) & (b_idx & 1);
498
0
            break;
499
0
        case CR_OP_XOR:
500
0
            is_in = (a_idx & 1) ^ (b_idx & 1);
501
0
            break;
502
0
        default:
503
0
            abort();
504
0
        }
505
0
        if (is_in != (cr->len & 1)) {
506
0
            if (cr_add_point(cr, v))
507
0
                return -1;
508
0
        }
509
0
    }
510
0
    cr_compress(cr);
511
0
    return 0;
512
0
}
513
514
/* Replace 'cr' by the union of 'cr' and the 'b_len' points of 'b_pt'.
   The previous buffer of 'cr' is always released. Return the status
   of cr_op(). */
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
{
    CharRange old = *cr; /* keeps the previous buffer */
    int status;

    /* restart from an empty range using the same allocator */
    cr->points = NULL;
    cr->size = 0;
    cr->len = 0;
    status = cr_op(cr, old.points, old.len, b_pt, b_len, CR_OP_UNION);
    cr_free(&old);
    return status;
}
525
526
/* Replace 'cr' by its complement: the point 0 is inserted at the
   start and UINT32_MAX at the end, then the result is compressed.
   Return 0 if OK, -1 if the buffer could not be enlarged. */
int cr_invert(CharRange *cr)
{
    const int old_len = cr->len;
    uint32_t *pt;

    if (cr_realloc(cr, old_len + 2))
        return -1;
    pt = cr->points;
    /* shift the existing points by one slot */
    memmove(pt + 1, pt, old_len * sizeof(pt[0]));
    pt[0] = 0;
    pt[old_len + 1] = UINT32_MAX;
    cr->len = old_len + 2;
    cr_compress(cr);
    return 0;
}
539
540
#ifdef CONFIG_ALL_UNICODE
541
542
/* Return TRUE if 'c' belongs to the ID_Start property table. */
BOOL lre_is_id_start(uint32_t c)
{
    /* each index entry is 3 bytes long */
    const int index_len = sizeof(unicode_prop_ID_Start_index) / 3;

    return lre_is_in_table(c, unicode_prop_ID_Start_table,
                           unicode_prop_ID_Start_index, index_len);
}
548
549
/* Return TRUE if 'c' is accepted by lre_is_id_start() or belongs to
   the ID_Continue1 property table. */
BOOL lre_is_id_continue(uint32_t c)
{
    BOOL found;

    found = lre_is_id_start(c);
    if (!found) {
        found = lre_is_in_table(c, unicode_prop_ID_Continue1_table,
                                unicode_prop_ID_Continue1_index,
                                sizeof(unicode_prop_ID_Continue1_index) / 3);
    }
    return found != 0;
}
556
557
#define UNICODE_DECOMP_LEN_MAX 18
558
559
/* Encodings of the decomposition data. The type is read from a 6 bit
   field of each unicode_decomp_table1 entry (see
   unicode_decomp_char()) and is interpreted by
   unicode_decomp_entry(). The numeric suffix of most names is the
   number of code points produced by the decomposition. */
typedef enum {
560
    DECOMP_TYPE_C1, /* 16 bit char */
561
    DECOMP_TYPE_L1, /* 16 bit char table */
562
    DECOMP_TYPE_L2,
563
    DECOMP_TYPE_L3,
564
    DECOMP_TYPE_L4,
565
    DECOMP_TYPE_L5, /* XXX: not used */
566
    DECOMP_TYPE_L6, /* XXX: could remove */
567
    DECOMP_TYPE_L7, /* XXX: could remove */
568
    DECOMP_TYPE_LL1, /* 18 bit char table */
569
    DECOMP_TYPE_LL2,
570
    DECOMP_TYPE_S1, /* 8 bit char table */
571
    DECOMP_TYPE_S2,
572
    DECOMP_TYPE_S3,
573
    DECOMP_TYPE_S4,
574
    DECOMP_TYPE_S5,
575
    DECOMP_TYPE_I1, /* increment 16 bit char value */
576
    DECOMP_TYPE_I2_0,
577
    DECOMP_TYPE_I2_1,
578
    DECOMP_TYPE_I3_1,
579
    DECOMP_TYPE_I3_2,
580
    DECOMP_TYPE_I4_1,
581
    DECOMP_TYPE_I4_2,
582
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
583
    DECOMP_TYPE_B2,
584
    DECOMP_TYPE_B3,
585
    DECOMP_TYPE_B4,
586
    DECOMP_TYPE_B5,
587
    DECOMP_TYPE_B6,
588
    DECOMP_TYPE_B7,
589
    DECOMP_TYPE_B8,
590
    DECOMP_TYPE_B18,
591
    DECOMP_TYPE_LS2,
592
    DECOMP_TYPE_PAT3,
593
    DECOMP_TYPE_S2_UL,
594
    DECOMP_TYPE_LS2_UL,
595
} DecompTypeEnum;
596
597
/* Expand an 8 bit code of the decomposition data: values below 0x80
   are returned unchanged, the next 0x50 values map to 0x300 and
   above, and the following values index a 2 entry table. */
static uint32_t unicode_get_short_code(uint32_t c)
{
    static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };

    if (c >= 0x80 + 0x50)
        return unicode_short_table[c - 0x80 - 0x50];
    if (c >= 0x80)
        return c - 0x80 + 0x300;
    return c;
}
608
609
/* Simplified lower case conversion used by the decomposition data:
   add 0x20 for codes below 0x100 and for 0x410..0x42f, add 1 for
   every other code. */
static uint32_t unicode_get_lower_simple(uint32_t c)
{
    uint32_t delta = 1;

    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
        delta = 0x20;
    return c + delta;
}
617
618
/* read a 16 bit little endian value */
static uint16_t unicode_get16(const uint8_t *p)
{
    uint16_t lo = p[0];
    uint16_t hi = p[1];

    return (hi << 8) | lo;
}
622
623
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
624
                                int idx, uint32_t code, uint32_t len,
625
                                uint32_t type)
626
0
{
627
0
    uint32_t c1;
628
0
    int l, i, p;
629
0
    const uint8_t *d;
630
631
0
    if (type == DECOMP_TYPE_C1) {
632
0
        res[0] = unicode_decomp_table2[idx];
633
0
        return 1;
634
0
    } else {
635
0
        d = unicode_decomp_data + unicode_decomp_table2[idx];
636
0
        switch(type) {
637
0
        case DECOMP_TYPE_L1:
638
0
        case DECOMP_TYPE_L2:
639
0
        case DECOMP_TYPE_L3:
640
0
        case DECOMP_TYPE_L4:
641
0
        case DECOMP_TYPE_L5:
642
0
        case DECOMP_TYPE_L6:
643
0
        case DECOMP_TYPE_L7:
644
0
            l = type - DECOMP_TYPE_L1 + 1;
645
0
            d += (c - code) * l * 2;
646
0
            for(i = 0; i < l; i++) {
647
0
                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
648
0
                    return 0;
649
0
            }
650
0
            return l;
651
0
        case DECOMP_TYPE_LL1:
652
0
        case DECOMP_TYPE_LL2:
653
0
            {
654
0
                uint32_t k, p;
655
0
                l = type - DECOMP_TYPE_LL1 + 1;
656
0
                k = (c - code) * l;
657
0
                p = len * l * 2;
658
0
                for(i = 0; i < l; i++) {
659
0
                    c1 = unicode_get16(d + 2 * k) |
660
0
                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
661
0
                    if (!c1)
662
0
                        return 0;
663
0
                    res[i] = c1;
664
0
                    k++;
665
0
                }
666
0
            }
667
0
            return l;
668
0
        case DECOMP_TYPE_S1:
669
0
        case DECOMP_TYPE_S2:
670
0
        case DECOMP_TYPE_S3:
671
0
        case DECOMP_TYPE_S4:
672
0
        case DECOMP_TYPE_S5:
673
0
            l = type - DECOMP_TYPE_S1 + 1;
674
0
            d += (c - code) * l;
675
0
            for(i = 0; i < l; i++) {
676
0
                if ((res[i] = unicode_get_short_code(d[i])) == 0)
677
0
                    return 0;
678
0
            }
679
0
            return l;
680
0
        case DECOMP_TYPE_I1:
681
0
            l = 1;
682
0
            p = 0;
683
0
            goto decomp_type_i;
684
0
        case DECOMP_TYPE_I2_0:
685
0
        case DECOMP_TYPE_I2_1:
686
0
        case DECOMP_TYPE_I3_1:
687
0
        case DECOMP_TYPE_I3_2:
688
0
        case DECOMP_TYPE_I4_1:
689
0
        case DECOMP_TYPE_I4_2:
690
0
            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
691
0
            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
692
0
        decomp_type_i:
693
0
            for(i = 0; i < l; i++) {
694
0
                c1 = unicode_get16(d + 2 * i);
695
0
                if (i == p)
696
0
                    c1 += c - code;
697
0
                res[i] = c1;
698
0
            }
699
0
            return l;
700
0
        case DECOMP_TYPE_B18:
701
0
            l = 18;
702
0
            goto decomp_type_b;
703
0
        case DECOMP_TYPE_B1:
704
0
        case DECOMP_TYPE_B2:
705
0
        case DECOMP_TYPE_B3:
706
0
        case DECOMP_TYPE_B4:
707
0
        case DECOMP_TYPE_B5:
708
0
        case DECOMP_TYPE_B6:
709
0
        case DECOMP_TYPE_B7:
710
0
        case DECOMP_TYPE_B8:
711
0
            l = type - DECOMP_TYPE_B1 + 1;
712
0
        decomp_type_b:
713
0
            {
714
0
                uint32_t c_min;
715
0
                c_min = unicode_get16(d);
716
0
                d += 2 + (c - code) * l;
717
0
                for(i = 0; i < l; i++) {
718
0
                    c1 = d[i];
719
0
                    if (c1 == 0xff)
720
0
                        c1 = 0x20;
721
0
                    else
722
0
                        c1 += c_min;
723
0
                    res[i] = c1;
724
0
                }
725
0
            }
726
0
            return l;
727
0
        case DECOMP_TYPE_LS2:
728
0
            d += (c - code) * 3;
729
0
            if (!(res[0] = unicode_get16(d)))
730
0
                return 0;
731
0
            res[1] = unicode_get_short_code(d[2]);
732
0
            return 2;
733
0
        case DECOMP_TYPE_PAT3:
734
0
            res[0] = unicode_get16(d);
735
0
            res[2] = unicode_get16(d + 2);
736
0
            d += 4 + (c - code) * 2;
737
0
            res[1] = unicode_get16(d);
738
0
            return 3;
739
0
        case DECOMP_TYPE_S2_UL:
740
0
        case DECOMP_TYPE_LS2_UL:
741
0
            c1 = c - code;
742
0
            if (type == DECOMP_TYPE_S2_UL) {
743
0
                d += c1 & ~1;
744
0
                c = unicode_get_short_code(*d);
745
0
                d++;
746
0
            } else {
747
0
                d += (c1 >> 1) * 3;
748
0
                c = unicode_get16(d);
749
0
                d += 2;
750
0
            }
751
0
            if (c1 & 1)
752
0
                c = unicode_get_lower_simple(c);
753
0
            res[0] = c;
754
0
            res[1] = unicode_get_short_code(*d);
755
0
            return 2;
756
0
        }
757
0
    }
758
0
    return 0;
759
0
}
760
761
762
/* return the length of the decomposition (length <=
763
   UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
764
static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
765
0
{
766
0
    uint32_t v, type, is_compat, code, len;
767
0
    int idx_min, idx_max, idx;
768
769
0
    idx_min = 0;
770
0
    idx_max = countof(unicode_decomp_table1) - 1;
771
0
    while (idx_min <= idx_max) {
772
0
        idx = (idx_max + idx_min) / 2;
773
0
        v = unicode_decomp_table1[idx];
774
0
        code = v >> (32 - 18);
775
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
776
        //        printf("idx=%d code=%05x len=%d\n", idx, code, len);
777
0
        if (c < code) {
778
0
            idx_max = idx - 1;
779
0
        } else if (c >= code + len) {
780
0
            idx_min = idx + 1;
781
0
        } else {
782
0
            is_compat = v & 1;
783
0
            if (is_compat1 < is_compat)
784
0
                break;
785
0
            type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
786
0
            return unicode_decomp_entry(res, c, idx, code, len, type);
787
0
        }
788
0
    }
789
0
    return 0;
790
0
}
791
792
/* return 0 if no pair found */
793
static int unicode_compose_pair(uint32_t c0, uint32_t c1)
794
0
{
795
0
    uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
796
0
    int idx_min, idx_max, idx, d;
797
0
    uint32_t pair[2];
798
799
0
    idx_min = 0;
800
0
    idx_max = countof(unicode_comp_table) - 1;
801
0
    while (idx_min <= idx_max) {
802
0
        idx = (idx_max + idx_min) / 2;
803
0
        idx1 = unicode_comp_table[idx];
804
805
        /* idx1 represent an entry of the decomposition table */
806
0
        d_idx = idx1 >> 6;
807
0
        d_offset = idx1 & 0x3f;
808
0
        v = unicode_decomp_table1[d_idx];
809
0
        code = v >> (32 - 18);
810
0
        len = (v >> (32 - 18 - 7)) & 0x7f;
811
0
        type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
812
0
        ch = code + d_offset;
813
0
        unicode_decomp_entry(pair, ch, d_idx, code, len, type);
814
0
        d = c0 - pair[0];
815
0
        if (d == 0)
816
0
            d = c1 - pair[1];
817
0
        if (d < 0) {
818
0
            idx_max = idx - 1;
819
0
        } else if (d > 0) {
820
0
            idx_min = idx + 1;
821
0
        } else {
822
0
            return ch;
823
0
        }
824
0
    }
825
0
    return 0;
826
0
}
827
828
/* return the combining class of character c (between 0 and 255) */
829
static int unicode_get_cc(uint32_t c)
830
0
{
831
0
    uint32_t code, n, type, cc, c1, b;
832
0
    int pos;
833
0
    const uint8_t *p;
834
835
0
    pos = get_index_pos(&code, c,
836
0
                        unicode_cc_index, sizeof(unicode_cc_index) / 3);
837
0
    if (pos < 0)
838
0
        return 0;
839
0
    p = unicode_cc_table + pos;
840
    /* Compressed run length encoding:
841
       - 2 high order bits are combining class type
842
       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
843
       - 00..2F: range length (add 1)
844
       - 30..37: 3-bit range-length + 1 extra byte
845
       - 38..3F: 3-bit range-length + 2 extra byte
846
     */
847
0
    for(;;) {
848
0
        b = *p++;
849
0
        type = b >> 6;
850
0
        n = b & 0x3f;
851
0
        if (n < 48) {
852
0
        } else if (n < 56) {
853
0
            n = (n - 48) << 8;
854
0
            n |= *p++;
855
0
            n += 48;
856
0
        } else {
857
0
            n = (n - 56) << 8;
858
0
            n |= *p++ << 8;
859
0
            n |= *p++;
860
0
            n += 48 + (1 << 11);
861
0
        }
862
0
        if (type <= 1)
863
0
            p++;
864
0
        c1 = code + n + 1;
865
0
        if (c < c1) {
866
0
            switch(type) {
867
0
            case 0:
868
0
                cc = p[-1];
869
0
                break;
870
0
            case 1:
871
0
                cc = p[-1] + c - code;
872
0
                break;
873
0
            case 2:
874
0
                cc = 0;
875
0
                break;
876
0
            default:
877
0
            case 3:
878
0
                cc = 230;
879
0
                break;
880
0
            }
881
0
            return cc;
882
0
        }
883
0
        code = c1;
884
0
    }
885
0
}
886
887
/* Reorder 'buf' in place: every sequence of characters with a non
   zero combining class is sorted by increasing combining class with a
   stable insertion sort. */
static void sort_cc(int *buf, int len)
{
    int i, j, k, cc1, start, ch1;

    for(i = 0; i < len; i++) {
        if (unicode_get_cc(buf[i]) == 0)
            continue;
        /* buf[start] is the first character of the sequence */
        start = i;
        for(j = i + 1; j < len; j++) {
            ch1 = buf[j];
            cc1 = unicode_get_cc(ch1);
            if (cc1 == 0)
                break;
            /* move the characters of higher class one slot up */
            for(k = j - 1; k >= start; k--) {
                if (unicode_get_cc(buf[k]) <= cc1)
                    break;
                buf[k + 1] = buf[k];
            }
            buf[k + 1] = ch1;
        }
        /* resume after the character that ended the sequence */
        i = j;
    }
}
922
923
/* Append to 'dbuf' the recursive decomposition of the 'src_len' code
   points of 'src'. Hangul syllables are decomposed arithmetically,
   the other code points with unicode_decomp_char(). */
static void to_nfd_rec(DynBuf *dbuf,
                       const int *src, int src_len, int is_compat)
{
    uint32_t decomp[UNICODE_DECOMP_LEN_MAX];
    uint32_t ch, s_index, t_index;
    int pos, n;

    for(pos = 0; pos < src_len; pos++) {
        ch = src[pos];
        if (ch >= 0xac00 && ch < 0xd7a4) {
            /* Hangul decomposition */
            s_index = ch - 0xac00;
            dbuf_put_u32(dbuf, 0x1100 + s_index / 588);
            dbuf_put_u32(dbuf, 0x1161 + (s_index % 588) / 28);
            t_index = s_index % 28;
            if (t_index != 0)
                dbuf_put_u32(dbuf, 0x11a7 + t_index);
            continue;
        }
        n = unicode_decomp_char(decomp, ch, is_compat);
        if (n == 0) {
            /* no decomposition: output the code point itself */
            dbuf_put_u32(dbuf, ch);
        } else {
            to_nfd_rec(dbuf, (int *)decomp, n, is_compat);
        }
    }
}
950
951
/* Return the composition of the pair (c0, c1) or 0 if not found.
   Hangul syllables are composed arithmetically, the other pairs are
   searched with unicode_compose_pair(). */
static int compose_pair(uint32_t c0, uint32_t c1)
{
    /* Hangul composition */
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
        /* leading consonant + vowel */
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
               (c0 - 0xac00) % 28 == 0 &&
               c1 > 0x11a7 && c1 < 0x11a7 + 28) {
        /* syllable without trailing consonant + trailing consonant.
           0x11a7 itself is excluded: it is the trailing index 0 (no
           trailing consonant), so accepting it would return c0 and
           silently drop c1 */
        return c0 + c1 - 0x11a7;
    } else {
        return unicode_compose_pair(c0, c1);
    }
}
966
967
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
968
                      UnicodeNormalizationEnum n_type,
969
                      void *opaque, DynBufReallocFunc *realloc_func)
970
0
{
971
0
    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
972
0
    BOOL is_compat;
973
0
    DynBuf dbuf_s, *dbuf = &dbuf_s;
974
975
0
    is_compat = n_type >> 1;
976
977
0
    dbuf_init2(dbuf, opaque, realloc_func);
978
0
    if (dbuf_realloc(dbuf, sizeof(int) * src_len))
979
0
        goto fail;
980
981
    /* common case: latin1 is unaffected by NFC */
982
0
    if (n_type == UNICODE_NFC) {
983
0
        for(i = 0; i < src_len; i++) {
984
0
            if (src[i] >= 0x100)
985
0
                goto not_latin1;
986
0
        }
987
0
        buf = (int *)dbuf->buf;
988
0
        memcpy(buf, src, src_len * sizeof(int));
989
0
        *pdst = (uint32_t *)buf;
990
0
        return src_len;
991
0
    not_latin1: ;
992
0
    }
993
994
0
    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
995
0
    if (dbuf_error(dbuf)) {
996
0
    fail:
997
0
        *pdst = NULL;
998
0
        return -1;
999
0
    }
1000
0
    buf = (int *)dbuf->buf;
1001
0
    buf_len = dbuf->size / sizeof(int);
1002
1003
0
    sort_cc(buf, buf_len);
1004
1005
0
    if (buf_len <= 1 || (n_type & 1) != 0) {
1006
        /* NFD / NFKD */
1007
0
        *pdst = (uint32_t *)buf;
1008
0
        return buf_len;
1009
0
    }
1010
1011
0
    i = 1;
1012
0
    out_len = 1;
1013
0
    while (i < buf_len) {
1014
        /* find the starter character and test if it is blocked from
1015
           the character at 'i' */
1016
0
        last_cc = unicode_get_cc(buf[i]);
1017
0
        starter_pos = out_len - 1;
1018
0
        while (starter_pos >= 0) {
1019
0
            cc = unicode_get_cc(buf[starter_pos]);
1020
0
            if (cc == 0)
1021
0
                break;
1022
0
            if (cc >= last_cc)
1023
0
                goto next;
1024
0
            last_cc = 256;
1025
0
            starter_pos--;
1026
0
        }
1027
0
        if (starter_pos >= 0 &&
1028
0
            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
1029
0
            buf[starter_pos] = p;
1030
0
            i++;
1031
0
        } else {
1032
0
        next:
1033
0
            buf[out_len++] = buf[i++];
1034
0
        }
1035
0
    }
1036
0
    *pdst = (uint32_t *)buf;
1037
0
    return out_len;
1038
0
}
1039
1040
/* char ranges for various unicode properties */
1041
1042
/* Look up 'name' in 'name_table'. The table is a sequence of zero
   terminated entries ended by an empty entry; each entry holds one or
   more comma separated aliases. Return the index of the entry
   containing 'name' or -1 if not found. */
static int unicode_find_name(const char *name_table, const char *name)
{
    const size_t wanted_len = strlen(name);
    const char *entry = name_table;
    int index;

    for (index = 0; *entry != '\0'; index++) {
        const char *comma;

        /* try every alias of the current entry */
        do {
            size_t alias_len;

            comma = strchr(entry, ',');
            alias_len = comma ? (size_t)(comma - entry) : strlen(entry);
            if (alias_len == wanted_len &&
                memcmp(entry, name, wanted_len) == 0)
                return index;
            /* skip the alias and its ',' or '\0' delimiter */
            entry += alias_len + 1;
        } while (comma != NULL);
    }
    return -1;
}
1068
1069
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1070
   if not found */
1071
int unicode_script(CharRange *cr,
1072
                   const char *script_name, BOOL is_ext)
1073
0
{
1074
0
    int script_idx;
1075
0
    const uint8_t *p, *p_end;
1076
0
    uint32_t c, c1, b, n, v, v_len, i, type;
1077
0
    CharRange cr1_s, *cr1;
1078
0
    CharRange cr2_s, *cr2 = &cr2_s;
1079
0
    BOOL is_common;
1080
1081
0
    script_idx = unicode_find_name(unicode_script_name_table, script_name);
1082
0
    if (script_idx < 0)
1083
0
        return -2;
1084
    /* Note: we remove the "Unknown" Script */
1085
0
    script_idx += UNICODE_SCRIPT_Unknown + 1;
1086
1087
0
    is_common = (script_idx == UNICODE_SCRIPT_Common ||
1088
0
                 script_idx == UNICODE_SCRIPT_Inherited);
1089
0
    if (is_ext) {
1090
0
        cr1 = &cr1_s;
1091
0
        cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1092
0
        cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1093
0
    } else {
1094
0
        cr1 = cr;
1095
0
    }
1096
1097
0
    p = unicode_script_table;
1098
0
    p_end = unicode_script_table + countof(unicode_script_table);
1099
0
    c = 0;
1100
0
    while (p < p_end) {
1101
0
        b = *p++;
1102
0
        type = b >> 7;
1103
0
        n = b & 0x7f;
1104
0
        if (n < 96) {
1105
0
        } else if (n < 112) {
1106
0
            n = (n - 96) << 8;
1107
0
            n |= *p++;
1108
0
            n += 96;
1109
0
        } else {
1110
0
            n = (n - 112) << 16;
1111
0
            n |= *p++ << 8;
1112
0
            n |= *p++;
1113
0
            n += 96 + (1 << 12);
1114
0
        }
1115
0
        if (type == 0)
1116
0
            v = 0;
1117
0
        else
1118
0
            v = *p++;
1119
0
        c1 = c + n + 1;
1120
0
        if (v == script_idx) {
1121
0
            if (cr_add_interval(cr1, c, c1))
1122
0
                goto fail;
1123
0
        }
1124
0
        c = c1;
1125
0
    }
1126
1127
0
    if (is_ext) {
1128
        /* add the script extensions */
1129
0
        p = unicode_script_ext_table;
1130
0
        p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1131
0
        c = 0;
1132
0
        while (p < p_end) {
1133
0
            b = *p++;
1134
0
            if (b < 128) {
1135
0
                n = b;
1136
0
            } else if (b < 128 + 64) {
1137
0
                n = (b - 128) << 8;
1138
0
                n |= *p++;
1139
0
                n += 128;
1140
0
            } else {
1141
0
                n = (b - 128 - 64) << 16;
1142
0
                n |= *p++ << 8;
1143
0
                n |= *p++;
1144
0
                n += 128 + (1 << 14);
1145
0
            }
1146
0
            c1 = c + n + 1;
1147
0
            v_len = *p++;
1148
0
            if (is_common) {
1149
0
                if (v_len != 0) {
1150
0
                    if (cr_add_interval(cr2, c, c1))
1151
0
                        goto fail;
1152
0
                }
1153
0
            } else {
1154
0
                for(i = 0; i < v_len; i++) {
1155
0
                    if (p[i] == script_idx) {
1156
0
                        if (cr_add_interval(cr2, c, c1))
1157
0
                            goto fail;
1158
0
                        break;
1159
0
                    }
1160
0
                }
1161
0
            }
1162
0
            p += v_len;
1163
0
            c = c1;
1164
0
        }
1165
0
        if (is_common) {
1166
            /* remove all the characters with script extensions */
1167
0
            if (cr_invert(cr2))
1168
0
                goto fail;
1169
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1170
0
                      CR_OP_INTER))
1171
0
                goto fail;
1172
0
        } else {
1173
0
            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1174
0
                      CR_OP_UNION))
1175
0
                goto fail;
1176
0
        }
1177
0
        cr_free(cr1);
1178
0
        cr_free(cr2);
1179
0
    }
1180
0
    return 0;
1181
0
 fail:
1182
0
    if (is_ext) {
1183
0
        cr_free(cr1);
1184
0
        cr_free(cr2);
1185
0
    }
1186
0
    goto fail;
1187
0
}
1188
1189
0
/* bit mask with only the bit number UNICODE_GC_<id> set; used to
   build general category masks */
#define M(id) (1U << UNICODE_GC_ ## id)
1190
1191
/* Add to 'cr' every range of unicode_gc_table whose general category
   bit is set in 'gc_mask'. Return 0 if OK, -1 if cr_add_interval()
   failed. */
static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
{
    const uint8_t *rd = unicode_gc_table;
    const uint8_t *const rd_end = unicode_gc_table + countof(unicode_gc_table);
    const uint32_t lu_ll = M(Lu) | M(Ll);
    uint32_t next = 0;

    /* Compressed range encoding:
       initial byte:
       bits 0..4: category number (special case 31)
       bits 5..7: range length (add 1)
       special case bits 5..7 == 7: read an extra byte
       - 00..7F: range length (add 7 + 1)
       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
     */
    while (rd < rd_end) {
        uint32_t first = next;
        uint32_t head = *rd++;
        uint32_t cat = head & 0x1f;
        uint32_t run = head >> 5;
        int whole_range;

        if (run == 7) {
            run = *rd++;
            if (run < 128) {
                run += 7;
            } else if (run < 128 + 64) {
                run = (run - 128) << 8;
                run |= *rd++;
                run += 7 + 128;
            } else {
                run = (run - 128 - 64) << 16;
                run |= *rd++ << 8;
                run |= *rd++;
                run += 7 + 128 + (1 << 14);
            }
        }
        next = first + run + 1;

        if (cat != 31) {
            whole_range = (gc_mask >> cat) & 1;
        } else {
            /* run of Lu / Ll: both requested -> the whole range,
               only one requested -> every other code point, starting
               one position later when Ll is the requested one */
            uint32_t sel = gc_mask & lu_ll;

            whole_range = (sel == lu_ll);
            if (sel != 0 && !whole_range) {
                uint32_t cp = first + ((gc_mask & M(Ll)) != 0);

                for (; cp < next; cp += 2) {
                    if (cr_add_interval(cr, cp, cp + 1))
                        return -1;
                }
            }
        }
        if (whole_range && cr_add_interval(cr, first, next))
            return -1;
    }
    return 0;
}
1251
1252
/* Add to 'cr' the ranges described by the compressed table
   unicode_prop_table[prop_idx]. Return 0 if OK, -1 if
   cr_add_interval() failed. */
static int unicode_prop1(CharRange *cr, int prop_idx)
{
    const uint8_t *rd = unicode_prop_table[prop_idx];
    const uint8_t *const rd_end = rd + unicode_prop_len_table[prop_idx];
    uint32_t pos = 0;
    uint32_t in_set = 0;

    /* Compressed range encoding:
       00..3F: 2 packed lengths: 3-bit + 3-bit
       40..5F: 5-bits plus extra byte for length
       60..7F: 5-bits plus 2 extra bytes for length
       80..FF: 7-bit length
       lengths must be incremented to get character count
       Ranges alternate between false and true return value.
     */
    while (rd < rd_end) {
        uint32_t run_start = pos;
        uint32_t code = *rd++;

        if (code >= 0x80) {
            pos += code - 0x80 + 1;
        } else if (code >= 0x60) {
            pos += (((code - 0x60) << 16) | (rd[0] << 8) | rd[1]) + 1;
            rd += 2;
        } else if (code >= 0x40) {
            pos += (((code - 0x40) << 8) | rd[0]) + 1;
            rd += 1;
        } else {
            /* two packed runs: handle the first one here, the second
               one is finished by the common code below */
            pos += (code >> 3) + 1;
            if (in_set && cr_add_interval(cr, run_start, pos))
                return -1;
            in_set ^= 1;
            run_start = pos;
            pos += (code & 7) + 1;
        }
        if (in_set && cr_add_interval(cr, run_start, pos))
            return -1;
        in_set ^= 1;
    }
    return 0;
}
1298
1299
0
/* bit flags combined in the 'case_mask' argument of unicode_case1():
   CASE_U: uppercasing, CASE_L: lowercasing, CASE_F: case folding */
#define CASE_U (1 << 0)
1300
0
#define CASE_L (1 << 1)
1301
0
#define CASE_F (1 << 2)
1302
1303
/* use the case conversion table to generate range of characters.
1304
   CASE_U: set char if modified by uppercasing,
1305
   CASE_L: set char if modified by lowercasing,
1306
   CASE_F: set char if modified by case folding,
1307
 */
1308
static int unicode_case1(CharRange *cr, int case_mask)
1309
0
{
1310
0
#define MR(x) (1 << RUN_TYPE_ ## x)
1311
0
    const uint32_t tab_run_mask[3] = {
1312
0
        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
1313
0
        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
1314
1315
0
        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
1316
1317
0
        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
1318
0
    };
1319
0
#undef MR
1320
0
    uint32_t mask, v, code, type, len, i, idx;
1321
1322
0
    if (case_mask == 0)
1323
0
        return 0;
1324
0
    mask = 0;
1325
0
    for(i = 0; i < 3; i++) {
1326
0
        if ((case_mask >> i) & 1)
1327
0
            mask |= tab_run_mask[i];
1328
0
    }
1329
0
    for(idx = 0; idx < countof(case_conv_table1); idx++) {
1330
0
        v = case_conv_table1[idx];
1331
0
        type = (v >> (32 - 17 - 7 - 4)) & 0xf;
1332
0
        code = v >> (32 - 17);
1333
0
        len = (v >> (32 - 17 - 7)) & 0x7f;
1334
0
        if ((mask >> type) & 1) {
1335
            //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
1336
0
            switch(type) {
1337
0
            case RUN_TYPE_UL:
1338
0
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
1339
0
                    goto def_case;
1340
0
                code += ((case_mask & CASE_U) != 0);
1341
0
                for(i = 0; i < len; i += 2) {
1342
0
                    if (cr_add_interval(cr, code + i, code + i + 1))
1343
0
                        return -1;
1344
0
                }
1345
0
                break;
1346
0
            case RUN_TYPE_LSU:
1347
0
                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
1348
0
                    goto def_case;
1349
0
                if (!(case_mask & CASE_U)) {
1350
0
                    if (cr_add_interval(cr, code, code + 1))
1351
0
                        return -1;
1352
0
                }
1353
0
                if (cr_add_interval(cr, code + 1, code + 2))
1354
0
                    return -1;
1355
0
                if (case_mask & CASE_U) {
1356
0
                    if (cr_add_interval(cr, code + 2, code + 3))
1357
0
                        return -1;
1358
0
                }
1359
0
                break;
1360
0
            default:
1361
0
            def_case:
1362
0
                if (cr_add_interval(cr, code, code + len))
1363
0
                    return -1;
1364
0
                break;
1365
0
            }
1366
0
        }
1367
0
    }
1368
0
    return 0;
1369
0
}
1370
1371
/* Comparison callback given to rqsort(): order two elements by the
   uint32_t value they start with. Return -1, 0 or 1. 'arg' is not
   used. */
static int point_cmp(const void *p1, const void *p2, void *arg)
{
    const uint32_t lhs = *(const uint32_t *)p1;
    const uint32_t rhs = *(const uint32_t *)p2;

    if (lhs < rhs)
        return -1;
    if (lhs > rhs)
        return 1;
    return 0;
}
1377
1378
/* Sort the intervals of 'cr' by their start and merge in place the
   ones that overlap or touch. 'cr->len' is updated to the new number
   of points. */
static void cr_sort_and_remove_overlap(CharRange *cr)
{
    uint32_t rd, wr, lo, hi;

    /* the ranges are not necessarily sorted and may overlap: sort the
       (start, end) pairs on their start */
    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);

    wr = 0;
    rd = 0;
    while (rd < cr->len) {
        lo = cr->points[rd];
        hi = cr->points[rd + 1];
        /* absorb the following intervals starting at or before 'hi',
           extending 'hi' when one of them ends later */
        for (rd += 2; rd < cr->len && cr->points[rd] <= hi; rd += 2) {
            if (cr->points[rd + 1] > hi)
                hi = cr->points[rd + 1];
        }
        cr->points[wr] = lo;
        cr->points[wr + 1] = hi;
        wr += 2;
    }
    cr->len = wr;
}
1413
1414
/* canonicalize a character set using the JS regex case folding rules
1415
   (see lre_canonicalize()) */
1416
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
1417
0
{
1418
0
    CharRange cr_inter, cr_mask, cr_result, cr_sub;
1419
0
    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
1420
1421
0
    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
1422
0
    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
1423
0
    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
1424
0
    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
1425
1426
0
    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
1427
0
        goto fail;
1428
0
    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
1429
0
        goto fail;
1430
1431
0
    if (cr_invert(&cr_mask))
1432
0
        goto fail;
1433
0
    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
1434
0
        goto fail;
1435
1436
    /* cr_inter = cr & cr_mask */
1437
    /* cr_sub = cr & ~cr_mask */
1438
1439
    /* use the case conversion table to compute the result */
1440
0
    d_start = -1;
1441
0
    d_end = -1;
1442
0
    idx = 0;
1443
0
    v = case_conv_table1[idx];
1444
0
    code = v >> (32 - 17);
1445
0
    len = (v >> (32 - 17 - 7)) & 0x7f;
1446
0
    for(i = 0; i < cr_inter.len; i += 2) {
1447
0
        start = cr_inter.points[i];
1448
0
        end = cr_inter.points[i + 1];
1449
1450
0
        for(c = start; c < end; c++) {
1451
0
            for(;;) {
1452
0
                if (c >= code && c < code + len)
1453
0
                    break;
1454
0
                idx++;
1455
0
                assert(idx < countof(case_conv_table1));
1456
0
                v = case_conv_table1[idx];
1457
0
                code = v >> (32 - 17);
1458
0
                len = (v >> (32 - 17 - 7)) & 0x7f;
1459
0
            }
1460
0
            d = lre_case_folding_entry(c, idx, v, is_unicode);
1461
            /* try to merge with the current interval */
1462
0
            if (d_start == -1) {
1463
0
                d_start = d;
1464
0
                d_end = d + 1;
1465
0
            } else if (d_end == d) {
1466
0
                d_end++;
1467
0
            } else {
1468
0
                cr_add_interval(&cr_result, d_start, d_end);
1469
0
                d_start = d;
1470
0
                d_end = d + 1;
1471
0
            }
1472
0
        }
1473
0
    }
1474
0
    if (d_start != -1) {
1475
0
        if (cr_add_interval(&cr_result, d_start, d_end))
1476
0
            goto fail;
1477
0
    }
1478
1479
    /* the resulting ranges are not necessarily sorted and may overlap */
1480
0
    cr_sort_and_remove_overlap(&cr_result);
1481
1482
    /* or with the character not affected by the case folding */
1483
0
    cr->len = 0;
1484
0
    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
1485
0
        goto fail;
1486
1487
0
    cr_free(&cr_inter);
1488
0
    cr_free(&cr_mask);
1489
0
    cr_free(&cr_result);
1490
0
    cr_free(&cr_sub);
1491
0
    return 0;
1492
0
 fail:
1493
0
    cr_free(&cr_inter);
1494
0
    cr_free(&cr_mask);
1495
0
    cr_free(&cr_result);
1496
0
    cr_free(&cr_sub);
1497
0
    return -1;
1498
0
}
1499
1500
/* opcodes of the stack program given as variable arguments to
   unicode_prop_ops() */
typedef enum {
1501
    POP_GC,     /* push the set of a general category mask (1 argument) */
1502
    POP_PROP,   /* push the set of a property table index (1 argument) */
1503
    POP_CASE,   /* push the set of a CASE_x mask (1 argument) */
1504
    POP_UNION,  /* replace the 2 top sets by their cr_op() combination; */
1505
    POP_INTER,  /* the operation is (op - POP_UNION + CR_OP_UNION) */
1506
    POP_XOR,
1507
    POP_INVERT, /* invert the top set */
1508
    POP_END,    /* end of the program */
1509
} PropOPEnum;
1510
1511
/* size of the set stack of unicode_prop_ops() */
#define POP_STACK_LEN_MAX 4
1512
1513
/* Evaluate a small stack program and copy its result to 'cr'. The
   variable arguments are a sequence of PropOPEnum opcodes (passed as
   int) ended by POP_END; POP_GC, POP_PROP and POP_CASE are followed
   by one int argument. At most POP_STACK_LEN_MAX sets are live at a
   time and exactly one must remain at POP_END (checked with
   assert()). Return the cr_copy() result if the program completed,
   -1 in case of error. */
static int unicode_prop_ops(CharRange *cr, ...)
{
    va_list ap;
    CharRange stack[POP_STACK_LEN_MAX];
    int stack_len, op, ret, i;
    uint32_t a;

    va_start(ap, cr);
    stack_len = 0;
    for(;;) {
        op = va_arg(ap, int);
        switch(op) {
        case POP_GC:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            /* the new set is counted in stack_len before being filled
               so that the 'fail' path releases it */
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_general_category1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_PROP:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_prop1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_CASE:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_case1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_UNION:
        case POP_INTER:
        case POP_XOR:
            {
                CharRange *cr1, *cr2, *cr3;
                assert(stack_len >= 2);
                /* one free slot is needed for the temporary result */
                assert(stack_len < POP_STACK_LEN_MAX);
                cr1 = &stack[stack_len - 2];
                cr2 = &stack[stack_len - 1];
                cr3 = &stack[stack_len++];
                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
                if (cr_op(cr3, cr1->points, cr1->len,
                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
                    goto fail;
                cr_free(cr1);
                cr_free(cr2);
                /* the result takes the place of the first operand */
                *cr1 = *cr3;
                stack_len -= 2;
            }
            break;
        case POP_INVERT:
            assert(stack_len >= 1);
            if (cr_invert(&stack[stack_len - 1]))
                goto fail;
            break;
        case POP_END:
            goto done;
        default:
            abort();
        }
    }
 done:
    /* every va_start() must be matched by a va_end() in the same
       function */
    va_end(ap);
    assert(stack_len == 1);
    ret = cr_copy(cr, &stack[0]);
    cr_free(&stack[0]);
    return ret;
 fail:
    va_end(ap);
    for(i = 0; i < stack_len; i++)
        cr_free(&stack[i]);
    return -1;
}
1587
1588
/* general category masks of the grouped categories; indexed with
   (name index - UNICODE_GC_LC) in unicode_general_category() */
static const uint32_t unicode_gc_mask_table[] = {
1589
    M(Lu) | M(Ll) | M(Lt), /* LC */
1590
    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
1591
    M(Mn) | M(Mc) | M(Me), /* M */
1592
    M(Nd) | M(Nl) | M(No), /* N */
1593
    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
1594
    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
1595
    M(Zs) | M(Zl) | M(Zp), /* Z */
1596
    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
1597
};
1598
1599
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1600
   if not found */
1601
int unicode_general_category(CharRange *cr, const char *gc_name)
1602
0
{
1603
0
    int gc_idx;
1604
0
    uint32_t gc_mask;
1605
1606
0
    gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1607
0
    if (gc_idx < 0)
1608
0
        return -2;
1609
0
    if (gc_idx <= UNICODE_GC_Co) {
1610
0
        gc_mask = (uint64_t)1 << gc_idx;
1611
0
    } else {
1612
0
        gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1613
0
    }
1614
0
    return unicode_general_category1(cr, gc_mask);
1615
0
}
1616
1617
1618
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1619
   if not found */
1620
int unicode_prop(CharRange *cr, const char *prop_name)
1621
0
{
1622
0
    int prop_idx, ret;
1623
1624
0
    prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1625
0
    if (prop_idx < 0)
1626
0
        return -2;
1627
0
    prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1628
1629
0
    ret = 0;
1630
0
    switch(prop_idx) {
1631
0
    case UNICODE_PROP_ASCII:
1632
0
        if (cr_add_interval(cr, 0x00, 0x7f + 1))
1633
0
            return -1;
1634
0
        break;
1635
0
    case UNICODE_PROP_Any:
1636
0
        if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1637
0
            return -1;
1638
0
        break;
1639
0
    case UNICODE_PROP_Assigned:
1640
0
        ret = unicode_prop_ops(cr,
1641
0
                               POP_GC, M(Cn),
1642
0
                               POP_INVERT,
1643
0
                               POP_END);
1644
0
        break;
1645
0
    case UNICODE_PROP_Math:
1646
0
        ret = unicode_prop_ops(cr,
1647
0
                               POP_GC, M(Sm),
1648
0
                               POP_PROP, UNICODE_PROP_Other_Math,
1649
0
                               POP_UNION,
1650
0
                               POP_END);
1651
0
        break;
1652
0
    case UNICODE_PROP_Lowercase:
1653
0
        ret = unicode_prop_ops(cr,
1654
0
                               POP_GC, M(Ll),
1655
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1656
0
                               POP_UNION,
1657
0
                               POP_END);
1658
0
        break;
1659
0
    case UNICODE_PROP_Uppercase:
1660
0
        ret = unicode_prop_ops(cr,
1661
0
                               POP_GC, M(Lu),
1662
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1663
0
                               POP_UNION,
1664
0
                               POP_END);
1665
0
        break;
1666
0
    case UNICODE_PROP_Cased:
1667
0
        ret = unicode_prop_ops(cr,
1668
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt),
1669
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1670
0
                               POP_UNION,
1671
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1672
0
                               POP_UNION,
1673
0
                               POP_END);
1674
0
        break;
1675
0
    case UNICODE_PROP_Alphabetic:
1676
0
        ret = unicode_prop_ops(cr,
1677
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1678
0
                               POP_PROP, UNICODE_PROP_Other_Uppercase,
1679
0
                               POP_UNION,
1680
0
                               POP_PROP, UNICODE_PROP_Other_Lowercase,
1681
0
                               POP_UNION,
1682
0
                               POP_PROP, UNICODE_PROP_Other_Alphabetic,
1683
0
                               POP_UNION,
1684
0
                               POP_END);
1685
0
        break;
1686
0
    case UNICODE_PROP_Grapheme_Base:
1687
0
        ret = unicode_prop_ops(cr,
1688
0
                               POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1689
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1690
0
                               POP_UNION,
1691
0
                               POP_INVERT,
1692
0
                               POP_END);
1693
0
        break;
1694
0
    case UNICODE_PROP_Grapheme_Extend:
1695
0
        ret = unicode_prop_ops(cr,
1696
0
                               POP_GC, M(Me) | M(Mn),
1697
0
                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1698
0
                               POP_UNION,
1699
0
                               POP_END);
1700
0
        break;
1701
0
    case UNICODE_PROP_XID_Start:
1702
0
        ret = unicode_prop_ops(cr,
1703
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1704
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1705
0
                               POP_UNION,
1706
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1707
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1708
0
                               POP_UNION,
1709
0
                               POP_PROP, UNICODE_PROP_XID_Start1,
1710
0
                               POP_UNION,
1711
0
                               POP_INVERT,
1712
0
                               POP_INTER,
1713
0
                               POP_END);
1714
0
        break;
1715
0
    case UNICODE_PROP_XID_Continue:
1716
0
        ret = unicode_prop_ops(cr,
1717
0
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1718
0
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1719
0
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1720
0
                               POP_UNION,
1721
0
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1722
0
                               POP_UNION,
1723
0
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1724
0
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1725
0
                               POP_UNION,
1726
0
                               POP_PROP, UNICODE_PROP_XID_Continue1,
1727
0
                               POP_UNION,
1728
0
                               POP_INVERT,
1729
0
                               POP_INTER,
1730
0
                               POP_END);
1731
0
        break;
1732
0
    case UNICODE_PROP_Changes_When_Uppercased:
1733
0
        ret = unicode_case1(cr, CASE_U);
1734
0
        break;
1735
0
    case UNICODE_PROP_Changes_When_Lowercased:
1736
0
        ret = unicode_case1(cr, CASE_L);
1737
0
        break;
1738
0
    case UNICODE_PROP_Changes_When_Casemapped:
1739
0
        ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1740
0
        break;
1741
0
    case UNICODE_PROP_Changes_When_Titlecased:
1742
0
        ret = unicode_prop_ops(cr,
1743
0
                               POP_CASE, CASE_U,
1744
0
                               POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1745
0
                               POP_XOR,
1746
0
                               POP_END);
1747
0
        break;
1748
0
    case UNICODE_PROP_Changes_When_Casefolded:
1749
0
        ret = unicode_prop_ops(cr,
1750
0
                               POP_CASE, CASE_F,
1751
0
                               POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1752
0
                               POP_XOR,
1753
0
                               POP_END);
1754
0
        break;
1755
0
    case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1756
0
        ret = unicode_prop_ops(cr,
1757
0
                               POP_CASE, CASE_F,
1758
0
                               POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1759
0
                               POP_XOR,
1760
0
                               POP_END);
1761
0
        break;
1762
#if 0
1763
    case UNICODE_PROP_ID_Start:
1764
        ret = unicode_prop_ops(cr,
1765
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1766
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1767
                               POP_UNION,
1768
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1769
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1770
                               POP_UNION,
1771
                               POP_INVERT,
1772
                               POP_INTER,
1773
                               POP_END);
1774
        break;
1775
    case UNICODE_PROP_ID_Continue:
1776
        ret = unicode_prop_ops(cr,
1777
                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1778
                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
1779
                               POP_PROP, UNICODE_PROP_Other_ID_Start,
1780
                               POP_UNION,
1781
                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
1782
                               POP_UNION,
1783
                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
1784
                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
1785
                               POP_UNION,
1786
                               POP_INVERT,
1787
                               POP_INTER,
1788
                               POP_END);
1789
        break;
1790
    case UNICODE_PROP_Case_Ignorable:
1791
        ret = unicode_prop_ops(cr,
1792
                               POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1793
                               POP_PROP, UNICODE_PROP_Case_Ignorable1,
1794
                               POP_XOR,
1795
                               POP_END);
1796
        break;
1797
#else
1798
        /* we use the existing tables */
1799
0
    case UNICODE_PROP_ID_Continue:
1800
0
        ret = unicode_prop_ops(cr,
1801
0
                               POP_PROP, UNICODE_PROP_ID_Start,
1802
0
                               POP_PROP, UNICODE_PROP_ID_Continue1,
1803
0
                               POP_XOR,
1804
0
                               POP_END);
1805
0
        break;
1806
0
#endif
1807
0
    default:
1808
0
        if (prop_idx >= countof(unicode_prop_table))
1809
0
            return -2;
1810
0
        ret = unicode_prop1(cr, prop_idx);
1811
0
        break;
1812
0
    }
1813
0
    return ret;
1814
0
}
1815
1816
#endif /* CONFIG_ALL_UNICODE */
1817
1818
/*---- lre codepoint categorizing functions ----*/
1819
1820
#define S  UNICODE_C_SPACE
1821
#define D  UNICODE_C_DIGIT
1822
#define X  UNICODE_C_XDIGIT
1823
#define U  UNICODE_C_UPPER
1824
#define L  UNICODE_C_LOWER
1825
#define _  UNICODE_C_UNDER
1826
#define d  UNICODE_C_DOLLAR
1827
1828
uint8_t const lre_ctype_bits[256] = {
1829
    0, 0, 0, 0, 0, 0, 0, 0,
1830
    0, S, S, S, S, S, 0, 0,
1831
    0, 0, 0, 0, 0, 0, 0, 0,
1832
    0, 0, 0, 0, 0, 0, 0, 0,
1833
1834
    S, 0, 0, 0, d, 0, 0, 0,
1835
    0, 0, 0, 0, 0, 0, 0, 0,
1836
    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1837
    X|D, X|D, 0, 0, 0, 0, 0, 0,
1838
1839
    0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1840
    U, U, U, U, U, U, U, U,
1841
    U, U, U, U, U, U, U, U,
1842
    U, U, U, 0, 0, 0, 0, _,
1843
1844
    0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1845
    L, L, L, L, L, L, L, L,
1846
    L, L, L, L, L, L, L, L,
1847
    L, L, L, 0, 0, 0, 0, 0,
1848
1849
    0, 0, 0, 0, 0, 0, 0, 0,
1850
    0, 0, 0, 0, 0, 0, 0, 0,
1851
    0, 0, 0, 0, 0, 0, 0, 0,
1852
    0, 0, 0, 0, 0, 0, 0, 0,
1853
1854
    S, 0, 0, 0, 0, 0, 0, 0,
1855
    0, 0, 0, 0, 0, 0, 0, 0,
1856
    0, 0, 0, 0, 0, 0, 0, 0,
1857
    0, 0, 0, 0, 0, 0, 0, 0,
1858
1859
    0, 0, 0, 0, 0, 0, 0, 0,
1860
    0, 0, 0, 0, 0, 0, 0, 0,
1861
    0, 0, 0, 0, 0, 0, 0, 0,
1862
    0, 0, 0, 0, 0, 0, 0, 0,
1863
1864
    0, 0, 0, 0, 0, 0, 0, 0,
1865
    0, 0, 0, 0, 0, 0, 0, 0,
1866
    0, 0, 0, 0, 0, 0, 0, 0,
1867
    0, 0, 0, 0, 0, 0, 0, 0,
1868
};
1869
1870
#undef S
1871
#undef D
1872
#undef X
1873
#undef U
1874
#undef L
1875
#undef _
1876
#undef d
1877
1878
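/* code point ranges for the Zs, Zl or Zp property, plus the control
   characters U+0009..U+000D and U+FEFF (Cf). The leading entry is the
   number of ranges; each range is stored as (first, last + 1). */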
1879
static const uint16_t char_range_s[] = {
    /* number of (first, last + 1) pairs that follow */
    10,
    /* each pair is an inclusive lower bound and an exclusive upper bound,
       listed in increasing order */
    0x0009, 0x000E,     /* U+0009..U+000D */
    0x0020, 0x0021,     /* U+0020 */
    0x00A0, 0x00A1,     /* U+00A0 */
    0x1680, 0x1681,     /* U+1680 */
    0x2000, 0x200B,     /* U+2000..U+200A */
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
    0x2028, 0x202A,     /* U+2028..U+2029 */
    0x202F, 0x2030,     /* U+202F */
    0x205F, 0x2060,     /* U+205F */
    0x3000, 0x3001,     /* U+3000 */
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
    0xFEFF, 0xFF00,     /* U+FEFF */
};
1895
1896
BOOL lre_is_space_non_ascii(uint32_t c)
1897
0
{
1898
0
    size_t i, n;
1899
1900
0
    n = countof(char_range_s);
1901
0
    for(i = 5; i < n; i += 2) {
1902
0
        uint32_t low = char_range_s[i];
1903
0
        uint32_t high = char_range_s[i + 1];
1904
0
        if (c < low)
1905
0
            return FALSE;
1906
0
        if (c < high)
1907
0
            return TRUE;
1908
0
    }
1909
0
    return FALSE;
1910
0
}