Coverage Report

Created: 2025-12-26 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/quickjs/libregexp.c
Line
Count
Source
1
/*
2
 * Regular Expression Engine
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <inttypes.h>
28
#include <string.h>
29
#include <assert.h>
30
31
#include "cutils.h"
32
#include "libregexp.h"
33
#include "libunicode.h"
34
35
/*
36
  TODO:
37
  - remove REOP_char_i and REOP_range_i by precomputing the case folding.
38
  - add specific opcodes for simple unicode property tests so that the
39
    generated bytecode is smaller.
40
  - Add a lock step execution mode (=linear time execution guaranteed)
41
    when the regular expression is "simple" i.e. no backreference nor
42
    complicated lookahead. The opcodes are designed for this execution
43
    model.
44
*/
45
46
#if defined(TEST) 
47
#define DUMP_REOP
48
#endif
49
//#define DUMP_REOP
50
//#define DUMP_EXEC
51
52
typedef enum {
53
#define DEF(id, size) REOP_ ## id,
54
#include "libregexp-opcode.h"
55
#undef DEF
56
    REOP_COUNT,
57
} REOPCodeEnum;
58
59
130k
#define CAPTURE_COUNT_MAX 255
60
488
#define REGISTER_COUNT_MAX 255
61
/* must be large enough to have a negligible runtime cost and small
62
   enough to call the interrupt callback often. */
63
125
#define INTERRUPT_COUNTER_INIT 10000
64
65
/* unicode code points */
66
1.53M
#define CP_LS   0x2028
67
768k
#define CP_PS   0x2029
68
69
#define TMP_BUF_SIZE 128
70
71
typedef struct {
72
    DynBuf byte_code;
73
    const uint8_t *buf_ptr;
74
    const uint8_t *buf_end;
75
    const uint8_t *buf_start;
76
    int re_flags;
77
    BOOL is_unicode;
78
    BOOL unicode_sets; /* if set, is_unicode is also set */
79
    BOOL ignore_case;
80
    BOOL multi_line;
81
    BOOL dotall;
82
    uint8_t group_name_scope;
83
    int capture_count;
84
    int total_capture_count; /* -1 = not computed yet */
85
    int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
86
    void *opaque;
87
    DynBuf group_names;
88
    union {
89
        char error_msg[TMP_BUF_SIZE];
90
        char tmp_buf[TMP_BUF_SIZE];
91
    } u;
92
} REParseState;
93
94
typedef struct {
95
#ifdef DUMP_REOP
96
    const char *name;
97
#endif
98
    uint8_t size;
99
} REOpCode;
100
101
static const REOpCode reopcode_info[REOP_COUNT] = {
102
#ifdef DUMP_REOP
103
#define DEF(id, size) { #id, size },
104
#else
105
#define DEF(id, size) { size },
106
#endif
107
#include "libregexp-opcode.h"
108
#undef DEF
109
};
110
111
15
#define RE_HEADER_FLAGS          0
112
35
#define RE_HEADER_CAPTURE_COUNT  2
113
15
#define RE_HEADER_REGISTER_COUNT 3
114
15
#define RE_HEADER_BYTECODE_LEN   4
115
116
60
#define RE_HEADER_LEN 8
117
118
1
static inline int is_digit(int c) {
119
1
    return c >= '0' && c <= '9';
120
1
}
121
122
/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
123
static int dbuf_insert(DynBuf *s, int pos, int len)
124
5.42k
{
125
5.42k
    if (dbuf_claim(s, len))
126
0
        return -1;
127
5.42k
    memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
128
5.42k
    s->size += len;
129
5.42k
    return 0;
130
5.42k
}
131
132
typedef struct REString {
133
    struct REString *next;
134
    uint32_t hash;
135
    uint32_t len;
136
    uint32_t buf[];
137
} REString;
138
139
typedef struct {
140
    /* the string list is the union of 'char_range' and of the strings
141
       in hash_table[]. The strings in hash_table[] have a length !=
142
       1. */
143
    CharRange cr;
144
    uint32_t n_strings;
145
    uint32_t hash_size;
146
    int hash_bits;
147
    REString **hash_table;
148
} REStringList;
149
150
static uint32_t re_string_hash(int len, const uint32_t *buf)
151
0
{
152
0
    int i;
153
0
    uint32_t h;
154
0
    h = 1;
155
0
    for(i = 0; i < len; i++)
156
0
        h = h * 263 + buf[i];
157
0
    return h * 0x61C88647;
158
0
}
159
160
static void re_string_list_init(REParseState *s1, REStringList *s)
161
254k
{
162
254k
    cr_init(&s->cr, s1->opaque, lre_realloc);
163
254k
    s->n_strings = 0;
164
254k
    s->hash_size = 0;
165
254k
    s->hash_bits = 0;
166
254k
    s->hash_table = NULL;
167
254k
}
168
169
static void re_string_list_free(REStringList *s)
170
254k
{
171
254k
    REString *p, *p_next;
172
254k
    int i;
173
254k
    for(i = 0; i < s->hash_size; i++) {
174
0
        for(p = s->hash_table[i]; p != NULL; p = p_next) {
175
0
            p_next = p->next;
176
0
            lre_realloc(s->cr.mem_opaque, p, 0);
177
0
        }
178
0
    }
179
254k
    lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
180
181
254k
    cr_free(&s->cr);
182
254k
}
183
184
static void lre_print_char(int c, BOOL is_range)
185
0
{
186
0
    if (c == '\'' || c == '\\' ||
187
0
        (is_range && (c == '-' || c == ']'))) {
188
0
        printf("\\%c", c);
189
0
    } else if (c >= ' ' && c <= 126) {
190
0
        printf("%c", c);
191
0
    } else {
192
0
        printf("\\u{%04x}", c);
193
0
    }
194
0
}
195
196
static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s)
197
0
{
198
0
    REString *p;
199
0
    const CharRange *cr;
200
0
    int i, j, k;
201
0
202
0
    printf("%s:\n", str);
203
0
    printf("  ranges: [");
204
0
    cr = &s->cr;
205
0
    for(i = 0; i < cr->len; i += 2) {
206
0
        lre_print_char(cr->points[i], TRUE);
207
0
        if (cr->points[i] != cr->points[i + 1] - 1) {
208
0
            printf("-");
209
0
            lre_print_char(cr->points[i + 1] - 1, TRUE);
210
0
        }
211
0
    }
212
0
    printf("]\n");
213
0
    
214
0
    j = 0;
215
0
    for(i = 0; i < s->hash_size; i++) {
216
0
        for(p = s->hash_table[i]; p != NULL; p = p->next) {
217
0
            printf("  %d/%d: '", j, s->n_strings);
218
0
            for(k = 0; k < p->len; k++) {
219
0
                lre_print_char(p->buf[k], FALSE);
220
0
            }
221
0
            printf("'\n");
222
0
            j++;
223
0
        }
224
0
    }
225
0
}
226
227
static int re_string_find2(REStringList *s, int len, const uint32_t *buf,
228
                           uint32_t h0, BOOL add_flag)
229
0
{
230
0
    uint32_t h = 0; /* avoid warning */
231
0
    REString *p;
232
0
    if (s->n_strings != 0) {
233
0
        h = h0 >> (32 - s->hash_bits);
234
0
        for(p = s->hash_table[h]; p != NULL; p = p->next) {
235
0
            if (p->hash == h0 && p->len == len &&
236
0
                !memcmp(p->buf, buf, len * sizeof(buf[0]))) {
237
0
                return 1;
238
0
            }
239
0
        }
240
0
    }
241
    /* not found */
242
0
    if (!add_flag)
243
0
        return 0;
244
    /* increase the size of the hash table if needed */
245
0
    if (unlikely((s->n_strings + 1) > s->hash_size)) {
246
0
        REString **new_hash_table, *p_next;
247
0
        int new_hash_bits, i;
248
0
        uint32_t new_hash_size;
249
0
        new_hash_bits = max_int(s->hash_bits + 1, 4);
250
0
        new_hash_size = 1 << new_hash_bits;
251
0
        new_hash_table = lre_realloc(s->cr.mem_opaque, NULL,
252
0
                                     sizeof(new_hash_table[0]) * new_hash_size);
253
0
        if (!new_hash_table)
254
0
            return -1;
255
0
        memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size);
256
0
        for(i = 0; i < s->hash_size; i++) {
257
0
            for(p = s->hash_table[i]; p != NULL; p = p_next) {
258
0
                p_next = p->next;
259
0
                h = p->hash >> (32 - new_hash_bits);
260
0
                p->next = new_hash_table[h];
261
0
                new_hash_table[h] = p;
262
0
            }
263
0
        }
264
0
        lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
265
0
        s->hash_bits = new_hash_bits;
266
0
        s->hash_size = new_hash_size;
267
0
        s->hash_table = new_hash_table;
268
0
        h = h0 >> (32 - s->hash_bits);
269
0
    }
270
271
0
    p = lre_realloc(s->cr.mem_opaque, NULL, sizeof(REString) + len * sizeof(buf[0]));
272
0
    if (!p)
273
0
        return -1;
274
0
    p->next = s->hash_table[h];
275
0
    s->hash_table[h] = p;
276
0
    s->n_strings++;
277
0
    p->hash = h0;
278
0
    p->len = len;
279
0
    memcpy(p->buf, buf, sizeof(buf[0]) * len);
280
0
    return 1;
281
0
}
282
283
static int re_string_find(REStringList *s, int len, const uint32_t *buf,
284
                          BOOL add_flag)
285
0
{
286
0
    uint32_t h0;
287
0
    h0 = re_string_hash(len, buf);
288
0
    return re_string_find2(s, len, buf, h0, add_flag);
289
0
}
290
291
/* return -1 if memory error, 0 if OK */
292
static int re_string_add(REStringList *s, int len, const uint32_t *buf)
293
0
{
294
0
    if (len == 1) {
295
0
        return cr_union_interval(&s->cr, buf[0], buf[0]);
296
0
    }
297
0
    if (re_string_find(s, len, buf, TRUE) < 0)
298
0
        return -1;
299
0
    return 0;
300
0
}
301
302
/* a = a op b */
303
static int re_string_list_op(REStringList *a, REStringList *b, int op)
304
47.9k
{
305
47.9k
    int i, ret;
306
47.9k
    REString *p, **pp;
307
308
47.9k
    if (cr_op1(&a->cr, b->cr.points, b->cr.len, op))
309
0
        return -1;
310
311
47.9k
    switch(op) {
312
47.9k
    case CR_OP_UNION:
313
47.9k
        if (b->n_strings != 0) {
314
0
            for(i = 0; i < b->hash_size; i++) {
315
0
                for(p = b->hash_table[i]; p != NULL; p = p->next) {
316
0
                    if (re_string_find2(a, p->len, p->buf, p->hash, TRUE) < 0)
317
0
                        return -1;
318
0
                }
319
0
            }
320
0
        }
321
47.9k
        break;
322
47.9k
    case CR_OP_INTER:
323
0
    case CR_OP_SUB:
324
0
        for(i = 0; i < a->hash_size; i++) {
325
0
            pp = &a->hash_table[i];
326
0
            for(;;) {
327
0
                p = *pp;
328
0
                if (p == NULL)
329
0
                    break;
330
0
                ret = re_string_find2(b, p->len, p->buf, p->hash, FALSE);
331
0
                if (op == CR_OP_SUB)
332
0
                    ret = !ret;
333
0
                if (!ret) {
334
                    /* remove it */
335
0
                    *pp = p->next;
336
0
                    a->n_strings--;
337
0
                    lre_realloc(a->cr.mem_opaque, p, 0);
338
0
                } else {
339
                    /* keep it */
340
0
                    pp = &p->next;
341
0
                }
342
0
            }
343
0
        }
344
0
        break;
345
0
    default:
346
0
        abort();
347
47.9k
    }
348
47.9k
    return 0;
349
47.9k
}
350
351
static int re_string_list_canonicalize(REParseState *s1,
352
                                       REStringList *s, BOOL is_unicode)
353
0
{
354
0
    if (cr_regexp_canonicalize(&s->cr, is_unicode))
355
0
        return -1;
356
0
    if (s->n_strings != 0) {
357
0
        REStringList a_s, *a = &a_s;
358
0
        int i, j;
359
0
        REString *p;
360
        
361
        /* XXX: simplify */
362
0
        re_string_list_init(s1, a);
363
364
0
        a->n_strings = s->n_strings;
365
0
        a->hash_size = s->hash_size;
366
0
        a->hash_bits = s->hash_bits;
367
0
        a->hash_table = s->hash_table;
368
        
369
0
        s->n_strings = 0;
370
0
        s->hash_size = 0;
371
0
        s->hash_bits = 0;
372
0
        s->hash_table = NULL;
373
374
0
        for(i = 0; i < a->hash_size; i++) {
375
0
            for(p = a->hash_table[i]; p != NULL; p = p->next) {
376
0
                for(j = 0; j < p->len; j++) {
377
0
                    p->buf[j] = lre_canonicalize(p->buf[j], is_unicode);
378
0
                }
379
0
                if (re_string_add(s, p->len, p->buf)) {
380
0
                    re_string_list_free(a);
381
0
                    return -1;
382
0
                }
383
0
            }
384
0
        }
385
0
        re_string_list_free(a);
386
0
    }
387
0
    return 0;
388
0
}
389
390
static const uint16_t char_range_d[] = {
391
    1,
392
    0x0030, 0x0039 + 1,
393
};
394
395
/* code point ranges for Zs,Zl or Zp property */
396
static const uint16_t char_range_s[] = {
397
    10,
398
    0x0009, 0x000D + 1,
399
    0x0020, 0x0020 + 1,
400
    0x00A0, 0x00A0 + 1,
401
    0x1680, 0x1680 + 1,
402
    0x2000, 0x200A + 1,
403
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
404
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
405
    0x2028, 0x2029 + 1,
406
    0x202F, 0x202F + 1,
407
    0x205F, 0x205F + 1,
408
    0x3000, 0x3000 + 1,
409
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
410
    0xFEFF, 0xFEFF + 1,
411
};
412
413
static const uint16_t char_range_w[] = {
414
    4,
415
    0x0030, 0x0039 + 1,
416
    0x0041, 0x005A + 1,
417
    0x005F, 0x005F + 1,
418
    0x0061, 0x007A + 1,
419
};
420
421
1.51M
#define CLASS_RANGE_BASE 0x40000000
422
423
typedef enum {
424
    CHAR_RANGE_d,
425
    CHAR_RANGE_D,
426
    CHAR_RANGE_s,
427
    CHAR_RANGE_S,
428
    CHAR_RANGE_w,
429
    CHAR_RANGE_W,
430
} CharRangeEnum;
431
432
static const uint16_t * const char_range_table[] = {
433
    char_range_d,
434
    char_range_s,
435
    char_range_w,
436
};
437
438
static int cr_init_char_range(REParseState *s, REStringList *cr, uint32_t c)
439
221k
{
440
221k
    BOOL invert;
441
221k
    const uint16_t *c_pt;
442
221k
    int len, i;
443
444
221k
    invert = c & 1;
445
221k
    c_pt = char_range_table[c >> 1];
446
221k
    len = *c_pt++;
447
221k
    re_string_list_init(s, cr);
448
4.65M
    for(i = 0; i < len * 2; i++) {
449
4.43M
        if (cr_add_point(&cr->cr, c_pt[i]))
450
0
            goto fail;
451
4.43M
    }
452
221k
    if (invert) {
453
221k
        if (cr_invert(&cr->cr))
454
0
            goto fail;
455
221k
    }
456
221k
    return 0;
457
0
 fail:
458
0
    re_string_list_free(cr);
459
0
    return -1;
460
221k
}
461
462
#ifdef DUMP_REOP
463
static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
464
                                                     int buf_len)
465
{
466
    int pos, len, opcode, bc_len, re_flags, i;
467
    uint32_t val, val2;
468
469
    assert(buf_len >= RE_HEADER_LEN);
470
471
    re_flags = lre_get_flags(buf);
472
    bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
473
    assert(bc_len + RE_HEADER_LEN <= buf_len);
474
    printf("flags: 0x%x capture_count=%d reg_count=%d\n",
475
           re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_REGISTER_COUNT]);
476
    if (re_flags & LRE_FLAG_NAMED_GROUPS) {
477
        const char *p;
478
        p = (char *)buf + RE_HEADER_LEN + bc_len;
479
        printf("named groups: ");
480
        for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
481
            if (i != 1)
482
                printf(",");
483
            printf("<%s>", p);
484
            p += strlen(p) + LRE_GROUP_NAME_TRAILER_LEN;
485
        }
486
        printf("\n");
487
        assert(p == (char *)(buf + buf_len));
488
    }
489
    printf("bytecode_len=%d\n", bc_len);
490
491
    buf += RE_HEADER_LEN;
492
    pos = 0;
493
    while (pos < bc_len) {
494
        printf("%5u: ", pos);
495
        opcode = buf[pos];
496
        len = reopcode_info[opcode].size;
497
        if (opcode >= REOP_COUNT) {
498
            printf(" invalid opcode=0x%02x\n", opcode);
499
            break;
500
        }
501
        if ((pos + len) > bc_len) {
502
            printf(" buffer overflow (opcode=0x%02x)\n", opcode);
503
            break;
504
        }
505
        printf("%s", reopcode_info[opcode].name);
506
        switch(opcode) {
507
        case REOP_char:
508
        case REOP_char_i:
509
            val = get_u16(buf + pos + 1);
510
            if (val >= ' ' && val <= 126)
511
                printf(" '%c'", val);
512
            else
513
                printf(" 0x%04x", val);
514
            break;
515
        case REOP_char32:
516
        case REOP_char32_i:
517
            val = get_u32(buf + pos + 1);
518
            if (val >= ' ' && val <= 126)
519
                printf(" '%c'", val);
520
            else
521
                printf(" 0x%08x", val);
522
            break;
523
        case REOP_goto:
524
        case REOP_split_goto_first:
525
        case REOP_split_next_first:
526
        case REOP_lookahead:
527
        case REOP_negative_lookahead:
528
            val = get_u32(buf + pos + 1);
529
            val += (pos + 5);
530
            printf(" %u", val);
531
            break;
532
        case REOP_loop:
533
            val2 = buf[pos + 1];
534
            val = get_u32(buf + pos + 2);
535
            val += (pos + 6);
536
            printf(" r%u, %u", val2, val);
537
            break;
538
        case REOP_loop_split_goto_first:
539
        case REOP_loop_split_next_first:
540
        case REOP_loop_check_adv_split_goto_first:
541
        case REOP_loop_check_adv_split_next_first:
542
            {
543
                uint32_t limit;
544
                val2 = buf[pos + 1];
545
                limit = get_u32(buf + pos + 2);
546
                val = get_u32(buf + pos + 6);
547
                val += (pos + 10);
548
                printf(" r%u, %u, %u", val2, limit, val);
549
            }
550
            break;
551
        case REOP_save_start:
552
        case REOP_save_end:
553
            printf(" %u", buf[pos + 1]);
554
            break;
555
        case REOP_back_reference:
556
        case REOP_back_reference_i:
557
        case REOP_backward_back_reference:
558
        case REOP_backward_back_reference_i:
559
            {
560
                int n, i;
561
                n = buf[pos + 1];
562
                len += n;
563
                for(i = 0; i < n; i++) {
564
                    if (i != 0)
565
                        printf(",");
566
                    printf(" %u", buf[pos + 2 + i]);
567
                }
568
            }
569
            break;
570
        case REOP_save_reset:
571
            printf(" %u %u", buf[pos + 1], buf[pos + 2]);
572
            break;
573
        case REOP_set_i32:
574
            val = buf[pos + 1];
575
            val2 = get_u32(buf + pos + 2);
576
            printf(" r%u, %d", val, val2);
577
            break;
578
        case REOP_set_char_pos:
579
        case REOP_check_advance:
580
            val = buf[pos + 1];
581
            printf(" r%u", val);
582
            break;
583
        case REOP_range:
584
        case REOP_range_i:
585
            {
586
                int n, i;
587
                n = get_u16(buf + pos + 1);
588
                len += n * 4;
589
                for(i = 0; i < n * 2; i++) {
590
                    val = get_u16(buf + pos + 3 + i * 2);
591
                    printf(" 0x%04x", val);
592
                }
593
            }
594
            break;
595
        case REOP_range32:
596
        case REOP_range32_i:
597
            {
598
                int n, i;
599
                n = get_u16(buf + pos + 1);
600
                len += n * 8;
601
                for(i = 0; i < n * 2; i++) {
602
                    val = get_u32(buf + pos + 3 + i * 4);
603
                    printf(" 0x%08x", val);
604
                }
605
            }
606
            break;
607
        default:
608
            break;
609
        }
610
        printf("\n");
611
        pos += len;
612
    }
613
}
614
#endif
615
616
static void re_emit_op(REParseState *s, int op)
617
441k
{
618
441k
    dbuf_putc(&s->byte_code, op);
619
441k
}
620
621
/* return the offset of the u32 value */
622
static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
623
2.35k
{
624
2.35k
    int pos;
625
2.35k
    dbuf_putc(&s->byte_code, op);
626
2.35k
    pos = s->byte_code.size;
627
2.35k
    dbuf_put_u32(&s->byte_code, val);
628
2.35k
    return pos;
629
2.35k
}
630
631
static int re_emit_goto(REParseState *s, int op, uint32_t val)
632
2.10k
{
633
2.10k
    int pos;
634
2.10k
    dbuf_putc(&s->byte_code, op);
635
2.10k
    pos = s->byte_code.size;
636
2.10k
    dbuf_put_u32(&s->byte_code, val - (pos + 4));
637
2.10k
    return pos;
638
2.10k
}
639
640
static int re_emit_goto_u8(REParseState *s, int op, uint32_t arg, uint32_t val)
641
1
{
642
1
    int pos;
643
1
    dbuf_putc(&s->byte_code, op);
644
1
    dbuf_putc(&s->byte_code, arg);
645
1
    pos = s->byte_code.size;
646
1
    dbuf_put_u32(&s->byte_code, val - (pos + 4));
647
1
    return pos;
648
1
}
649
650
static int re_emit_goto_u8_u32(REParseState *s, int op, uint32_t arg0, uint32_t arg1, uint32_t val)
651
447
{
652
447
    int pos;
653
447
    dbuf_putc(&s->byte_code, op);
654
447
    dbuf_putc(&s->byte_code, arg0);
655
447
    dbuf_put_u32(&s->byte_code, arg1);
656
447
    pos = s->byte_code.size;
657
447
    dbuf_put_u32(&s->byte_code, val - (pos + 4));
658
447
    return pos;
659
447
}
660
661
static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
662
6.00k
{
663
6.00k
    dbuf_putc(&s->byte_code, op);
664
6.00k
    dbuf_putc(&s->byte_code, val);
665
6.00k
}
666
667
static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
668
335k
{
669
335k
    dbuf_putc(&s->byte_code, op);
670
335k
    dbuf_put_u16(&s->byte_code, val);
671
335k
}
672
673
static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
674
15
{
675
15
    va_list ap;
676
15
    va_start(ap, fmt);
677
15
    vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
678
15
    va_end(ap);
679
15
    return -1;
680
15
}
681
682
static int re_parse_out_of_memory(REParseState *s)
683
0
{
684
0
    return re_parse_error(s, "out of memory");
685
0
}
686
687
/* If allow_overflow is false, return -1 in case of
688
   overflow. Otherwise return INT32_MAX. */
689
static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
690
801
{
691
801
    const uint8_t *p;
692
801
    uint64_t v;
693
801
    int c;
694
695
801
    p = *pp;
696
801
    v = 0;
697
1.69k
    for(;;) {
698
1.69k
        c = *p;
699
1.69k
        if (c < '0' || c > '9')
700
790
            break;
701
901
        v = v * 10 + c - '0';
702
901
        if (v >= INT32_MAX) {
703
11
            if (allow_overflow)
704
0
                v = INT32_MAX;
705
11
            else
706
11
                return -1;
707
11
        }
708
890
        p++;
709
890
    }
710
790
    *pp = p;
711
790
    return v;
712
801
}
713
714
static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
715
4.00k
{
716
4.00k
    const uint8_t *p;
717
4.00k
    p = *pp;
718
4.00k
    if (*p != c)
719
1
        return re_parse_error(s, "expecting '%c'", c);
720
4.00k
    p++;
721
4.00k
    *pp = p;
722
4.00k
    return 0;
723
4.00k
}
724
725
/* Parse an escape sequence, *pp points after the '\':
726
   allow_utf16 value:
727
   0 : no UTF-16 escapes allowed
728
   1 : UTF-16 escapes allowed
729
   2 : UTF-16 escapes allowed and escapes of surrogate pairs are
730
   converted to a unicode character (unicode regexp case).
731
732
   Return the unicode char and update *pp if recognized,
733
   return -1 if malformed escape,
734
   return -2 otherwise. */
735
int lre_parse_escape(const uint8_t **pp, int allow_utf16)
736
23.7k
{
737
23.7k
    const uint8_t *p;
738
23.7k
    uint32_t c;
739
740
23.7k
    p = *pp;
741
23.7k
    c = *p++;
742
23.7k
    switch(c) {
743
0
    case 'b':
744
0
        c = '\b';
745
0
        break;
746
290
    case 'f':
747
290
        c = '\f';
748
290
        break;
749
83
    case 'n':
750
83
        c = '\n';
751
83
        break;
752
0
    case 'r':
753
0
        c = '\r';
754
0
        break;
755
0
    case 't':
756
0
        c = '\t';
757
0
        break;
758
0
    case 'v':
759
0
        c = '\v';
760
0
        break;
761
40
    case 'x':
762
40
        {
763
40
            int h0, h1;
764
765
40
            h0 = from_hex(*p++);
766
40
            if (h0 < 0)
767
11
                return -1;
768
29
            h1 = from_hex(*p++);
769
29
            if (h1 < 0)
770
0
                return -1;
771
29
            c = (h0 << 4) | h1;
772
29
        }
773
0
        break;
774
1.17k
    case 'u':
775
1.17k
        {
776
1.17k
            int h, i;
777
1.17k
            uint32_t c1;
778
779
1.17k
            if (*p == '{' && allow_utf16) {
780
0
                p++;
781
0
                c = 0;
782
0
                for(;;) {
783
0
                    h = from_hex(*p++);
784
0
                    if (h < 0)
785
0
                        return -1;
786
0
                    c = (c << 4) | h;
787
0
                    if (c > 0x10FFFF)
788
0
                        return -1;
789
0
                    if (*p == '}')
790
0
                        break;
791
0
                }
792
0
                p++;
793
1.17k
            } else {
794
1.17k
                c = 0;
795
1.18k
                for(i = 0; i < 4; i++) {
796
1.18k
                    h = from_hex(*p++);
797
1.18k
                    if (h < 0) {
798
1.17k
                        return -1;
799
1.17k
                    }
800
2
                    c = (c << 4) | h;
801
2
                }
802
0
                if (is_hi_surrogate(c) &&
803
0
                    allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
804
                    /* convert an escaped surrogate pair into a
805
                       unicode char */
806
0
                    c1 = 0;
807
0
                    for(i = 0; i < 4; i++) {
808
0
                        h = from_hex(p[2 + i]);
809
0
                        if (h < 0)
810
0
                            break;
811
0
                        c1 = (c1 << 4) | h;
812
0
                    }
813
0
                    if (i == 4 && is_lo_surrogate(c1)) {
814
0
                        p += 6;
815
0
                        c = from_surrogate(c, c1);
816
0
                    }
817
0
                }
818
0
            }
819
1.17k
        }
820
0
        break;
821
630
    case '0': case '1': case '2': case '3':
822
16.1k
    case '4': case '5': case '6': case '7':
823
16.1k
        c -= '0';
824
16.1k
        if (allow_utf16 == 2) {
825
            /* only accept \0 not followed by digit */
826
0
            if (c != 0 || is_digit(*p))
827
0
                return -1;
828
16.1k
        } else {
829
            /* parse a legacy octal sequence */
830
16.1k
            uint32_t v;
831
16.1k
            v = *p - '0';
832
16.1k
            if (v > 7)
833
16.1k
                break;
834
13
            c = (c << 3) | v;
835
13
            p++;
836
13
            if (c >= 32)
837
4
                break;
838
9
            v = *p - '0';
839
9
            if (v > 7)
840
0
                break;
841
9
            c = (c << 3) | v;
842
9
            p++;
843
9
        }
844
9
        break;
845
6.00k
    default:
846
6.00k
        return -2;
847
23.7k
    }
848
16.5k
    *pp = p;
849
16.5k
    return c;
850
23.7k
}
851
852
#ifdef CONFIG_ALL_UNICODE
853
/* XXX: we use the same chars for name and value */
854
static BOOL is_unicode_char(int c)
855
0
{
856
0
    return ((c >= '0' && c <= '9') ||
857
0
            (c >= 'A' && c <= 'Z') ||
858
0
            (c >= 'a' && c <= 'z') ||
859
0
            (c == '_'));
860
0
}
861
862
/* XXX: memory error test */
863
static void seq_prop_cb(void *opaque, const uint32_t *seq, int seq_len)
864
0
{
865
0
    REStringList *sl = opaque;
866
0
    re_string_add(sl, seq_len, seq);
867
0
}
868
869
static int parse_unicode_property(REParseState *s, REStringList *cr,
870
                                  const uint8_t **pp, BOOL is_inv,
871
                                  BOOL allow_sequence_prop)
872
0
{
873
0
    const uint8_t *p;
874
0
    char name[64], value[64];
875
0
    char *q;
876
0
    BOOL script_ext;
877
0
    int ret;
878
879
0
    p = *pp;
880
0
    if (*p != '{')
881
0
        return re_parse_error(s, "expecting '{' after \\p");
882
0
    p++;
883
0
    q = name;
884
0
    while (is_unicode_char(*p)) {
885
0
        if ((q - name) >= sizeof(name) - 1)
886
0
            goto unknown_property_name;
887
0
        *q++ = *p++;
888
0
    }
889
0
    *q = '\0';
890
0
    q = value;
891
0
    if (*p == '=') {
892
0
        p++;
893
0
        while (is_unicode_char(*p)) {
894
0
            if ((q - value) >= sizeof(value) - 1)
895
0
                return re_parse_error(s, "unknown unicode property value");
896
0
            *q++ = *p++;
897
0
        }
898
0
    }
899
0
    *q = '\0';
900
0
    if (*p != '}')
901
0
        return re_parse_error(s, "expecting '}'");
902
0
    p++;
903
    //    printf("name=%s value=%s\n", name, value);
904
905
0
    if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
906
0
        script_ext = FALSE;
907
0
        goto do_script;
908
0
    } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
909
0
        script_ext = TRUE;
910
0
    do_script:
911
0
        re_string_list_init(s, cr);
912
0
        ret = unicode_script(&cr->cr, value, script_ext);
913
0
        if (ret) {
914
0
            re_string_list_free(cr);
915
0
            if (ret == -2)
916
0
                return re_parse_error(s, "unknown unicode script");
917
0
            else
918
0
                goto out_of_memory;
919
0
        }
920
0
    } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
921
0
        re_string_list_init(s, cr);
922
0
        ret = unicode_general_category(&cr->cr, value);
923
0
        if (ret) {
924
0
            re_string_list_free(cr);
925
0
            if (ret == -2)
926
0
                return re_parse_error(s, "unknown unicode general category");
927
0
            else
928
0
                goto out_of_memory;
929
0
        }
930
0
    } else if (value[0] == '\0') {
931
0
        re_string_list_init(s, cr);
932
0
        ret = unicode_general_category(&cr->cr, name);
933
0
        if (ret == -1) {
934
0
            re_string_list_free(cr);
935
0
            goto out_of_memory;
936
0
        }
937
0
        if (ret < 0) {
938
0
            ret = unicode_prop(&cr->cr, name);
939
0
            if (ret == -1) {
940
0
                re_string_list_free(cr);
941
0
                goto out_of_memory;
942
0
            }
943
0
        }
944
0
        if (ret < 0 && !is_inv && allow_sequence_prop) {
945
0
            CharRange cr_tmp;
946
0
            cr_init(&cr_tmp, s->opaque, lre_realloc);
947
0
            ret = unicode_sequence_prop(name, seq_prop_cb, cr, &cr_tmp);
948
0
            cr_free(&cr_tmp);
949
0
            if (ret == -1) {
950
0
                re_string_list_free(cr);
951
0
                goto out_of_memory;
952
0
            }
953
0
        }
954
0
        if (ret < 0)
955
0
            goto unknown_property_name;
956
0
    } else {
957
0
    unknown_property_name:
958
0
        return re_parse_error(s, "unknown unicode property name");
959
0
    }
960
961
    /* the ordering of case folding and inversion  differs with
962
       unicode_sets. 'unicode_sets' ordering is more consistent */
963
    /* XXX: the spec seems incorrect, we do it as the other engines
964
       seem to do it. */
965
0
    if (s->ignore_case && s->unicode_sets) {
966
0
        if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
967
0
            re_string_list_free(cr);
968
0
            goto out_of_memory;
969
0
        }
970
0
    }
971
0
    if (is_inv) {
972
0
        if (cr_invert(&cr->cr)) {
973
0
            re_string_list_free(cr);
974
0
            goto out_of_memory;
975
0
        }
976
0
    }
977
0
    if (s->ignore_case && !s->unicode_sets) {
978
0
        if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
979
0
            re_string_list_free(cr);
980
0
            goto out_of_memory;
981
0
        }
982
0
    }
983
0
    *pp = p;
984
0
    return 0;
985
0
 out_of_memory:
986
0
    return re_parse_out_of_memory(s);
987
0
}
988
#endif /* CONFIG_ALL_UNICODE */
989
990
static int get_class_atom(REParseState *s, REStringList *cr,
991
                          const uint8_t **pp, BOOL inclass);
992
993
static int parse_class_string_disjunction(REParseState *s, REStringList *cr,
994
                                          const uint8_t **pp)
995
0
{
996
0
    const uint8_t *p;
997
0
    DynBuf str;
998
0
    int c;
999
    
1000
0
    p = *pp;
1001
0
    if (*p != '{')
1002
0
        return re_parse_error(s, "expecting '{' after \\q");
1003
1004
0
    dbuf_init2(&str, s->opaque, lre_realloc);
1005
0
    re_string_list_init(s, cr);
1006
    
1007
0
    p++;
1008
0
    for(;;) {
1009
0
        str.size = 0;
1010
0
        while (*p != '}' && *p != '|') {
1011
0
            c = get_class_atom(s, NULL, &p, FALSE);
1012
0
            if (c < 0)
1013
0
                goto fail;
1014
0
            if (dbuf_put_u32(&str, c)) {
1015
0
                re_parse_out_of_memory(s);
1016
0
                goto fail;
1017
0
            }
1018
0
        }
1019
0
        if (re_string_add(cr, str.size / 4, (uint32_t *)str.buf)) {
1020
0
            re_parse_out_of_memory(s);
1021
0
            goto fail;
1022
0
        }
1023
0
        if (*p == '}')
1024
0
            break;
1025
0
        p++;
1026
0
    }
1027
0
    if (s->ignore_case) {
1028
0
        if (re_string_list_canonicalize(s, cr, TRUE))
1029
0
            goto fail;
1030
0
    }
1031
0
    p++; /* skip the '}' */
1032
0
    dbuf_free(&str);
1033
0
    *pp = p;
1034
0
    return 0;
1035
0
 fail:
1036
0
    dbuf_free(&str);
1037
0
    re_string_list_free(cr);
1038
0
    return -1;
1039
0
}
1040
1041
/* return -1 if error otherwise the character or a class range
1042
   (CLASS_RANGE_BASE) if cr != NULL. In case of class range, 'cr' is
1043
   initialized. Otherwise, it is ignored. */
1044
static int get_class_atom(REParseState *s, REStringList *cr,
1045
                          const uint8_t **pp, BOOL inclass)
1046
949k
{
1047
949k
    const uint8_t *p;
1048
949k
    uint32_t c;
1049
949k
    int ret;
1050
1051
949k
    p = *pp;
1052
1053
949k
    c = *p;
1054
949k
    switch(c) {
1055
284k
    case '\\':
1056
284k
        p++;
1057
284k
        if (p >= s->buf_end)
1058
0
            goto unexpected_end;
1059
284k
        c = *p++;
1060
284k
        switch(c) {
1061
0
        case 'd':
1062
0
            c = CHAR_RANGE_d;
1063
0
            goto class_range;
1064
0
        case 'D':
1065
0
            c = CHAR_RANGE_D;
1066
0
            goto class_range;
1067
180
        case 's':
1068
180
            c = CHAR_RANGE_s;
1069
180
            goto class_range;
1070
221k
        case 'S':
1071
221k
            c = CHAR_RANGE_S;
1072
221k
            goto class_range;
1073
0
        case 'w':
1074
0
            c = CHAR_RANGE_w;
1075
0
            goto class_range;
1076
1
        case 'W':
1077
1
            c = CHAR_RANGE_W;
1078
221k
        class_range:
1079
221k
            if (!cr)
1080
0
                goto default_escape;
1081
221k
            if (cr_init_char_range(s, cr, c))
1082
0
                return -1;
1083
221k
            c += CLASS_RANGE_BASE;
1084
221k
            break;
1085
110
        case 'c':
1086
110
            c = *p;
1087
110
            if ((c >= 'a' && c <= 'z') ||
1088
104
                (c >= 'A' && c <= 'Z') ||
1089
100
                (((c >= '0' && c <= '9') || c == '_') &&
1090
12
                 inclass && !s->is_unicode)) {   /* Annex B.1.4 */
1091
10
                c &= 0x1f;
1092
10
                p++;
1093
100
            } else if (s->is_unicode) {
1094
0
                goto invalid_escape;
1095
100
            } else {
1096
                /* otherwise return '\' and 'c' */
1097
100
                p--;
1098
100
                c = '\\';
1099
100
            }
1100
110
            break;
1101
110
        case '-':
1102
12
            if (!inclass && s->is_unicode)
1103
0
                goto invalid_escape;
1104
12
            break;
1105
12
        case '^':
1106
0
        case '$':
1107
37.8k
        case '\\':
1108
37.8k
        case '.':
1109
37.8k
        case '*':
1110
37.8k
        case '+':
1111
37.8k
        case '?':
1112
37.8k
        case '(':
1113
37.8k
        case ')':
1114
38.5k
        case '[':
1115
38.5k
        case ']':
1116
38.5k
        case '{':
1117
38.5k
        case '}':
1118
38.7k
        case '|':
1119
38.7k
        case '/':
1120
            /* always valid to escape these characters */
1121
38.7k
            break;
1122
0
#ifdef CONFIG_ALL_UNICODE
1123
2
        case 'p':
1124
2
        case 'P':
1125
2
            if (s->is_unicode && cr) {
1126
0
                if (parse_unicode_property(s, cr, &p, (c == 'P'), s->unicode_sets))
1127
0
                    return -1;
1128
0
                c = CLASS_RANGE_BASE;
1129
0
                break;
1130
0
            }
1131
2
            goto default_escape;
1132
2
#endif
1133
699
        case 'q':
1134
699
            if (s->unicode_sets && cr && inclass) {
1135
0
                if (parse_class_string_disjunction(s, cr, &p))
1136
0
                    return -1;
1137
0
                c = CLASS_RANGE_BASE;
1138
0
                break;
1139
0
            }
1140
699
            goto default_escape;
1141
23.0k
        default:
1142
23.7k
        default_escape:
1143
23.7k
            p--;
1144
23.7k
            ret = lre_parse_escape(&p, s->is_unicode * 2);
1145
23.7k
            if (ret >= 0) {
1146
16.5k
                c = ret;
1147
16.5k
            } else {
1148
7.19k
                if (s->is_unicode) {
1149
0
                invalid_escape:
1150
0
                    return re_parse_error(s, "invalid escape sequence in regular expression");
1151
7.19k
                } else {
1152
                    /* just ignore the '\' */
1153
7.19k
                    goto normal_char;
1154
7.19k
                }
1155
7.19k
            }
1156
16.5k
            break;
1157
284k
        }
1158
277k
        break;
1159
277k
    case '\0':
1160
1
        if (p >= s->buf_end) {
1161
1
        unexpected_end:
1162
1
            return re_parse_error(s, "unexpected end");
1163
1
        }
1164
        /* fall thru */
1165
0
        goto normal_char;
1166
1167
3.04k
    case '&':
1168
3.05k
    case '!':
1169
4.07k
    case '#':
1170
4.55k
    case '$':
1171
5.18k
    case '%':
1172
5.35k
    case '*':
1173
5.37k
    case '+':
1174
6.03k
    case ',':
1175
6.50k
    case '.':
1176
15.8k
    case ':':
1177
16.0k
    case ';':
1178
19.3k
    case '<':
1179
21.4k
    case '=':
1180
24.5k
    case '>':
1181
48.3k
    case '?':
1182
54.7k
    case '@':
1183
56.8k
    case '^':
1184
57.0k
    case '`':
1185
57.0k
    case '~':
1186
57.0k
        if (s->unicode_sets && p[1] == c) {
1187
            /* forbidden double characters */
1188
0
            return re_parse_error(s, "invalid class set operation in regular expression");
1189
0
        }
1190
57.0k
        goto normal_char;
1191
1192
57.0k
    case '(':
1193
19.1k
    case ')':
1194
30.7k
    case '[':
1195
45.0k
    case ']':
1196
45.0k
    case '{':
1197
45.0k
    case '}':
1198
46.1k
    case '/':
1199
71.9k
    case '-':
1200
74.9k
    case '|':
1201
74.9k
        if (s->unicode_sets) {
1202
            /* invalid characters in unicode sets */
1203
0
            return re_parse_error(s, "invalid character in class in regular expression");
1204
0
        }
1205
74.9k
        goto normal_char;
1206
1207
533k
    default:
1208
672k
    normal_char:
1209
        /* normal char */
1210
672k
        if (c >= 128) {
1211
32.8k
            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1212
32.8k
            if ((unsigned)c > 0xffff && !s->is_unicode) {
1213
                /* XXX: should handle non BMP-1 code points */
1214
10
                return re_parse_error(s, "malformed unicode char");
1215
10
            }
1216
639k
        } else {
1217
639k
            p++;
1218
639k
        }
1219
672k
        break;
1220
949k
    }
1221
949k
    *pp = p;
1222
949k
    return c;
1223
949k
}
1224
1225
static int re_emit_range(REParseState *s, const CharRange *cr)
1226
32.5k
{
1227
32.5k
    int len, i;
1228
32.5k
    uint32_t high;
1229
1230
32.5k
    len = (unsigned)cr->len / 2;
1231
32.5k
    if (len >= 65535)
1232
0
        return re_parse_error(s, "too many ranges");
1233
32.5k
    if (len == 0) {
1234
17
        re_emit_op_u32(s, REOP_char32, -1);
1235
32.4k
    } else {
1236
32.4k
        high = cr->points[cr->len - 1];
1237
32.4k
        if (high == UINT32_MAX)
1238
14
            high = cr->points[cr->len - 2];
1239
32.4k
        if (high <= 0xffff) {
1240
            /* can use 16 bit ranges with the conversion that 0xffff =
1241
               infinity */
1242
15.8k
            re_emit_op_u16(s, s->ignore_case ? REOP_range_i : REOP_range, len);
1243
94.5k
            for(i = 0; i < cr->len; i += 2) {
1244
78.7k
                dbuf_put_u16(&s->byte_code, cr->points[i]);
1245
78.7k
                high = cr->points[i + 1] - 1;
1246
78.7k
                if (high == UINT32_MAX - 1)
1247
14
                    high = 0xffff;
1248
78.7k
                dbuf_put_u16(&s->byte_code, high);
1249
78.7k
            }
1250
16.6k
        } else {
1251
16.6k
            re_emit_op_u16(s, s->ignore_case ? REOP_range32_i : REOP_range32, len);
1252
4.70M
            for(i = 0; i < cr->len; i += 2) {
1253
4.69M
                dbuf_put_u32(&s->byte_code, cr->points[i]);
1254
4.69M
                dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
1255
4.69M
            }
1256
16.6k
        }
1257
32.4k
    }
1258
32.5k
    return 0;
1259
32.5k
}
1260
1261
static int re_string_cmp_len(const void *a, const void *b, void *arg)
1262
0
{
1263
0
    REString *p1 = *(REString **)a;
1264
0
    REString *p2 = *(REString **)b;
1265
0
    return (p1->len < p2->len) - (p1->len > p2->len);
1266
0
}
1267
1268
static void re_emit_char(REParseState *s, int c)
1269
302k
{
1270
302k
    if (c <= 0xffff)
1271
302k
        re_emit_op_u16(s, s->ignore_case ? REOP_char_i : REOP_char, c);
1272
0
    else
1273
0
        re_emit_op_u32(s, s->ignore_case ? REOP_char32_i : REOP_char32, c);
1274
302k
}
1275
1276
static int re_emit_string_list(REParseState *s, const REStringList *sl)
1277
32.5k
{
1278
32.5k
    REString **tab, *p;
1279
32.5k
    int i, j, split_pos, last_match_pos, n;
1280
32.5k
    BOOL has_empty_string, is_last;
1281
    
1282
    //    re_string_list_dump("sl", sl);
1283
32.5k
    if (sl->n_strings == 0) {
1284
        /* simple case: only characters */
1285
32.5k
        if (re_emit_range(s, &sl->cr))
1286
0
            return -1;
1287
32.5k
    } else {
1288
        /* at least one string list is present : match the longest ones first */
1289
        /* XXX: add a new op_switch opcode to compile as a trie */
1290
0
        tab = lre_realloc(s->opaque, NULL, sizeof(tab[0]) * sl->n_strings);
1291
0
        if (!tab) {
1292
0
            re_parse_out_of_memory(s);
1293
0
            return -1;
1294
0
        }
1295
0
        has_empty_string = FALSE;
1296
0
        n = 0;
1297
0
        for(i = 0; i < sl->hash_size; i++) {
1298
0
            for(p = sl->hash_table[i]; p != NULL; p = p->next) {
1299
0
                if (p->len == 0) {
1300
0
                    has_empty_string = TRUE;
1301
0
                } else {
1302
0
                    tab[n++] = p;
1303
0
                }
1304
0
            }
1305
0
        }
1306
0
        assert(n <= sl->n_strings);
1307
        
1308
0
        rqsort(tab, n, sizeof(tab[0]), re_string_cmp_len, NULL);
1309
1310
0
        last_match_pos = -1;
1311
0
        for(i = 0; i < n; i++) {
1312
0
            p = tab[i];
1313
0
            is_last = !has_empty_string && sl->cr.len == 0 && i == (n - 1);
1314
0
            if (!is_last)
1315
0
                split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
1316
0
            else
1317
0
                split_pos = 0;
1318
0
            for(j = 0; j < p->len; j++) {
1319
0
                re_emit_char(s, p->buf[j]);
1320
0
            }
1321
0
            if (!is_last) {
1322
0
                last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos);
1323
0
                put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
1324
0
            }
1325
0
        }
1326
1327
0
        if (sl->cr.len != 0) {
1328
            /* char range */
1329
0
            is_last = !has_empty_string;
1330
0
            if (!is_last)
1331
0
                split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
1332
0
            else
1333
0
                split_pos = 0; /* not used */
1334
0
            if (re_emit_range(s, &sl->cr)) {
1335
0
                lre_realloc(s->opaque, tab, 0);
1336
0
                return -1;
1337
0
            }
1338
0
            if (!is_last)
1339
0
                put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
1340
0
        }
1341
1342
        /* patch the 'goto match' */
1343
0
        while (last_match_pos != -1) {
1344
0
            int next_pos = get_u32(s->byte_code.buf + last_match_pos);
1345
0
            put_u32(s->byte_code.buf + last_match_pos, s->byte_code.size - (last_match_pos + 4));
1346
0
            last_match_pos = next_pos;
1347
0
        }
1348
        
1349
0
        lre_realloc(s->opaque, tab, 0);
1350
0
    }
1351
32.5k
    return 0;
1352
32.5k
}
1353
1354
static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp);
1355
1356
static int re_parse_class_set_operand(REParseState *s, REStringList *cr, const uint8_t **pp)
1357
0
{
1358
0
    int c1;
1359
0
    const uint8_t *p = *pp;
1360
    
1361
0
    if (*p == '[') {
1362
0
        if (re_parse_nested_class(s, cr, pp))
1363
0
            return -1;
1364
0
    } else {
1365
0
        c1 = get_class_atom(s, cr, pp, TRUE);
1366
0
        if (c1 < 0)
1367
0
            return -1;
1368
0
        if (c1 < CLASS_RANGE_BASE) {
1369
            /* create a range with a single character */
1370
0
            re_string_list_init(s, cr);
1371
0
            if (s->ignore_case)
1372
0
                c1 = lre_canonicalize(c1, s->is_unicode);
1373
0
            if (cr_union_interval(&cr->cr, c1, c1)) {
1374
0
                re_string_list_free(cr);
1375
0
                return -1;
1376
0
            }
1377
0
        }
1378
0
    }
1379
0
    return 0;
1380
0
}
1381
1382
static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp)
1383
32.5k
{
1384
32.5k
    const uint8_t *p;
1385
32.5k
    uint32_t c1, c2;
1386
32.5k
    int ret;
1387
32.5k
    REStringList cr1_s, *cr1 = &cr1_s;
1388
32.5k
    BOOL invert, is_first;
1389
1390
32.5k
    if (lre_check_stack_overflow(s->opaque, 0))
1391
0
        return re_parse_error(s, "stack overflow");
1392
1393
32.5k
    re_string_list_init(s, cr);
1394
32.5k
    p = *pp;
1395
32.5k
    p++;    /* skip '[' */
1396
1397
32.5k
    invert = FALSE;
1398
32.5k
    if (*p == '^') {
1399
0
        p++;
1400
0
        invert = TRUE;
1401
0
    }
1402
    
1403
    /* handle unions */
1404
32.5k
    is_first = TRUE;
1405
489k
    for(;;) {
1406
489k
        if (*p == ']')
1407
32.5k
            break;
1408
457k
        if (*p == '[' && s->unicode_sets) {
1409
0
            if (re_parse_nested_class(s, cr1, &p))
1410
0
                goto fail;
1411
0
            goto class_union;
1412
457k
        } else {
1413
457k
            c1 = get_class_atom(s, cr1, &p, TRUE);
1414
457k
            if ((int)c1 < 0)
1415
10
                goto fail;
1416
457k
            if (*p == '-' && p[1] != ']') {
1417
15.8k
                const uint8_t *p0 = p + 1;
1418
15.8k
                if (p[1] == '-' && s->unicode_sets && is_first)
1419
0
                    goto class_atom; /* first character class followed by '--' */
1420
15.8k
                if (c1 >= CLASS_RANGE_BASE) {
1421
0
                    if (s->is_unicode) {
1422
0
                        re_string_list_free(cr1);
1423
0
                        goto invalid_class_range;
1424
0
                    }
1425
                    /* Annex B: match '-' character */
1426
0
                    goto class_atom;
1427
0
                }
1428
15.8k
                c2 = get_class_atom(s, cr1, &p0, TRUE);
1429
15.8k
                if ((int)c2 < 0)
1430
0
                    goto fail;
1431
15.8k
                if (c2 >= CLASS_RANGE_BASE) {
1432
0
                    re_string_list_free(cr1);
1433
0
                    if (s->is_unicode) {
1434
0
                        goto invalid_class_range;
1435
0
                    }
1436
                    /* Annex B: match '-' character */
1437
0
                    goto class_atom;
1438
0
                }
1439
15.8k
                p = p0;
1440
15.8k
                if (c2 < c1) {
1441
0
                invalid_class_range:
1442
0
                    re_parse_error(s, "invalid class range");
1443
0
                    goto fail;
1444
0
                }
1445
15.8k
                if (s->ignore_case) {
1446
15.8k
                    CharRange cr2_s, *cr2 = &cr2_s;
1447
15.8k
                    cr_init(cr2, s->opaque, lre_realloc);
1448
15.8k
                    if (cr_add_interval(cr2, c1, c2 + 1) ||
1449
15.8k
                        cr_regexp_canonicalize(cr2, s->is_unicode) ||
1450
15.8k
                        cr_op1(&cr->cr, cr2->points, cr2->len, CR_OP_UNION)) {
1451
0
                        cr_free(cr2);
1452
0
                        goto memory_error;
1453
0
                    }
1454
15.8k
                    cr_free(cr2);
1455
15.8k
                } else {
1456
78
                    if (cr_union_interval(&cr->cr, c1, c2))
1457
0
                        goto memory_error;
1458
78
                }
1459
15.8k
                is_first = FALSE; /* union operation */
1460
441k
            } else {
1461
441k
            class_atom:
1462
441k
                if (c1 >= CLASS_RANGE_BASE) {
1463
47.9k
                class_union:
1464
47.9k
                    ret = re_string_list_op(cr, cr1, CR_OP_UNION);
1465
47.9k
                    re_string_list_free(cr1);
1466
47.9k
                    if (ret)
1467
0
                        goto memory_error;
1468
393k
                } else {
1469
393k
                    if (s->ignore_case)
1470
391k
                        c1 = lre_canonicalize(c1, s->is_unicode);
1471
393k
                    if (cr_union_interval(&cr->cr, c1, c1))
1472
0
                        goto memory_error;
1473
393k
                }
1474
441k
            }
1475
457k
        }
1476
457k
        if (s->unicode_sets && is_first) {
1477
0
            if (*p == '&' && p[1] == '&' && p[2] != '&') {
1478
                /* handle '&&' */
1479
0
                for(;;) {
1480
0
                    if (*p == ']') {
1481
0
                        break;
1482
0
                    } else if (*p == '&' && p[1] == '&' && p[2] != '&') {
1483
0
                        p += 2;
1484
0
                    } else {
1485
0
                        goto invalid_operation;
1486
0
                    }
1487
0
                    if (re_parse_class_set_operand(s, cr1, &p))
1488
0
                        goto fail;
1489
0
                    ret = re_string_list_op(cr, cr1, CR_OP_INTER);
1490
0
                    re_string_list_free(cr1);
1491
0
                    if (ret)
1492
0
                        goto memory_error;
1493
0
                }
1494
0
            } else if (*p == '-' && p[1] == '-') {
1495
                /* handle '--' */
1496
0
                for(;;) {
1497
0
                    if (*p == ']') {
1498
0
                        break;
1499
0
                    } else if (*p == '-' && p[1] == '-') {
1500
0
                        p += 2;
1501
0
                    } else {
1502
0
                    invalid_operation:
1503
0
                        re_parse_error(s, "invalid operation in regular expression");
1504
0
                        goto fail;
1505
0
                    }
1506
0
                    if (re_parse_class_set_operand(s, cr1, &p))
1507
0
                        goto fail;
1508
0
                    ret = re_string_list_op(cr, cr1, CR_OP_SUB);
1509
0
                    re_string_list_free(cr1);
1510
0
                    if (ret)
1511
0
                        goto memory_error;
1512
0
                }
1513
0
            }
1514
0
        }
1515
457k
        is_first = FALSE;
1516
457k
    }
1517
1518
32.5k
    p++;    /* skip ']' */
1519
32.5k
    *pp = p;
1520
32.5k
    if (invert) {
1521
        /* XXX: add may_contain_string syntax check to be fully
1522
           compliant. The test here accepts more input than the
1523
           spec. */
1524
0
        if (cr->n_strings != 0) {
1525
0
            re_parse_error(s, "negated character class with strings in regular expression debugger eval code");
1526
0
            goto fail;
1527
0
        }
1528
0
        if (cr_invert(&cr->cr))
1529
0
            goto memory_error;
1530
0
    }
1531
32.5k
    return 0;
1532
0
 memory_error:
1533
0
    re_parse_out_of_memory(s);
1534
10
 fail:
1535
10
    re_string_list_free(cr);
1536
10
    return -1;
1537
0
}
1538
1539
static int re_parse_char_class(REParseState *s, const uint8_t **pp)
1540
32.5k
{
1541
32.5k
    REStringList cr_s, *cr = &cr_s;
1542
1543
32.5k
    if (re_parse_nested_class(s, cr, pp))
1544
10
        return -1;
1545
32.5k
    if (re_emit_string_list(s, cr))
1546
0
        goto fail;
1547
32.5k
    re_string_list_free(cr);
1548
32.5k
    return 0;
1549
0
 fail:
1550
0
    re_string_list_free(cr);
1551
0
    return -1;
1552
32.5k
}
1553
1554
/* need_check_adv: false if the opcodes always advance the char pointer
1555
   need_capture_init: true if all the captures in the atom are not set
1556
*/
1557
static BOOL re_need_check_adv_and_capture_init(BOOL *pneed_capture_init,
1558
                                               const uint8_t *bc_buf, int bc_buf_len)
1559
4.10k
{
1560
4.10k
    int pos, opcode, len;
1561
4.10k
    uint32_t val;
1562
4.10k
    BOOL need_check_adv, need_capture_init;
1563
1564
4.10k
    need_check_adv = TRUE;
1565
4.10k
    need_capture_init = FALSE;
1566
4.10k
    pos = 0;
1567
25.9k
    while (pos < bc_buf_len) {
1568
24.0k
        opcode = bc_buf[pos];
1569
24.0k
        len = reopcode_info[opcode].size;
1570
24.0k
        switch(opcode) {
1571
0
        case REOP_range:
1572
401
        case REOP_range_i:
1573
401
            val = get_u16(bc_buf + pos + 1);
1574
401
            len += val * 4;
1575
401
            need_check_adv = FALSE;
1576
401
            break;
1577
0
        case REOP_range32:
1578
788
        case REOP_range32_i:
1579
788
            val = get_u16(bc_buf + pos + 1);
1580
788
            len += val * 8;
1581
788
            need_check_adv = FALSE;
1582
788
            break;
1583
456
        case REOP_char:
1584
7.57k
        case REOP_char_i:
1585
7.58k
        case REOP_char32:
1586
7.58k
        case REOP_char32_i:
1587
7.72k
        case REOP_dot:
1588
7.72k
        case REOP_any:
1589
7.72k
        case REOP_space:
1590
7.72k
        case REOP_not_space:
1591
7.72k
            need_check_adv = FALSE;
1592
7.72k
            break;
1593
0
        case REOP_line_start:
1594
0
        case REOP_line_start_m:
1595
134
        case REOP_line_end:
1596
134
        case REOP_line_end_m:
1597
204
        case REOP_set_i32:
1598
274
        case REOP_set_char_pos:
1599
274
        case REOP_word_boundary:
1600
274
        case REOP_word_boundary_i:
1601
276
        case REOP_not_word_boundary:
1602
276
        case REOP_not_word_boundary_i:
1603
10.7k
        case REOP_prev:
1604
            /* no effect */
1605
10.7k
            break;
1606
872
        case REOP_save_start:
1607
1.74k
        case REOP_save_end:
1608
2.08k
        case REOP_save_reset:
1609
2.08k
            break;
1610
1
        case REOP_back_reference:
1611
1
        case REOP_back_reference_i:
1612
1
        case REOP_backward_back_reference:
1613
134
        case REOP_backward_back_reference_i:
1614
134
            val = bc_buf[pos + 1];
1615
134
            len += val;
1616
134
            need_capture_init = TRUE;
1617
134
            break;
1618
2.20k
        default:
1619
            /* safe behavior: we cannot predict the outcome */
1620
2.20k
            need_capture_init = TRUE;
1621
2.20k
            goto done;
1622
24.0k
        }
1623
21.8k
        pos += len;
1624
21.8k
    }
1625
4.10k
 done:
1626
4.10k
    *pneed_capture_init = need_capture_init;
1627
4.10k
    return need_check_adv;
1628
4.10k
}
1629
1630
/* '*pp' is the first char after '<' */
1631
static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
1632
2.71k
{
1633
2.71k
    const uint8_t *p, *p1;
1634
2.71k
    uint32_t c, d;
1635
2.71k
    char *q;
1636
1637
2.71k
    p = *pp;
1638
2.71k
    q = buf;
1639
8.55k
    for(;;) {
1640
8.55k
        c = *p;
1641
8.55k
        if (c == '\\') {
1642
1
            p++;
1643
1
            if (*p != 'u')
1644
0
                return -1;
1645
1
            c = lre_parse_escape(&p, 2); // accept surrogate pairs
1646
8.54k
        } else if (c == '>') {
1647
2.71k
            break;
1648
5.83k
        } else if (c >= 128) {
1649
0
            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1650
0
            if (is_hi_surrogate(c)) {
1651
0
                d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
1652
0
                if (is_lo_surrogate(d)) {
1653
0
                    c = from_surrogate(c, d);
1654
0
                    p = p1;
1655
0
                }
1656
0
            }
1657
5.83k
        } else {
1658
5.83k
            p++;
1659
5.83k
        }
1660
5.83k
        if (c > 0x10FFFF)
1661
1
            return -1;
1662
5.83k
        if (q == buf) {
1663
2.71k
            if (!lre_js_is_ident_first(c))
1664
0
                return -1;
1665
3.11k
        } else {
1666
3.11k
            if (!lre_js_is_ident_next(c))
1667
0
                return -1;
1668
3.11k
        }
1669
5.83k
        if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
1670
0
            return -1;
1671
5.83k
        if (c < 128) {
1672
5.83k
            *q++ = c;
1673
5.83k
        } else {
1674
0
            q += unicode_to_utf8((uint8_t*)q, c);
1675
0
        }
1676
5.83k
    }
1677
2.71k
    if (q == buf)
1678
0
        return -1;
1679
2.71k
    *q = '\0';
1680
2.71k
    p++;
1681
2.71k
    *pp = p;
1682
2.71k
    return 0;
1683
2.71k
}
1684
1685
/* if capture_name = NULL: return the number of captures + 1.
1686
   Otherwise, return the number of matching capture groups  */
1687
static int re_parse_captures(REParseState *s, int *phas_named_captures,
1688
                             const char *capture_name, BOOL emit_group_index)
1689
848
{
1690
848
    const uint8_t *p;
1691
848
    int capture_index, n;
1692
848
    char name[TMP_BUF_SIZE];
1693
1694
848
    capture_index = 1;
1695
848
    n = 0;
1696
848
    *phas_named_captures = 0;
1697
13.5M
    for (p = s->buf_start; p < s->buf_end; p++) {
1698
13.5M
        switch (*p) {
1699
253k
        case '(':
1700
253k
            if (p[1] == '?') {
1701
127k
                if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
1702
1.14k
                    *phas_named_captures = 1;
1703
                    /* potential named capture */
1704
1.14k
                    if (capture_name) {
1705
1.14k
                        p += 3;
1706
1.14k
                        if (re_parse_group_name(name, sizeof(name), &p) == 0) {
1707
1.14k
                            if (!strcmp(name, capture_name)) {
1708
744
                                if (emit_group_index)
1709
372
                                    dbuf_putc(&s->byte_code, capture_index);
1710
744
                                n++;
1711
744
                            }
1712
1.14k
                        }
1713
1.14k
                    }
1714
1.14k
                    capture_index++;
1715
1.14k
                    if (capture_index >= CAPTURE_COUNT_MAX)
1716
4
                        goto done;
1717
1.14k
                }
1718
127k
            } else {
1719
126k
                capture_index++;
1720
126k
                if (capture_index >= CAPTURE_COUNT_MAX)
1721
0
                    goto done;
1722
126k
            }
1723
253k
            break;
1724
1.53M
        case '\\':
1725
1.53M
            p++;
1726
1.53M
            break;
1727
2.90M
        case '[':
1728
46.8M
            for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
1729
43.9M
                if (*p == '\\')
1730
2.15M
                    p++;
1731
43.9M
            }
1732
2.90M
            break;
1733
13.5M
        }
1734
13.5M
    }
1735
848
 done:
1736
848
    if (capture_name) {
1737
838
        return n;
1738
838
    } else {
1739
10
        return capture_index;
1740
10
    }
1741
848
}
1742
1743
static int re_count_captures(REParseState *s)
1744
10
{
1745
10
    if (s->total_capture_count < 0) {
1746
10
        s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1747
10
                                                   NULL, FALSE);
1748
10
    }
1749
10
    return s->total_capture_count;
1750
10
}
1751
1752
static BOOL re_has_named_captures(REParseState *s)
1753
832
{
1754
832
    if (s->has_named_captures < 0)
1755
9
        re_count_captures(s);
1756
832
    return s->has_named_captures;
1757
832
}
1758
1759
static int find_group_name(REParseState *s, const char *name, BOOL emit_group_index)
1760
1.20k
{
1761
1.20k
    const char *p, *buf_end;
1762
1.20k
    size_t len, name_len;
1763
1.20k
    int capture_index, n;
1764
1765
1.20k
    p = (char *)s->group_names.buf;
1766
1.20k
    if (!p)
1767
0
        return 0;
1768
1.20k
    buf_end = (char *)s->group_names.buf + s->group_names.size;
1769
1.20k
    name_len = strlen(name);
1770
1.20k
    capture_index = 1;
1771
1.20k
    n = 0;
1772
122k
    while (p < buf_end) {
1773
121k
        len = strlen(p);
1774
121k
        if (len == name_len && memcmp(name, p, name_len) == 0) {
1775
23.8k
            if (emit_group_index)
1776
11.9k
                dbuf_putc(&s->byte_code, capture_index);
1777
23.8k
            n++;
1778
23.8k
        }
1779
121k
        p += len + LRE_GROUP_NAME_TRAILER_LEN;
1780
121k
        capture_index++;
1781
121k
    }
1782
1.20k
    return n;
1783
1.20k
}
1784
1785
static BOOL is_duplicate_group_name(REParseState *s, const char *name, int scope)
1786
556
{
1787
556
    const char *p, *buf_end;
1788
556
    size_t len, name_len;
1789
556
    int scope1;
1790
    
1791
556
    p = (char *)s->group_names.buf;
1792
556
    if (!p)
1793
0
        return 0;
1794
556
    buf_end = (char *)s->group_names.buf + s->group_names.size;
1795
556
    name_len = strlen(name);
1796
64.8k
    while (p < buf_end) {
1797
64.3k
        len = strlen(p);
1798
64.3k
        if (len == name_len && memcmp(name, p, name_len) == 0) {
1799
26.3k
            scope1 = (uint8_t)p[len + 1];
1800
26.3k
            if (scope == scope1)
1801
2
                return TRUE;
1802
26.3k
        }
1803
64.3k
        p += len + LRE_GROUP_NAME_TRAILER_LEN;
1804
64.3k
    }
1805
554
    return FALSE;
1806
556
}
1807
1808
static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1809
1810
static int re_parse_modifiers(REParseState *s, const uint8_t **pp)
1811
996
{
1812
996
    const uint8_t *p = *pp;
1813
996
    int mask = 0;
1814
996
    int val;
1815
1816
1.99k
    for(;;) {
1817
1.99k
        if (*p == 'i') {
1818
996
            val = LRE_FLAG_IGNORECASE;
1819
996
        } else if (*p == 'm') {
1820
0
            val = LRE_FLAG_MULTILINE;
1821
996
        } else if (*p == 's') {
1822
0
            val = LRE_FLAG_DOTALL;
1823
996
        } else {
1824
996
            break;
1825
996
        }
1826
996
        if (mask & val)
1827
0
            return re_parse_error(s, "duplicate modifier: '%c'", *p);
1828
996
        mask |= val;
1829
996
        p++;
1830
996
    }
1831
996
    *pp = p;
1832
996
    return mask;
1833
996
}
1834
1835
static BOOL update_modifier(BOOL val, int add_mask, int remove_mask,
1836
                            int mask)
1837
2.98k
{
1838
2.98k
    if (add_mask & mask)
1839
996
        val = TRUE;
1840
2.98k
    if (remove_mask & mask)
1841
0
        val = FALSE;
1842
2.98k
    return val;
1843
2.98k
}
1844
1845
static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1846
527k
{
1847
527k
    const uint8_t *p;
1848
527k
    int c, last_atom_start, quant_min, quant_max, last_capture_count;
1849
527k
    BOOL greedy, is_neg, is_backward_lookahead;
1850
527k
    REStringList cr_s, *cr = &cr_s;
1851
1852
527k
    last_atom_start = -1;
1853
527k
    last_capture_count = 0;
1854
527k
    p = s->buf_ptr;
1855
527k
    c = *p;
1856
527k
    switch(c) {
1857
313
    case '^':
1858
313
        p++;
1859
313
        re_emit_op(s, s->multi_line ? REOP_line_start_m : REOP_line_start);
1860
313
        break;
1861
1.52k
    case '$':
1862
1.52k
        p++;
1863
1.52k
        re_emit_op(s, s->multi_line ? REOP_line_end_m : REOP_line_end);
1864
1.52k
        break;
1865
286
    case '.':
1866
286
        p++;
1867
286
        last_atom_start = s->byte_code.size;
1868
286
        last_capture_count = s->capture_count;
1869
286
        if (is_backward_dir)
1870
127
            re_emit_op(s, REOP_prev);
1871
286
        re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1872
286
        if (is_backward_dir)
1873
127
            re_emit_op(s, REOP_prev);
1874
286
        break;
1875
0
    case '{':
1876
0
        if (s->is_unicode) {
1877
0
            return re_parse_error(s, "syntax error");
1878
0
        } else if (!is_digit(p[1])) {
1879
            /* Annex B: we accept '{' not followed by digits as a
1880
               normal atom */
1881
0
            goto parse_class_atom;
1882
0
        } else {
1883
0
            const uint8_t *p1 = p + 1;
1884
            /* Annex B: error if it is like a repetition count */
1885
0
            parse_digits(&p1, TRUE);
1886
0
            if (*p1 == ',') {
1887
0
                p1++;
1888
0
                if (is_digit(*p1)) {
1889
0
                    parse_digits(&p1, TRUE);
1890
0
                }
1891
0
            }
1892
0
            if (*p1 != '}') {
1893
0
                goto parse_class_atom;
1894
0
            }
1895
0
        }
1896
        /* fall thru */
1897
0
    case '*':
1898
0
    case '+':
1899
0
    case '?':
1900
0
        return re_parse_error(s, "nothing to repeat");
1901
4.06k
    case '(':
1902
4.06k
        if (p[1] == '?') {
1903
2.01k
            if (p[2] == ':') {
1904
55
                p += 3;
1905
55
                last_atom_start = s->byte_code.size;
1906
55
                last_capture_count = s->capture_count;
1907
55
                s->buf_ptr = p;
1908
55
                if (re_parse_disjunction(s, is_backward_dir))
1909
0
                    return -1;
1910
55
                p = s->buf_ptr;
1911
55
                if (re_parse_expect(s, &p, ')'))
1912
0
                    return -1;
1913
1.96k
            } else if (p[2] == 'i' || p[2] == 'm' || p[2] == 's' || p[2] == '-') {
1914
996
                BOOL saved_ignore_case, saved_multi_line, saved_dotall;
1915
996
                int add_mask, remove_mask;
1916
996
                p += 2;
1917
996
                remove_mask = 0;
1918
996
                add_mask = re_parse_modifiers(s, &p);
1919
996
                if (add_mask < 0)
1920
0
                    return -1;
1921
996
                if (*p == '-') {
1922
0
                    p++;
1923
0
                    remove_mask = re_parse_modifiers(s, &p);
1924
0
                    if (remove_mask < 0)
1925
0
                        return -1;
1926
0
                }
1927
996
                if ((add_mask == 0 && remove_mask == 0) ||
1928
996
                    (add_mask & remove_mask) != 0) {
1929
0
                    return re_parse_error(s, "invalid modifiers");
1930
0
                }
1931
996
                if (re_parse_expect(s, &p, ':'))
1932
0
                    return -1;
1933
996
                saved_ignore_case = s->ignore_case;
1934
996
                saved_multi_line = s->multi_line;
1935
996
                saved_dotall = s->dotall;
1936
996
                s->ignore_case = update_modifier(s->ignore_case, add_mask, remove_mask, LRE_FLAG_IGNORECASE);
1937
996
                s->multi_line = update_modifier(s->multi_line, add_mask, remove_mask, LRE_FLAG_MULTILINE);
1938
996
                s->dotall = update_modifier(s->dotall, add_mask, remove_mask, LRE_FLAG_DOTALL);
1939
1940
996
                last_atom_start = s->byte_code.size;
1941
996
                last_capture_count = s->capture_count;
1942
996
                s->buf_ptr = p;
1943
996
                if (re_parse_disjunction(s, is_backward_dir))
1944
67
                    return -1;
1945
929
                p = s->buf_ptr;
1946
929
                if (re_parse_expect(s, &p, ')'))
1947
0
                    return -1;
1948
929
                s->ignore_case = saved_ignore_case;
1949
929
                s->multi_line = saved_multi_line;
1950
929
                s->dotall = saved_dotall;
1951
965
            } else if ((p[2] == '=' || p[2] == '!')) {
1952
268
                is_neg = (p[2] == '!');
1953
268
                is_backward_lookahead = FALSE;
1954
268
                p += 3;
1955
268
                goto lookahead;
1956
697
            } else if (p[2] == '<' &&
1957
697
                       (p[3] == '=' || p[3] == '!')) {
1958
140
                int pos;
1959
140
                is_neg = (p[3] == '!');
1960
140
                is_backward_lookahead = TRUE;
1961
140
                p += 4;
1962
                /* lookahead */
1963
408
            lookahead:
1964
                /* Annex B allows lookahead to be used as an atom for
1965
                   the quantifiers */
1966
408
                if (!s->is_unicode && !is_backward_lookahead)  {
1967
268
                    last_atom_start = s->byte_code.size;
1968
268
                    last_capture_count = s->capture_count;
1969
268
                }
1970
408
                pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
1971
408
                s->buf_ptr = p;
1972
408
                if (re_parse_disjunction(s, is_backward_lookahead))
1973
143
                    return -1;
1974
265
                p = s->buf_ptr;
1975
265
                if (re_parse_expect(s, &p, ')'))
1976
0
                    return -1;
1977
265
                re_emit_op(s, REOP_lookahead_match + is_neg);
1978
                /* jump after the 'match' after the lookahead is successful */
1979
265
                if (dbuf_error(&s->byte_code))
1980
0
                    return -1;
1981
265
                put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
1982
557
            } else if (p[2] == '<') {
1983
557
                p += 3;
1984
557
                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1985
557
                                        &p)) {
1986
1
                    return re_parse_error(s, "invalid group name");
1987
1
                }
1988
                /* poor's man method to test duplicate group
1989
                   names. */
1990
                /* XXX: this method does not catch all the errors*/
1991
556
                if (is_duplicate_group_name(s, s->u.tmp_buf, s->group_name_scope)) {
1992
2
                    return re_parse_error(s, "duplicate group name");
1993
2
                }
1994
                /* group name with a trailing zero */
1995
554
                dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1996
554
                         strlen(s->u.tmp_buf) + 1);
1997
554
                dbuf_putc(&s->group_names, s->group_name_scope);
1998
554
                s->has_named_captures = 1;
1999
554
                goto parse_capture;
2000
556
            } else {
2001
0
                return re_parse_error(s, "invalid group");
2002
0
            }
2003
2.04k
        } else {
2004
2.04k
            int capture_index;
2005
2.04k
            p++;
2006
            /* capture without group name */
2007
2.04k
            dbuf_putc(&s->group_names, 0);
2008
2.04k
            dbuf_putc(&s->group_names, 0);
2009
2.60k
        parse_capture:
2010
2.60k
            if (s->capture_count >= CAPTURE_COUNT_MAX)
2011
0
                return re_parse_error(s, "too many captures");
2012
2.60k
            last_atom_start = s->byte_code.size;
2013
2.60k
            last_capture_count = s->capture_count;
2014
2.60k
            capture_index = s->capture_count++;
2015
2.60k
            re_emit_op_u8(s, REOP_save_start + is_backward_dir,
2016
2.60k
                          capture_index);
2017
2018
2.60k
            s->buf_ptr = p;
2019
2.60k
            if (re_parse_disjunction(s, is_backward_dir))
2020
839
                return -1;
2021
1.76k
            p = s->buf_ptr;
2022
2023
1.76k
            re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
2024
1.76k
                          capture_index);
2025
2026
1.76k
            if (re_parse_expect(s, &p, ')'))
2027
1
                return -1;
2028
1.76k
        }
2029
3.01k
        break;
2030
218k
    case '\\':
2031
218k
        switch(p[1]) {
2032
0
        case 'b':
2033
10.8k
        case 'B':
2034
10.8k
            if (p[1] != 'b') {
2035
10.8k
                re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary);
2036
10.8k
            } else {
2037
0
                re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_word_boundary_i : REOP_word_boundary);
2038
0
            }
2039
10.8k
            p += 2;
2040
10.8k
            break;
2041
1.02k
        case 'k':
2042
1.02k
            {
2043
1.02k
                const uint8_t *p1;
2044
1.02k
                int dummy_res, n;
2045
1.02k
                BOOL is_forward;
2046
                
2047
1.02k
                p1 = p;
2048
1.02k
                if (p1[2] != '<') {
2049
                    /* annex B: we tolerate invalid group names in non
2050
                       unicode mode if there is no named capture
2051
                       definition */
2052
0
                    if (s->is_unicode || re_has_named_captures(s))
2053
0
                        return re_parse_error(s, "expecting group name");
2054
0
                    else
2055
0
                        goto parse_class_atom;
2056
0
                }
2057
1.02k
                p1 += 3;
2058
1.02k
                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
2059
1.02k
                                        &p1)) {
2060
0
                    if (s->is_unicode || re_has_named_captures(s))
2061
0
                        return re_parse_error(s, "invalid group name");
2062
0
                    else
2063
0
                        goto parse_class_atom;
2064
0
                }
2065
1.02k
                is_forward = FALSE;
2066
1.02k
                n = find_group_name(s, s->u.tmp_buf, FALSE);
2067
1.02k
                if (n == 0) {
2068
                    /* no capture name parsed before, try to look
2069
                       after (inefficient, but hopefully not common */
2070
835
                    n = re_parse_captures(s, &dummy_res, s->u.tmp_buf, FALSE);
2071
835
                    if (n == 0) {
2072
832
                        if (s->is_unicode || re_has_named_captures(s))
2073
0
                            return re_parse_error(s, "group name not defined");
2074
832
                        else
2075
832
                            goto parse_class_atom;
2076
832
                    }
2077
3
                    is_forward = TRUE;
2078
3
                }
2079
190
                last_atom_start = s->byte_code.size;
2080
190
                last_capture_count = s->capture_count;
2081
                
2082
                /* emit back references to all the captures indexes matching the group name */
2083
190
                re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, n);
2084
190
                if (is_forward) {
2085
3
                    re_parse_captures(s, &dummy_res, s->u.tmp_buf, TRUE);
2086
187
                } else {
2087
187
                    find_group_name(s, s->u.tmp_buf, TRUE);
2088
187
                }
2089
190
                p = p1;
2090
190
            }
2091
0
            break;
2092
21
        case '0':
2093
21
            p += 2;
2094
21
            c = 0;
2095
21
            if (s->is_unicode) {
2096
0
                if (is_digit(*p)) {
2097
0
                    return re_parse_error(s, "invalid decimal escape in regular expression");
2098
0
                }
2099
21
            } else {
2100
                /* Annex B.1.4: accept legacy octal */
2101
21
                if (*p >= '0' && *p <= '7') {
2102
0
                    c = *p++ - '0';
2103
0
                    if (*p >= '0' && *p <= '7') {
2104
0
                        c = (c << 3) + *p++ - '0';
2105
0
                    }
2106
0
                }
2107
21
            }
2108
21
            goto normal_char;
2109
520
        case '1': case '2': case '3': case '4':
2110
800
        case '5': case '6': case '7': case '8':
2111
800
        case '9':
2112
800
            {
2113
800
                const uint8_t *q = ++p;
2114
2115
800
                c = parse_digits(&p, FALSE);
2116
800
                if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
2117
11
                    if (!s->is_unicode) {
2118
                        /* Annex B.1.4: accept legacy octal */
2119
11
                        p = q;
2120
11
                        if (*p <= '7') {
2121
11
                            c = 0;
2122
11
                            if (*p <= '3')
2123
0
                                c = *p++ - '0';
2124
11
                            if (*p >= '0' && *p <= '7') {
2125
11
                                c = (c << 3) + *p++ - '0';
2126
11
                                if (*p >= '0' && *p <= '7') {
2127
11
                                    c = (c << 3) + *p++ - '0';
2128
11
                                }
2129
11
                            }
2130
11
                        } else {
2131
0
                            c = *p++;
2132
0
                        }
2133
11
                        goto normal_char;
2134
11
                    }
2135
0
                    return re_parse_error(s, "back reference out of range in regular expression");
2136
11
                }
2137
789
                last_atom_start = s->byte_code.size;
2138
789
                last_capture_count = s->capture_count;
2139
                
2140
789
                re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, 1);
2141
789
                dbuf_putc(&s->byte_code, c);
2142
789
            }
2143
0
            break;
2144
206k
        default:
2145
206k
            goto parse_class_atom;
2146
218k
        }
2147
11.8k
        break;
2148
32.5k
    case '[':
2149
32.5k
        last_atom_start = s->byte_code.size;
2150
32.5k
        last_capture_count = s->capture_count;
2151
32.5k
        if (is_backward_dir)
2152
31.5k
            re_emit_op(s, REOP_prev);
2153
32.5k
        if (re_parse_char_class(s, &p))
2154
10
            return -1;
2155
32.5k
        if (is_backward_dir)
2156
31.4k
            re_emit_op(s, REOP_prev);
2157
32.5k
        break;
2158
14.3k
    case ']':
2159
14.3k
    case '}':
2160
14.3k
        if (s->is_unicode)
2161
0
            return re_parse_error(s, "syntax error");
2162
14.3k
        goto parse_class_atom;
2163
255k
    default:
2164
476k
    parse_class_atom:
2165
476k
        c = get_class_atom(s, cr, &p, FALSE);
2166
476k
        if ((int)c < 0)
2167
1
            return -1;
2168
476k
    normal_char:
2169
476k
        last_atom_start = s->byte_code.size;
2170
476k
        last_capture_count = s->capture_count;
2171
476k
        if (is_backward_dir)
2172
95.6k
            re_emit_op(s, REOP_prev);
2173
476k
        if (c >= CLASS_RANGE_BASE) {
2174
173k
            int ret = 0;
2175
            /* optimize the common 'space' tests */
2176
173k
            if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) {
2177
37
                re_emit_op(s, REOP_space);
2178
173k
            } else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) {
2179
173k
                re_emit_op(s, REOP_not_space);
2180
173k
            } else {
2181
1
                ret = re_emit_string_list(s, cr);
2182
1
            }
2183
173k
            re_string_list_free(cr);
2184
173k
            if (ret)
2185
0
                return -1;
2186
302k
        } else {
2187
302k
            if (s->ignore_case)
2188
86.5k
                c = lre_canonicalize(c, s->is_unicode);
2189
302k
            re_emit_char(s, c);
2190
302k
        }
2191
476k
        if (is_backward_dir)
2192
95.6k
            re_emit_op(s, REOP_prev);
2193
476k
        break;
2194
527k
    }
2195
2196
    /* quantifier */
2197
525k
    if (last_atom_start >= 0) {
2198
513k
        c = *p;
2199
513k
        switch(c) {
2200
3
        case '*':
2201
3
            p++;
2202
3
            quant_min = 0;
2203
3
            quant_max = INT32_MAX;
2204
3
            goto quantifier;
2205
2.54k
        case '+':
2206
2.54k
            p++;
2207
2.54k
            quant_min = 1;
2208
2.54k
            quant_max = INT32_MAX;
2209
2.54k
            goto quantifier;
2210
1.55k
        case '?':
2211
1.55k
            p++;
2212
1.55k
            quant_min = 0;
2213
1.55k
            quant_max = 1;
2214
1.55k
            goto quantifier;
2215
1
        case '{':
2216
1
            {
2217
1
                const uint8_t *p1 = p;
2218
                /* As an extension (see ES6 annex B), we accept '{' not
2219
                   followed by digits as a normal atom */
2220
1
                if (!is_digit(p[1])) {
2221
0
                    if (s->is_unicode)
2222
0
                        goto invalid_quant_count;
2223
0
                    break;
2224
0
                }
2225
1
                p++;
2226
1
                quant_min = parse_digits(&p, TRUE);
2227
1
                quant_max = quant_min;
2228
1
                if (*p == ',') {
2229
0
                    p++;
2230
0
                    if (is_digit(*p)) {
2231
0
                        quant_max = parse_digits(&p, TRUE);
2232
0
                        if (quant_max < quant_min) {
2233
0
                        invalid_quant_count:
2234
0
                            return re_parse_error(s, "invalid repetition count");
2235
0
                        }
2236
0
                    } else {
2237
0
                        quant_max = INT32_MAX; /* infinity */
2238
0
                    }
2239
0
                }
2240
1
                if (*p != '}' && !s->is_unicode) {
2241
                    /* Annex B: normal atom if invalid '{' syntax */
2242
0
                    p = p1;
2243
0
                    break;
2244
0
                }
2245
1
                if (re_parse_expect(s, &p, '}'))
2246
0
                    return -1;
2247
1
            }
2248
4.10k
        quantifier:
2249
4.10k
            greedy = TRUE;
2250
4.10k
            if (*p == '?') {
2251
146
                p++;
2252
146
                greedy = FALSE;
2253
146
            }
2254
4.10k
            if (last_atom_start < 0) {
2255
0
                return re_parse_error(s, "nothing to repeat");
2256
0
            }
2257
4.10k
            {
2258
4.10k
                BOOL need_capture_init, add_zero_advance_check;
2259
4.10k
                int len, pos;
2260
                
2261
                /* the spec tells that if there is no advance when
2262
                   running the atom after the first quant_min times,
2263
                   then there is no match. We remove this test when we
2264
                   are sure the atom always advances the position. */
2265
4.10k
                add_zero_advance_check =
2266
4.10k
                    re_need_check_adv_and_capture_init(&need_capture_init,
2267
4.10k
                                                       s->byte_code.buf + last_atom_start,
2268
4.10k
                                                       s->byte_code.size - last_atom_start);
2269
            
2270
                /* general case: need to reset the capture at each
2271
                   iteration. We don't do it if there are no captures
2272
                   in the atom or if we are sure all captures are
2273
                   initialized in the atom. If quant_min = 0, we still
2274
                   need to reset once the captures in case the atom
2275
                   does not match. */
2276
4.10k
                if (need_capture_init && last_capture_count != s->capture_count) {
2277
1.54k
                    if (dbuf_insert(&s->byte_code, last_atom_start, 3))
2278
0
                        goto out_of_memory;
2279
1.54k
                    int pos = last_atom_start;
2280
1.54k
                    s->byte_code.buf[pos++] = REOP_save_reset;
2281
1.54k
                    s->byte_code.buf[pos++] = last_capture_count;
2282
1.54k
                    s->byte_code.buf[pos++] = s->capture_count - 1;
2283
1.54k
                }
2284
2285
4.10k
                len = s->byte_code.size - last_atom_start;
2286
4.10k
                if (quant_min == 0) {
2287
                    /* need to reset the capture in case the atom is
2288
                       not executed */
2289
1.55k
                    if (!need_capture_init && last_capture_count != s->capture_count) {
2290
6
                        if (dbuf_insert(&s->byte_code, last_atom_start, 3))
2291
0
                            goto out_of_memory;
2292
6
                        s->byte_code.buf[last_atom_start++] = REOP_save_reset;
2293
6
                        s->byte_code.buf[last_atom_start++] = last_capture_count;
2294
6
                        s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
2295
6
                    }
2296
1.55k
                    if (quant_max == 0) {
2297
0
                        s->byte_code.size = last_atom_start;
2298
1.55k
                    } else if (quant_max == 1 || quant_max == INT32_MAX) {
2299
1.55k
                        BOOL has_goto = (quant_max == INT32_MAX);
2300
1.55k
                        if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check * 2))
2301
0
                            goto out_of_memory;
2302
1.55k
                        s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
2303
1.55k
                            greedy;
2304
1.55k
                        put_u32(s->byte_code.buf + last_atom_start + 1,
2305
1.55k
                                len + 5 * has_goto + add_zero_advance_check * 2 * 2);
2306
1.55k
                        if (add_zero_advance_check) {
2307
618
                            s->byte_code.buf[last_atom_start + 1 + 4] = REOP_set_char_pos;
2308
618
                            s->byte_code.buf[last_atom_start + 1 + 4 + 1] = 0;
2309
618
                            re_emit_op_u8(s, REOP_check_advance, 0);
2310
618
                        }
2311
1.55k
                        if (has_goto)
2312
3
                            re_emit_goto(s, REOP_goto, last_atom_start);
2313
1.55k
                    } else {
2314
0
                        if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2))
2315
0
                            goto out_of_memory;
2316
0
                        pos = last_atom_start;
2317
0
                        s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
2318
0
                        put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10);
2319
0
                        pos += 4;
2320
2321
0
                        s->byte_code.buf[pos++] = REOP_set_i32;
2322
0
                        s->byte_code.buf[pos++] = 0;
2323
0
                        put_u32(s->byte_code.buf + pos, quant_max);
2324
0
                        pos += 4;
2325
0
                        last_atom_start = pos;
2326
0
                        if (add_zero_advance_check) {
2327
0
                            s->byte_code.buf[pos++] = REOP_set_char_pos;
2328
0
                            s->byte_code.buf[pos++] = 0;
2329
0
                        }
2330
0
                        re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start);
2331
0
                    }
2332
2.54k
                } else if (quant_min == 1 && quant_max == INT32_MAX &&
2333
2.54k
                           !add_zero_advance_check) {
2334
2.10k
                    re_emit_goto(s, REOP_split_next_first - greedy,
2335
2.10k
                                 last_atom_start);
2336
2.10k
                } else {
2337
448
                    if (quant_min == quant_max)
2338
1
                        add_zero_advance_check = FALSE;
2339
448
                    if (dbuf_insert(&s->byte_code, last_atom_start, 6 + add_zero_advance_check * 2))
2340
0
                        goto out_of_memory;
2341
                    /* Note: we assume the string length is < INT32_MAX */
2342
448
                    pos = last_atom_start;
2343
448
                    s->byte_code.buf[pos++] = REOP_set_i32;
2344
448
                    s->byte_code.buf[pos++] = 0;
2345
448
                    put_u32(s->byte_code.buf + pos, quant_max);
2346
448
                    pos += 4;
2347
448
                    last_atom_start = pos;
2348
448
                    if (add_zero_advance_check) {
2349
447
                        s->byte_code.buf[pos++] = REOP_set_char_pos;
2350
447
                        s->byte_code.buf[pos++] = 0;
2351
447
                    }
2352
448
                    if (quant_min == quant_max) {
2353
                        /* a simple loop is enough */
2354
1
                        re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
2355
447
                    } else {
2356
447
                        re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max - quant_min, last_atom_start);
2357
447
                    }
2358
448
                }
2359
4.10k
                last_atom_start = -1;
2360
4.10k
            }
2361
0
            break;
2362
509k
        default:
2363
509k
            break;
2364
513k
        }
2365
513k
    }
2366
525k
    s->buf_ptr = p;
2367
525k
    return 0;
2368
0
 out_of_memory:
2369
0
    return re_parse_out_of_memory(s);
2370
525k
}
2371
2372
static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
2373
5.96k
{
2374
5.96k
    const uint8_t *p;
2375
5.96k
    int ret;
2376
5.96k
    size_t start, term_start, end, term_size;
2377
2378
5.96k
    start = s->byte_code.size;
2379
531k
    for(;;) {
2380
531k
        p = s->buf_ptr;
2381
531k
        if (p >= s->buf_end)
2382
16
            break;
2383
531k
        if (*p == '|' || *p == ')')
2384
4.88k
            break;
2385
527k
        term_start = s->byte_code.size;
2386
527k
        ret = re_parse_term(s, is_backward_dir);
2387
527k
        if (ret)
2388
1.06k
            return ret;
2389
525k
        if (is_backward_dir) {
2390
            /* reverse the order of the terms (XXX: inefficient, but
2391
               speed is not really critical here) */
2392
141k
            end = s->byte_code.size;
2393
141k
            term_size = end - term_start;
2394
141k
            if (dbuf_claim(&s->byte_code, term_size))
2395
0
                return -1;
2396
141k
            memmove(s->byte_code.buf + start + term_size,
2397
141k
                    s->byte_code.buf + start,
2398
141k
                    end - start);
2399
141k
            memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
2400
141k
                   term_size);
2401
141k
        }
2402
525k
    }
2403
4.89k
    return 0;
2404
5.96k
}
2405
2406
static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
2407
4.09k
{
2408
4.09k
    int start, len, pos;
2409
2410
4.09k
    if (lre_check_stack_overflow(s->opaque, 0))
2411
0
        return re_parse_error(s, "stack overflow");
2412
2413
4.09k
    start = s->byte_code.size;
2414
4.09k
    if (re_parse_alternative(s, is_backward_dir))
2415
446
        return -1;
2416
4.89k
    while (*s->buf_ptr == '|') {
2417
1.87k
        s->buf_ptr++;
2418
2419
1.87k
        len = s->byte_code.size - start;
2420
2421
        /* insert a split before the first alternative */
2422
1.87k
        if (dbuf_insert(&s->byte_code, start, 5)) {
2423
0
            return re_parse_out_of_memory(s);
2424
0
        }
2425
1.87k
        s->byte_code.buf[start] = REOP_split_next_first;
2426
1.87k
        put_u32(s->byte_code.buf + start + 1, len + 5);
2427
2428
1.87k
        pos = re_emit_op_u32(s, REOP_goto, 0);
2429
2430
1.87k
        s->group_name_scope++;
2431
        
2432
1.87k
        if (re_parse_alternative(s, is_backward_dir))
2433
618
            return -1;
2434
2435
        /* patch the goto */
2436
1.25k
        len = s->byte_code.size - (pos + 4);
2437
1.25k
        put_u32(s->byte_code.buf + pos, len);
2438
1.25k
    }
2439
3.02k
    return 0;
2440
3.64k
}
2441
2442
/* Allocate the registers as a stack. The control flow is recursive so
2443
   the analysis can be linear. */
2444
static int compute_register_count(uint8_t *bc_buf, int bc_buf_len)
2445
15
{
2446
15
    int stack_size, stack_size_max, pos, opcode, len;
2447
15
    uint32_t val;
2448
2449
15
    stack_size = 0;
2450
15
    stack_size_max = 0;
2451
15
    bc_buf += RE_HEADER_LEN;
2452
15
    bc_buf_len -= RE_HEADER_LEN;
2453
15
    pos = 0;
2454
224k
    while (pos < bc_buf_len) {
2455
224k
        opcode = bc_buf[pos];
2456
224k
        len = reopcode_info[opcode].size;
2457
224k
        assert(opcode < REOP_COUNT);
2458
224k
        assert((pos + len) <= bc_buf_len);
2459
224k
        switch(opcode) {
2460
39
        case REOP_set_i32:
2461
562
        case REOP_set_char_pos:
2462
562
            bc_buf[pos + 1] = stack_size;
2463
562
            stack_size++;
2464
562
            if (stack_size > stack_size_max) {
2465
488
                if (stack_size > REGISTER_COUNT_MAX)
2466
0
                    return -1;
2467
488
                stack_size_max = stack_size;
2468
488
            }
2469
562
            break;
2470
562
        case REOP_check_advance:
2471
486
        case REOP_loop:
2472
486
        case REOP_loop_split_goto_first:
2473
486
        case REOP_loop_split_next_first:
2474
486
            assert(stack_size > 0);
2475
486
            stack_size--;
2476
486
            bc_buf[pos + 1] = stack_size;
2477
486
            break;
2478
27
        case REOP_loop_check_adv_split_goto_first:
2479
38
        case REOP_loop_check_adv_split_next_first:
2480
38
            assert(stack_size >= 2);
2481
38
            stack_size -= 2;
2482
38
            bc_buf[pos + 1] = stack_size;
2483
38
            break;
2484
1
        case REOP_range:
2485
1
        case REOP_range_i:
2486
1
            val = get_u16(bc_buf + pos + 1);
2487
1
            len += val * 4;
2488
1
            break;
2489
0
        case REOP_range32:
2490
3
        case REOP_range32_i:
2491
3
            val = get_u16(bc_buf + pos + 1);
2492
3
            len += val * 8;
2493
3
            break;
2494
1
        case REOP_back_reference:
2495
1
        case REOP_back_reference_i:
2496
1
        case REOP_backward_back_reference:
2497
1
        case REOP_backward_back_reference_i:
2498
1
            val = bc_buf[pos + 1];
2499
1
            len += val;
2500
1
            break;
2501
224k
        }
2502
224k
        pos += len;
2503
224k
    }
2504
15
    return stack_size_max;
2505
15
}
2506
2507
static void *lre_bytecode_realloc(void *opaque, void *ptr, size_t size)
2508
752
{
2509
752
    if (size > (INT32_MAX / 2)) {
2510
        /* the bytecode cannot be larger than 2G. Leave some slack to 
2511
           avoid some overflows. */
2512
0
        return NULL;
2513
752
    } else {
2514
752
        return lre_realloc(opaque, ptr, size);
2515
752
    }
2516
752
}
2517
2518
/* 'buf' must be a zero terminated UTF-8 string of length buf_len.
2519
   Return NULL if error and allocate an error message in *perror_msg,
2520
   otherwise the compiled bytecode and its length in plen.
2521
*/
2522
uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
2523
                     const char *buf, size_t buf_len, int re_flags,
2524
                     void *opaque)
2525
30
{
2526
30
    REParseState s_s, *s = &s_s;
2527
30
    int register_count;
2528
30
    BOOL is_sticky;
2529
2530
30
    memset(s, 0, sizeof(*s));
2531
30
    s->opaque = opaque;
2532
30
    s->buf_ptr = (const uint8_t *)buf;
2533
30
    s->buf_end = s->buf_ptr + buf_len;
2534
30
    s->buf_start = s->buf_ptr;
2535
30
    s->re_flags = re_flags;
2536
30
    s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
2537
30
    is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
2538
30
    s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
2539
30
    s->multi_line = ((re_flags & LRE_FLAG_MULTILINE) != 0);
2540
30
    s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
2541
30
    s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0);
2542
30
    s->capture_count = 1;
2543
30
    s->total_capture_count = -1;
2544
30
    s->has_named_captures = -1;
2545
2546
30
    dbuf_init2(&s->byte_code, opaque, lre_bytecode_realloc);
2547
30
    dbuf_init2(&s->group_names, opaque, lre_realloc);
2548
2549
30
    dbuf_put_u16(&s->byte_code, re_flags); /* first element is the flags */
2550
30
    dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
2551
30
    dbuf_putc(&s->byte_code, 0); /* stack size */
2552
30
    dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
2553
2554
30
    if (!is_sticky) {
2555
        /* iterate thru all positions (about the same as .*?( ... ) )
2556
           .  We do it without an explicit loop so that lock step
2557
           thread execution will be possible in an optimized
2558
           implementation */
2559
30
        re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
2560
30
        re_emit_op(s, REOP_any);
2561
30
        re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
2562
30
    }
2563
30
    re_emit_op_u8(s, REOP_save_start, 0);
2564
2565
30
    if (re_parse_disjunction(s, FALSE)) {
2566
15
    error:
2567
15
        dbuf_free(&s->byte_code);
2568
15
        dbuf_free(&s->group_names);
2569
15
        pstrcpy(error_msg, error_msg_size, s->u.error_msg);
2570
15
        *plen = 0;
2571
15
        return NULL;
2572
15
    }
2573
2574
15
    re_emit_op_u8(s, REOP_save_end, 0);
2575
2576
15
    re_emit_op(s, REOP_match);
2577
2578
15
    if (*s->buf_ptr != '\0') {
2579
0
        re_parse_error(s, "extraneous characters at the end");
2580
0
        goto error;
2581
0
    }
2582
2583
15
    if (dbuf_error(&s->byte_code)) {
2584
0
        re_parse_out_of_memory(s);
2585
0
        goto error;
2586
0
    }
2587
2588
15
    register_count = compute_register_count(s->byte_code.buf, s->byte_code.size);
2589
15
    if (register_count < 0) {
2590
0
        re_parse_error(s, "too many imbricated quantifiers");
2591
0
        goto error;
2592
0
    }
2593
2594
15
    s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
2595
15
    s->byte_code.buf[RE_HEADER_REGISTER_COUNT] = register_count;
2596
15
    put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
2597
15
            s->byte_code.size - RE_HEADER_LEN);
2598
2599
    /* add the named groups if needed */
2600
15
    if (s->group_names.size > (s->capture_count - 1) * LRE_GROUP_NAME_TRAILER_LEN) {
2601
0
        dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
2602
0
        put_u16(s->byte_code.buf + RE_HEADER_FLAGS,
2603
0
                lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS);
2604
0
    }
2605
15
    dbuf_free(&s->group_names);
2606
2607
#ifdef DUMP_REOP
2608
    lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
2609
#endif
2610
2611
15
    error_msg[0] = '\0';
2612
15
    *plen = s->byte_code.size;
2613
15
    return s->byte_code.buf;
2614
15
}
2615
2616
static BOOL is_line_terminator(uint32_t c)
2617
768k
{
2618
768k
    return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
2619
768k
}
2620
2621
#define GET_CHAR(c, cptr, cbuf_end, cbuf_type)                          \
2622
1.56M
    do {                                                                \
2623
1.56M
        if (cbuf_type == 0) {                                           \
2624
1.56M
            c = *cptr++;                                                \
2625
1.56M
        } else {                                                        \
2626
0
            const uint16_t *_p = (const uint16_t *)cptr;                \
2627
0
            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
2628
0
            c = *_p++;                                                  \
2629
0
            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
2630
0
                if (_p < _end && is_lo_surrogate(*_p)) {                \
2631
0
                    c = from_surrogate(c, *_p++);                       \
2632
0
                }                                                       \
2633
0
            }                                                           \
2634
0
            cptr = (const void *)_p;                                    \
2635
0
        }                                                               \
2636
1.56M
    } while (0)
2637
2638
#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type)                         \
2639
0
    do {                                                                \
2640
0
        if (cbuf_type == 0) {                                           \
2641
0
            c = cptr[0];                                                \
2642
0
        } else {                                                        \
2643
0
            const uint16_t *_p = (const uint16_t *)cptr;                \
2644
0
            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
2645
0
            c = *_p++;                                                  \
2646
0
            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
2647
0
                if (_p < _end && is_lo_surrogate(*_p)) {                \
2648
0
                    c = from_surrogate(c, *_p);                         \
2649
0
                }                                                       \
2650
0
            }                                                           \
2651
0
        }                                                               \
2652
0
    } while (0)
2653
2654
#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                  \
2655
0
    do {                                                                \
2656
0
        if (cbuf_type == 0) {                                           \
2657
0
            c = cptr[-1];                                               \
2658
0
        } else {                                                        \
2659
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2660
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2661
0
            c = *_p;                                                    \
2662
0
            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
2663
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2664
0
                    c = from_surrogate(*--_p, c);                       \
2665
0
                }                                                       \
2666
0
            }                                                           \
2667
0
        }                                                               \
2668
0
    } while (0)
2669
2670
#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                   \
2671
0
    do {                                                                \
2672
0
        if (cbuf_type == 0) {                                           \
2673
0
            cptr--;                                                     \
2674
0
            c = cptr[0];                                                \
2675
0
        } else {                                                        \
2676
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2677
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2678
0
            c = *_p;                                                    \
2679
0
            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
2680
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2681
0
                    c = from_surrogate(*--_p, c);                       \
2682
0
                }                                                       \
2683
0
            }                                                           \
2684
0
            cptr = (const void *)_p;                                    \
2685
0
        }                                                               \
2686
0
    } while (0)
2687
2688
#define PREV_CHAR(cptr, cbuf_start, cbuf_type)                          \
2689
0
    do {                                                                \
2690
0
        if (cbuf_type == 0) {                                           \
2691
0
            cptr--;                                                     \
2692
0
        } else {                                                        \
2693
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2694
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2695
0
            if (is_lo_surrogate(*_p) && cbuf_type == 2) {               \
2696
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2697
0
                    --_p;                                               \
2698
0
                }                                                       \
2699
0
            }                                                           \
2700
0
            cptr = (const void *)_p;                                    \
2701
0
        }                                                               \
2702
0
    } while (0)
2703
2704
typedef enum {
2705
    RE_EXEC_STATE_SPLIT,
2706
    RE_EXEC_STATE_LOOKAHEAD,
2707
    RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
2708
} REExecStateEnum;
2709
2710
#if INTPTR_MAX >= INT64_MAX
2711
#define BP_TYPE_BITS 3
2712
#else
2713
#define BP_TYPE_BITS 2
2714
#endif
2715
2716
typedef union {
2717
    uint8_t *ptr;
2718
    intptr_t val; /* for bp, the low BP_SHIFT bits store REExecStateEnum */
2719
    struct {
2720
        uintptr_t val : sizeof(uintptr_t) * 8 - BP_TYPE_BITS;
2721
        uintptr_t type : BP_TYPE_BITS;
2722
    } bp;
2723
} StackElem;
2724
2725
typedef struct {
2726
    const uint8_t *cbuf;
2727
    const uint8_t *cbuf_end;
2728
    /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
2729
    int cbuf_type;
2730
    int capture_count;
2731
    BOOL is_unicode;
2732
    int interrupt_counter;
2733
    void *opaque; /* used for stack overflow check */
2734
2735
    StackElem *stack_buf;
2736
    size_t stack_size;
2737
    StackElem static_stack_buf[32]; /* static stack to avoid allocation in most cases */
2738
} REExecContext;
2739
2740
static int lre_poll_timeout(REExecContext *s)
2741
1.10M
{
2742
1.10M
    if (unlikely(--s->interrupt_counter <= 0)) {
2743
110
        s->interrupt_counter = INTERRUPT_COUNTER_INIT;
2744
110
        if (lre_check_timeout(s->opaque))
2745
10
            return LRE_RET_TIMEOUT;
2746
110
    }
2747
1.10M
    return 0;
2748
1.10M
}
2749
2750
static no_inline int stack_realloc(REExecContext *s, size_t n)
2751
239
{
2752
239
    StackElem *new_stack;
2753
239
    size_t new_size;
2754
239
    new_size = s->stack_size * 3 / 2;
2755
239
    if (new_size < n)
2756
7
        new_size = n;
2757
239
    if (s->stack_buf == s->static_stack_buf) {
2758
7
        new_stack = lre_realloc(s->opaque, NULL, new_size * sizeof(StackElem));
2759
7
        if (!new_stack)
2760
0
            return -1;
2761
        /* XXX: could use correct size */
2762
7
        memcpy(new_stack, s->stack_buf, s->stack_size * sizeof(StackElem));
2763
232
    } else {
2764
232
        new_stack = lre_realloc(s->opaque, s->stack_buf, new_size * sizeof(StackElem));
2765
232
        if (!new_stack)
2766
0
            return -1;
2767
232
    }
2768
239
    s->stack_size = new_size;
2769
239
    s->stack_buf = new_stack;
2770
239
    return 0;
2771
239
}
2772
2773
/* return 1 if match, 0 if not match or < 0 if error. */
2774
static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2775
                                   const uint8_t *pc, const uint8_t *cptr)
2776
15
{
2777
15
    int opcode;
2778
15
    int cbuf_type;
2779
15
    uint32_t val, c, idx;
2780
15
    const uint8_t *cbuf_end;
2781
15
    StackElem *sp, *bp, *stack_end;
2782
#ifdef DUMP_EXEC
2783
    const uint8_t *pc_start = pc; /* TEST */
2784
#endif
2785
15
    cbuf_type = s->cbuf_type;
2786
15
    cbuf_end = s->cbuf_end;
2787
2788
15
    sp = s->stack_buf;
2789
15
    bp = s->stack_buf;
2790
15
    stack_end = s->stack_buf + s->stack_size;
2791
    
2792
15
#define CHECK_STACK_SPACE(n)                            \
2793
439M
    if (unlikely((stack_end - sp) < (n))) {             \
2794
239
        size_t saved_sp = sp - s->stack_buf;            \
2795
239
        size_t saved_bp = bp - s->stack_buf;            \
2796
239
        if (stack_realloc(s, sp - s->stack_buf + (n)))  \
2797
239
            return LRE_RET_MEMORY_ERROR;                \
2798
239
        stack_end = s->stack_buf + s->stack_size;       \
2799
239
        sp = s->stack_buf + saved_sp;                   \
2800
239
        bp = s->stack_buf + saved_bp;                   \
2801
239
    }
2802
2803
    /* XXX: could test if the value was saved to reduce the stack size
2804
       but slower */
2805
15
#define SAVE_CAPTURE(idx, value)                        \
2806
427M
    {                                                   \
2807
427M
        CHECK_STACK_SPACE(2);                           \
2808
427M
        sp[0].val = idx;                                \
2809
427M
        sp[1].ptr = capture[idx];                       \
2810
427M
        sp += 2;                                        \
2811
427M
        capture[idx] = (value);                         \
2812
427M
    }
2813
2814
    /* avoid saving the previous value if already saved */
2815
15
#define SAVE_CAPTURE_CHECK(idx, value)          \
2816
4.10M
    {                                           \
2817
4.10M
        StackElem *sp1;                         \
2818
4.10M
        sp1 = sp;                               \
2819
22.0M
        for(;;) {                               \
2820
22.0M
            if (sp1 > bp) {                             \
2821
17.9M
                if (sp1[-2].val == idx)                 \
2822
17.9M
                    break;                              \
2823
17.9M
                sp1 -= 2;                               \
2824
17.9M
            } else {                                    \
2825
4.10M
                CHECK_STACK_SPACE(2);                   \
2826
4.10M
                sp[0].val = idx;                        \
2827
4.10M
                sp[1].ptr = capture[idx];               \
2828
4.10M
                sp += 2;                                \
2829
4.10M
                break;                                  \
2830
4.10M
            }                                           \
2831
22.0M
        }                                               \
2832
4.10M
        capture[idx] = (value);                         \
2833
4.10M
    }
2834
2835
2836
#ifdef DUMP_EXEC
2837
    printf("%5s %5s %5s %5s %s\n", "PC", "CP", "BP", "SP", "OPCODE");
2838
#endif    
2839
25.8M
    for(;;) {
2840
25.8M
        opcode = *pc++;
2841
#ifdef DUMP_EXEC
2842
        printf("%5ld %5ld %5ld %5ld %s\n",
2843
               pc - 1 - pc_start,
2844
               cbuf_type == 0 ? cptr - s->cbuf : (cptr - s->cbuf) / 2,
2845
               bp - s->stack_buf,
2846
               sp - s->stack_buf,
2847
               reopcode_info[opcode].name);
2848
#endif        
2849
25.8M
        switch(opcode) {
2850
5
        case REOP_match:
2851
5
            return 1;
2852
1.08M
        no_match:
2853
1.08M
            for(;;) {
2854
1.08M
                REExecStateEnum type;
2855
1.08M
                if (bp == s->stack_buf)
2856
0
                    return 0;
2857
                /* undo the modifications to capture[] */
2858
45.5M
                while (sp > bp) {
2859
44.4M
                    capture[sp[-2].val] = sp[-1].ptr;
2860
44.4M
                    sp -= 2;
2861
44.4M
                }
2862
                
2863
1.08M
                pc = sp[-3].ptr;
2864
1.08M
                cptr = sp[-2].ptr;
2865
1.08M
                type = sp[-1].bp.type;
2866
1.08M
                bp = s->stack_buf + sp[-1].bp.val;
2867
1.08M
                sp -= 3;
2868
1.08M
                if (type != RE_EXEC_STATE_LOOKAHEAD)
2869
1.08M
                    break;
2870
1.08M
            }
2871
1.08M
            if (lre_poll_timeout(s))
2872
7
                return LRE_RET_TIMEOUT;
2873
1.08M
            break;
2874
1.08M
        case REOP_lookahead_match:
2875
            /* pop all the saved states until reaching the start of
2876
               the lookahead and keep the updated captures and
2877
               variables and the corresponding undo info. */
2878
7.28k
            {
2879
7.28k
                StackElem *sp1, *sp_top, *next_sp;
2880
7.28k
                REExecStateEnum type;
2881
2882
7.28k
                sp_top = sp;
2883
7.52k
                for(;;) {
2884
7.52k
                    sp1 = sp;
2885
7.52k
                    sp = bp;
2886
7.52k
                    pc = sp[-3].ptr;
2887
7.52k
                    cptr = sp[-2].ptr;
2888
7.52k
                    type = sp[-1].bp.type;
2889
7.52k
                    bp = s->stack_buf + sp[-1].bp.val;
2890
7.52k
                    sp[-1].ptr = (void *)sp1; /* save the next value for the copy step */
2891
7.52k
                    sp -= 3;
2892
7.52k
                    if (type == RE_EXEC_STATE_LOOKAHEAD)
2893
7.28k
                        break;
2894
7.52k
                }
2895
7.28k
                if (sp != s->stack_buf) {
2896
                    /* keep the undo info if there is a saved state */
2897
7.28k
                    sp1 = sp;
2898
14.8k
                    while (sp1 < sp_top) {
2899
7.52k
                        next_sp = (void *)sp1[2].ptr;
2900
7.52k
                        sp1 += 3;
2901
7.52k
                        while (sp1 < next_sp)
2902
0
                            *sp++ = *sp1++;
2903
7.52k
                    }
2904
7.28k
                }
2905
7.28k
            }
2906
7.28k
            break;
2907
0
        case REOP_negative_lookahead_match:
2908
            /* pop all the saved states until reaching start of the negative lookahead */
2909
0
            for(;;) {
2910
0
                REExecStateEnum type;
2911
0
                type = bp[-1].bp.type;
2912
                /* undo the modifications to capture[] */
2913
0
                while (sp > bp) {
2914
0
                    capture[sp[-2].val] = sp[-1].ptr;
2915
0
                    sp -= 2;
2916
0
                }
2917
0
                pc = sp[-3].ptr;
2918
0
                cptr = sp[-2].ptr;
2919
0
                type = sp[-1].bp.type;
2920
0
                bp = s->stack_buf + sp[-1].bp.val;
2921
0
                sp -= 3;
2922
0
                if (type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD)
2923
0
                    break;
2924
0
            }
2925
0
            goto no_match;
2926
0
        case REOP_char32:
2927
0
        case REOP_char32_i:
2928
0
            val = get_u32(pc);
2929
0
            pc += 4;
2930
0
            goto test_char;
2931
726k
        case REOP_char:
2932
726k
        case REOP_char_i:
2933
726k
            val = get_u16(pc);
2934
726k
            pc += 2;
2935
726k
        test_char:
2936
726k
            if (cptr >= cbuf_end)
2937
3.70k
                goto no_match;
2938
723k
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2939
723k
            if (opcode == REOP_char_i || opcode == REOP_char32_i) {
2940
0
                c = lre_canonicalize(c, s->is_unicode);
2941
0
            }
2942
723k
            if (val != c)
2943
714k
                goto no_match;
2944
8.80k
            break;
2945
790k
        case REOP_split_goto_first:
2946
4.63M
        case REOP_split_next_first:
2947
4.63M
            {
2948
4.63M
                const uint8_t *pc1;
2949
2950
4.63M
                val = get_u32(pc);
2951
4.63M
                pc += 4;
2952
4.63M
                if (opcode == REOP_split_next_first) {
2953
3.84M
                    pc1 = pc + (int)val;
2954
3.84M
                } else {
2955
790k
                    pc1 = pc;
2956
790k
                    pc = pc + (int)val;
2957
790k
                }
2958
4.63M
                CHECK_STACK_SPACE(3);
2959
4.63M
                sp[0].ptr = (uint8_t *)pc1;
2960
4.63M
                sp[1].ptr = (uint8_t *)cptr;
2961
4.63M
                sp[2].bp.val = bp - s->stack_buf;
2962
4.63M
                sp[2].bp.type = RE_EXEC_STATE_SPLIT;
2963
4.63M
                sp += 3;
2964
4.63M
                bp = sp;
2965
4.63M
            }
2966
0
            break;
2967
7.28k
        case REOP_lookahead:
2968
7.28k
        case REOP_negative_lookahead:
2969
7.28k
            val = get_u32(pc);
2970
7.28k
            pc += 4;
2971
7.28k
            CHECK_STACK_SPACE(3);
2972
7.28k
            sp[0].ptr = (uint8_t *)(pc + (int)val);
2973
7.28k
            sp[1].ptr = (uint8_t *)cptr;
2974
7.28k
            sp[2].bp.val = bp - s->stack_buf;
2975
7.28k
            sp[2].bp.type = RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead;
2976
7.28k
            sp += 3;
2977
7.28k
            bp = sp;
2978
7.28k
            break;
2979
15.0k
        case REOP_goto:
2980
15.0k
            val = get_u32(pc);
2981
15.0k
            pc += 4 + (int)val;
2982
15.0k
            if (lre_poll_timeout(s))
2983
3
                return LRE_RET_TIMEOUT;
2984
15.0k
            break;
2985
15.0k
        case REOP_line_start:
2986
0
        case REOP_line_start_m:
2987
0
            if (cptr == s->cbuf)
2988
0
                break;
2989
0
            if (opcode == REOP_line_start)
2990
0
                goto no_match;
2991
0
            PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2992
0
            if (!is_line_terminator(c))
2993
0
                goto no_match;
2994
0
            break;
2995
0
        case REOP_line_end:
2996
0
        case REOP_line_end_m:
2997
0
            if (cptr == cbuf_end)
2998
0
                break;
2999
0
            if (opcode == REOP_line_end)
3000
0
                goto no_match;
3001
0
            PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
3002
0
            if (!is_line_terminator(c))
3003
0
                goto no_match;
3004
0
            break;
3005
768k
        case REOP_dot:
3006
768k
            if (cptr == cbuf_end)
3007
923
                goto no_match;
3008
768k
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3009
768k
            if (is_line_terminator(c))
3010
0
                goto no_match;
3011
768k
            break;
3012
768k
        case REOP_any:
3013
15.0k
            if (cptr == cbuf_end)
3014
0
                goto no_match;
3015
15.0k
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3016
15.0k
            break;
3017
0
        case REOP_space:
3018
0
            if (cptr == cbuf_end)
3019
0
                goto no_match;
3020
0
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3021
0
            if (!lre_is_space(c))
3022
0
                goto no_match;
3023
0
            break;
3024
55.1k
        case REOP_not_space:
3025
55.1k
            if (cptr == cbuf_end)
3026
0
                goto no_match;
3027
55.1k
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3028
55.1k
            if (lre_is_space(c))
3029
0
                goto no_match;
3030
55.1k
            break;
3031
4.29M
        case REOP_save_start:
3032
8.94M
        case REOP_save_end:
3033
8.94M
            val = *pc++;
3034
8.94M
            assert(val < s->capture_count);
3035
8.94M
            idx = 2 * val + opcode - REOP_save_start;
3036
8.94M
            SAVE_CAPTURE(idx, (uint8_t *)cptr);
3037
8.94M
            break;
3038
3.51M
        case REOP_save_reset:
3039
3.51M
            {
3040
3.51M
                uint32_t val2;
3041
3.51M
                val = pc[0];
3042
3.51M
                val2 = pc[1];
3043
3.51M
                pc += 2;
3044
3.51M
                assert(val2 < s->capture_count);
3045
3.51M
                CHECK_STACK_SPACE(2 * (val2 - val + 1));
3046
212M
                while (val <= val2) {
3047
209M
                    idx = 2 * val;
3048
209M
                    SAVE_CAPTURE(idx, NULL);
3049
209M
                    idx = 2 * val + 1;
3050
209M
                    SAVE_CAPTURE(idx, NULL);
3051
209M
                    val++;
3052
209M
                }
3053
3.51M
            }
3054
3.51M
            break;
3055
3.51M
        case REOP_set_i32:
3056
30
            idx = 2 * s->capture_count + pc[0];
3057
30
            val = get_u32(pc + 1);
3058
30
            pc += 5;
3059
30
            SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val);
3060
30
            break;
3061
0
        case REOP_loop:
3062
0
            {
3063
0
                uint32_t val2;
3064
0
                idx = 2 * s->capture_count + pc[0];
3065
0
                val = get_u32(pc + 1);
3066
0
                pc += 5;
3067
3068
0
                val2 = (uintptr_t)capture[idx] - 1;
3069
0
                SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
3070
0
                if (val2 != 0) {
3071
0
                    pc += (int)val;
3072
0
                    if (lre_poll_timeout(s))
3073
0
                        return LRE_RET_TIMEOUT;
3074
0
                }
3075
0
            }
3076
0
            break;
3077
0
        case REOP_loop_split_goto_first:
3078
0
        case REOP_loop_split_next_first:
3079
607k
        case REOP_loop_check_adv_split_goto_first:
3080
607k
        case REOP_loop_check_adv_split_next_first:
3081
607k
            {
3082
607k
                const uint8_t *pc1;
3083
607k
                uint32_t val2, limit;
3084
607k
                idx = 2 * s->capture_count + pc[0];
3085
607k
                limit = get_u32(pc + 1);
3086
607k
                val = get_u32(pc + 5);
3087
607k
                pc += 9;
3088
3089
                /* decrement the counter */
3090
607k
                val2 = (uintptr_t)capture[idx] - 1;
3091
607k
                SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
3092
3093
607k
                if (val2 > limit) {
3094
                    /* normal loop if counter > limit */
3095
0
                    pc += (int)val;
3096
0
                    if (lre_poll_timeout(s))
3097
0
                        return LRE_RET_TIMEOUT;
3098
607k
                } else {
3099
                    /* check advance */
3100
607k
                    if ((opcode == REOP_loop_check_adv_split_goto_first ||
3101
0
                         opcode == REOP_loop_check_adv_split_next_first) &&
3102
607k
                        capture[idx + 1] == cptr &&
3103
7.28k
                        val2 != limit) {
3104
7.28k
                        goto no_match;
3105
7.28k
                    }
3106
                    
3107
                    /* otherwise conditional split */
3108
600k
                    if (val2 != 0) {
3109
600k
                        if (opcode == REOP_loop_split_next_first ||
3110
600k
                            opcode == REOP_loop_check_adv_split_next_first) {
3111
0
                            pc1 = pc + (int)val;
3112
600k
                        } else {
3113
600k
                            pc1 = pc;
3114
600k
                            pc = pc + (int)val;
3115
600k
                        }
3116
600k
                        CHECK_STACK_SPACE(3);
3117
600k
                        sp[0].ptr = (uint8_t *)pc1;
3118
600k
                        sp[1].ptr = (uint8_t *)cptr;
3119
600k
                        sp[2].bp.val = bp - s->stack_buf;
3120
600k
                        sp[2].bp.type = RE_EXEC_STATE_SPLIT;
3121
600k
                        sp += 3;
3122
600k
                        bp = sp;
3123
600k
                    }
3124
600k
                }
3125
607k
            }
3126
600k
            break;
3127
3.50M
        case REOP_set_char_pos:
3128
3.50M
            idx = 2 * s->capture_count + pc[0];
3129
3.50M
            pc++;
3130
3.50M
            SAVE_CAPTURE_CHECK(idx, (uint8_t *)cptr);
3131
3.50M
            break;
3132
3.09M
        case REOP_check_advance:
3133
3.09M
            idx = 2 * s->capture_count + pc[0];
3134
3.09M
            pc++;
3135
3.09M
            if (capture[idx] == cptr)
3136
358k
                goto no_match;
3137
2.73M
            break;
3138
2.73M
        case REOP_word_boundary:
3139
0
        case REOP_word_boundary_i:
3140
0
        case REOP_not_word_boundary:
3141
0
        case REOP_not_word_boundary_i:
3142
0
            {
3143
0
                BOOL v1, v2;
3144
0
                int ignore_case = (opcode == REOP_word_boundary_i || opcode == REOP_not_word_boundary_i);
3145
0
                BOOL is_boundary = (opcode == REOP_word_boundary || opcode == REOP_word_boundary_i);
3146
                /* char before */
3147
0
                if (cptr == s->cbuf) {
3148
0
                    v1 = FALSE;
3149
0
                } else {
3150
0
                    PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
3151
0
                    if (c < 256) {
3152
0
                        v1 = (lre_is_word_byte(c) != 0);
3153
0
                    } else {
3154
0
                        v1 = ignore_case && (c == 0x017f || c == 0x212a);
3155
0
                    }
3156
0
                }
3157
                /* current char */
3158
0
                if (cptr >= cbuf_end) {
3159
0
                    v2 = FALSE;
3160
0
                } else {
3161
0
                    PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
3162
0
                    if (c < 256) {
3163
0
                        v2 = (lre_is_word_byte(c) != 0);
3164
0
                    } else {
3165
0
                        v2 = ignore_case && (c == 0x017f || c == 0x212a);
3166
0
                    }
3167
0
                }
3168
0
                if (v1 ^ v2 ^ is_boundary)
3169
0
                    goto no_match;
3170
0
            }
3171
0
            break;
3172
0
        case REOP_back_reference:
3173
0
        case REOP_back_reference_i:
3174
0
        case REOP_backward_back_reference:
3175
0
        case REOP_backward_back_reference_i:
3176
0
            {
3177
0
                const uint8_t *cptr1, *cptr1_end, *cptr1_start;
3178
0
                const uint8_t *pc1;
3179
0
                uint32_t c1, c2;
3180
0
                int i, n;
3181
3182
0
                n = *pc++;
3183
0
                pc1 = pc;
3184
0
                pc += n;
3185
3186
0
                for(i = 0; i < n; i++) {
3187
0
                    val = pc1[i];
3188
0
                    if (val >= s->capture_count)
3189
0
                        goto no_match;
3190
0
                    cptr1_start = capture[2 * val];
3191
0
                    cptr1_end = capture[2 * val + 1];
3192
                    /* test the first not empty capture */
3193
0
                    if (cptr1_start && cptr1_end) {
3194
0
                        if (opcode == REOP_back_reference ||
3195
0
                            opcode == REOP_back_reference_i) {
3196
0
                            cptr1 = cptr1_start;
3197
0
                            while (cptr1 < cptr1_end) {
3198
0
                                if (cptr >= cbuf_end)
3199
0
                                    goto no_match;
3200
0
                                GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
3201
0
                                GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
3202
0
                                if (opcode == REOP_back_reference_i) {
3203
0
                                    c1 = lre_canonicalize(c1, s->is_unicode);
3204
0
                                    c2 = lre_canonicalize(c2, s->is_unicode);
3205
0
                                }
3206
0
                                if (c1 != c2)
3207
0
                                    goto no_match;
3208
0
                            }
3209
0
                        } else {
3210
0
                            cptr1 = cptr1_end;
3211
0
                            while (cptr1 > cptr1_start) {
3212
0
                                if (cptr == s->cbuf)
3213
0
                                    goto no_match;
3214
0
                                GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
3215
0
                                GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
3216
0
                                if (opcode == REOP_backward_back_reference_i) {
3217
0
                                    c1 = lre_canonicalize(c1, s->is_unicode);
3218
0
                                    c2 = lre_canonicalize(c2, s->is_unicode);
3219
0
                                }
3220
0
                                if (c1 != c2)
3221
0
                                    goto no_match;
3222
0
                            }
3223
0
                        }
3224
0
                        break;
3225
0
                    }
3226
0
                }
3227
0
            }
3228
0
            break;
3229
0
        case REOP_range:
3230
0
        case REOP_range_i:
3231
0
            {
3232
0
                int n;
3233
0
                uint32_t low, high, idx_min, idx_max, idx;
3234
3235
0
                n = get_u16(pc); /* n must be >= 1 */
3236
0
                pc += 2;
3237
0
                if (cptr >= cbuf_end)
3238
0
                    goto no_match;
3239
0
                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3240
0
                if (opcode == REOP_range_i) {
3241
0
                    c = lre_canonicalize(c, s->is_unicode);
3242
0
                }
3243
0
                idx_min = 0;
3244
0
                low = get_u16(pc + 0 * 4);
3245
0
                if (c < low)
3246
0
                    goto no_match;
3247
0
                idx_max = n - 1;
3248
0
                high = get_u16(pc + idx_max * 4 + 2);
3249
                /* 0xffff in for last value means +infinity */
3250
0
                if (unlikely(c >= 0xffff) && high == 0xffff)
3251
0
                    goto range_match;
3252
0
                if (c > high)
3253
0
                    goto no_match;
3254
0
                while (idx_min <= idx_max) {
3255
0
                    idx = (idx_min + idx_max) / 2;
3256
0
                    low = get_u16(pc + idx * 4);
3257
0
                    high = get_u16(pc + idx * 4 + 2);
3258
0
                    if (c < low)
3259
0
                        idx_max = idx - 1;
3260
0
                    else if (c > high)
3261
0
                        idx_min = idx + 1;
3262
0
                    else
3263
0
                        goto range_match;
3264
0
                }
3265
0
                goto no_match;
3266
0
            range_match:
3267
0
                pc += 4 * n;
3268
0
            }
3269
0
            break;
3270
0
        case REOP_range32:
3271
0
        case REOP_range32_i:
3272
0
            {
3273
0
                int n;
3274
0
                uint32_t low, high, idx_min, idx_max, idx;
3275
3276
0
                n = get_u16(pc); /* n must be >= 1 */
3277
0
                pc += 2;
3278
0
                if (cptr >= cbuf_end)
3279
0
                    goto no_match;
3280
0
                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3281
0
                if (opcode == REOP_range32_i) {
3282
0
                    c = lre_canonicalize(c, s->is_unicode);
3283
0
                }
3284
0
                idx_min = 0;
3285
0
                low = get_u32(pc + 0 * 8);
3286
0
                if (c < low)
3287
0
                    goto no_match;
3288
0
                idx_max = n - 1;
3289
0
                high = get_u32(pc + idx_max * 8 + 4);
3290
0
                if (c > high)
3291
0
                    goto no_match;
3292
0
                while (idx_min <= idx_max) {
3293
0
                    idx = (idx_min + idx_max) / 2;
3294
0
                    low = get_u32(pc + idx * 8);
3295
0
                    high = get_u32(pc + idx * 8 + 4);
3296
0
                    if (c < low)
3297
0
                        idx_max = idx - 1;
3298
0
                    else if (c > high)
3299
0
                        idx_min = idx + 1;
3300
0
                    else
3301
0
                        goto range32_match;
3302
0
                }
3303
0
                goto no_match;
3304
0
            range32_match:
3305
0
                pc += 8 * n;
3306
0
            }
3307
0
            break;
3308
0
        case REOP_prev:
3309
            /* go to the previous char */
3310
0
            if (cptr == s->cbuf)
3311
0
                goto no_match;
3312
0
            PREV_CHAR(cptr, s->cbuf, cbuf_type);
3313
0
            break;
3314
0
        default:
3315
#ifdef DUMP_EXEC
3316
            printf("unknown opcode pc=%ld\n", pc - 1 - pc_start);
3317
#endif            
3318
0
            abort();
3319
25.8M
        }
3320
25.8M
    }
3321
15
}
3322
3323
/* Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the
3324
   starting position of the match and must be such as 0 <= cindex <=
3325
   clen. */
3326
int lre_exec(uint8_t **capture,
3327
             const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
3328
             int cbuf_type, void *opaque)
3329
15
{
3330
15
    REExecContext s_s, *s = &s_s;
3331
15
    int re_flags, i, ret;
3332
15
    const uint8_t *cptr;
3333
3334
15
    re_flags = lre_get_flags(bc_buf);
3335
15
    s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
3336
15
    s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
3337
15
    s->cbuf = cbuf;
3338
15
    s->cbuf_end = cbuf + (clen << cbuf_type);
3339
15
    s->cbuf_type = cbuf_type;
3340
15
    if (s->cbuf_type == 1 && s->is_unicode)
3341
0
        s->cbuf_type = 2;
3342
15
    s->interrupt_counter = INTERRUPT_COUNTER_INIT;
3343
15
    s->opaque = opaque;
3344
3345
15
    s->stack_buf = s->static_stack_buf;
3346
15
    s->stack_size = countof(s->static_stack_buf);
3347
3348
1.46k
    for(i = 0; i < s->capture_count * 2; i++)
3349
1.44k
        capture[i] = NULL;
3350
3351
15
    cptr = cbuf + (cindex << cbuf_type);
3352
15
    if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
3353
0
        const uint16_t *p = (const uint16_t *)cptr;
3354
0
        if (is_lo_surrogate(*p) && is_hi_surrogate(p[-1])) {
3355
0
            cptr = (const uint8_t *)(p - 1);
3356
0
        }
3357
0
    }
3358
3359
15
    ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr);
3360
3361
15
    if (s->stack_buf != s->static_stack_buf)
3362
7
        lre_realloc(s->opaque, s->stack_buf, 0);
3363
15
    return ret;
3364
15
}
3365
3366
int lre_get_alloc_count(const uint8_t *bc_buf)
3367
0
{
3368
0
    return bc_buf[RE_HEADER_CAPTURE_COUNT] * 2 +
3369
0
        bc_buf[RE_HEADER_REGISTER_COUNT];
3370
0
}
3371
3372
int lre_get_capture_count(const uint8_t *bc_buf)
3373
5
{
3374
5
    return bc_buf[RE_HEADER_CAPTURE_COUNT];
3375
5
}
3376
3377
int lre_get_flags(const uint8_t *bc_buf)
3378
15
{
3379
15
    return get_u16(bc_buf + RE_HEADER_FLAGS);
3380
15
}
3381
3382
/* Return NULL if no group names. Otherwise, return a pointer to
3383
   'capture_count - 1' zero terminated UTF-8 strings. */
3384
const char *lre_get_groupnames(const uint8_t *bc_buf)
3385
0
{
3386
0
    uint32_t re_bytecode_len;
3387
0
    if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
3388
0
        return NULL;
3389
0
    re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
3390
0
    return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
3391
0
}
3392
3393
#ifdef TEST
3394
3395
BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
3396
{
3397
    return FALSE;
3398
}
3399
3400
void *lre_realloc(void *opaque, void *ptr, size_t size)
3401
{
3402
    return realloc(ptr, size);
3403
}
3404
3405
int main(int argc, char **argv)
3406
{
3407
    int len, flags, ret, i;
3408
    uint8_t *bc;
3409
    char error_msg[64];
3410
    uint8_t *capture;
3411
    const char *input;
3412
    int input_len, capture_count;
3413
3414
    if (argc < 4) {
3415
        printf("usage: %s regexp flags input\n", argv[0]);
3416
        return 1;
3417
    }
3418
    flags = atoi(argv[2]);
3419
    bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
3420
                     strlen(argv[1]), flags, NULL);
3421
    if (!bc) {
3422
        fprintf(stderr, "error: %s\n", error_msg);
3423
        exit(1);
3424
    }
3425
3426
    input = argv[3];
3427
    input_len = strlen(input);
3428
3429
    capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc));
3430
    ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
3431
    printf("ret=%d\n", ret);
3432
    if (ret == 1) {
3433
        capture_count = lre_get_capture_count(bc);
3434
        for(i = 0; i < 2 * capture_count; i++) {
3435
            uint8_t *ptr;
3436
            ptr = capture[i];
3437
            printf("%d: ", i);
3438
            if (!ptr)
3439
                printf("<nil>");
3440
            else
3441
                printf("%u", (int)(ptr - (uint8_t *)input));
3442
            printf("\n");
3443
        }
3444
    }
3445
    free(capture);
3446
    return 0;
3447
}
3448
#endif