Coverage Report

Created: 2025-10-13 07:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/quickjs/libregexp.c
Line
Count
Source
1
/*
2
 * Regular Expression Engine
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <inttypes.h>
28
#include <string.h>
29
#include <assert.h>
30
31
#include "cutils.h"
32
#include "libregexp.h"
33
#include "libunicode.h"
34
35
/*
36
  TODO:
37
38
  - Add a lock step execution mode (=linear time execution guaranteed)
39
    when the regular expression is "simple" i.e. no backreference nor
40
    complicated lookahead. The opcodes are designed for this execution
41
    model.
42
*/
43
44
#if defined(TEST)
45
#define DUMP_REOP
46
#endif
47
48
typedef enum {
49
#define DEF(id, size) REOP_ ## id,
50
#include "libregexp-opcode.h"
51
#undef DEF
52
    REOP_COUNT,
53
} REOPCodeEnum;
54
55
120k
#define CAPTURE_COUNT_MAX 255
56
6.82k
#define STACK_SIZE_MAX 255
57
/* must be large enough to have a negligible runtime cost and small
58
   enough to call the interrupt callback often. */
59
4.69M
#define INTERRUPT_COUNTER_INIT 10000
60
61
/* unicode code points */
62
8.84M
#define CP_LS   0x2028
63
2.90M
#define CP_PS   0x2029
64
65
#define TMP_BUF_SIZE 128
66
67
typedef struct {
68
    DynBuf byte_code;
69
    const uint8_t *buf_ptr;
70
    const uint8_t *buf_end;
71
    const uint8_t *buf_start;
72
    int re_flags;
73
    BOOL is_unicode;
74
    BOOL unicode_sets; /* if set, is_unicode is also set */
75
    BOOL ignore_case;
76
    BOOL multi_line;
77
    BOOL dotall;
78
    int capture_count;
79
    int total_capture_count; /* -1 = not computed yet */
80
    int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
81
    void *opaque;
82
    DynBuf group_names;
83
    union {
84
        char error_msg[TMP_BUF_SIZE];
85
        char tmp_buf[TMP_BUF_SIZE];
86
    } u;
87
} REParseState;
88
89
typedef struct {
90
#ifdef DUMP_REOP
91
    const char *name;
92
#endif
93
    uint8_t size;
94
} REOpCode;
95
96
static const REOpCode reopcode_info[REOP_COUNT] = {
97
#ifdef DUMP_REOP
98
#define DEF(id, size) { #id, size },
99
#else
100
#define DEF(id, size) { size },
101
#endif
102
#include "libregexp-opcode.h"
103
#undef DEF
104
};
105
106
10.4M
#define RE_HEADER_FLAGS         0
107
7.81M
#define RE_HEADER_CAPTURE_COUNT 2
108
4.73M
#define RE_HEADER_STACK_SIZE    3
109
38.4k
#define RE_HEADER_BYTECODE_LEN  4
110
111
4.80M
#define RE_HEADER_LEN 8
112
113
20.2k
static inline int is_digit(int c) {
114
20.2k
    return c >= '0' && c <= '9';
115
20.2k
}
116
117
/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
118
static int dbuf_insert(DynBuf *s, int pos, int len)
119
15.0k
{
120
15.0k
    if (dbuf_realloc(s, s->size + len))
121
5
        return -1;
122
15.0k
    memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
123
15.0k
    s->size += len;
124
15.0k
    return 0;
125
15.0k
}
126
127
typedef struct REString {
128
    struct REString *next;
129
    uint32_t hash;
130
    uint32_t len;
131
    uint32_t buf[];
132
} REString;
133
134
typedef struct {
135
    /* the string list is the union of 'char_range' and of the strings
136
       in hash_table[]. The strings in hash_table[] have a length !=
137
       1. */
138
    CharRange cr;
139
    uint32_t n_strings;
140
    uint32_t hash_size;
141
    int hash_bits;
142
    REString **hash_table;
143
} REStringList;
144
145
static uint32_t re_string_hash(int len, const uint32_t *buf)
146
0
{
147
0
    int i;
148
0
    uint32_t h;
149
0
    h = 1;
150
0
    for(i = 0; i < len; i++)
151
0
        h = h * 263 + buf[i];
152
0
    return h * 0x61C88647;
153
0
}
154
155
static void re_string_list_init(REParseState *s1, REStringList *s)
156
7.43k
{
157
7.43k
    cr_init(&s->cr, s1->opaque, lre_realloc);
158
7.43k
    s->n_strings = 0;
159
7.43k
    s->hash_size = 0;
160
7.43k
    s->hash_bits = 0;
161
7.43k
    s->hash_table = NULL;
162
7.43k
}
163
164
static void re_string_list_free(REStringList *s)
165
7.43k
{
166
7.43k
    REString *p, *p_next;
167
7.43k
    int i;
168
7.43k
    for(i = 0; i < s->hash_size; i++) {
169
0
        for(p = s->hash_table[i]; p != NULL; p = p_next) {
170
0
            p_next = p->next;
171
0
            lre_realloc(s->cr.mem_opaque, p, 0);
172
0
        }
173
0
    }
174
7.43k
    lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
175
176
7.43k
    cr_free(&s->cr);
177
7.43k
}
178
179
static void lre_print_char(int c, BOOL is_range)
180
0
{
181
0
    if (c == '\'' || c == '\\' ||
182
0
        (is_range && (c == '-' || c == ']'))) {
183
0
        printf("\\%c", c);
184
0
    } else if (c >= ' ' && c <= 126) {
185
0
        printf("%c", c);
186
0
    } else {
187
0
        printf("\\u{%04x}", c);
188
0
    }
189
0
}
190
191
static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s)
192
0
{
193
0
    REString *p;
194
0
    const CharRange *cr;
195
0
    int i, j, k;
196
0
197
0
    printf("%s:\n", str);
198
0
    printf("  ranges: [");
199
0
    cr = &s->cr;
200
0
    for(i = 0; i < cr->len; i += 2) {
201
0
        lre_print_char(cr->points[i], TRUE);
202
0
        if (cr->points[i] != cr->points[i + 1] - 1) {
203
0
            printf("-");
204
0
            lre_print_char(cr->points[i + 1] - 1, TRUE);
205
0
        }
206
0
    }
207
0
    printf("]\n");
208
0
    
209
0
    j = 0;
210
0
    for(i = 0; i < s->hash_size; i++) {
211
0
        for(p = s->hash_table[i]; p != NULL; p = p->next) {
212
0
            printf("  %d/%d: '", j, s->n_strings);
213
0
            for(k = 0; k < p->len; k++) {
214
0
                lre_print_char(p->buf[k], FALSE);
215
0
            }
216
0
            printf("'\n");
217
0
            j++;
218
0
        }
219
0
    }
220
0
}
221
222
static int re_string_find2(REStringList *s, int len, const uint32_t *buf,
223
                           uint32_t h0, BOOL add_flag)
224
0
{
225
0
    uint32_t h = 0; /* avoid warning */
226
0
    REString *p;
227
0
    if (s->n_strings != 0) {
228
0
        h = h0 >> (32 - s->hash_bits);
229
0
        for(p = s->hash_table[h]; p != NULL; p = p->next) {
230
0
            if (p->hash == h0 && p->len == len &&
231
0
                !memcmp(p->buf, buf, len * sizeof(buf[0]))) {
232
0
                return 1;
233
0
            }
234
0
        }
235
0
    }
236
    /* not found */
237
0
    if (!add_flag)
238
0
        return 0;
239
    /* increase the size of the hash table if needed */
240
0
    if (unlikely((s->n_strings + 1) > s->hash_size)) {
241
0
        REString **new_hash_table, *p_next;
242
0
        int new_hash_bits, i;
243
0
        uint32_t new_hash_size;
244
0
        new_hash_bits = max_int(s->hash_bits + 1, 4);
245
0
        new_hash_size = 1 << new_hash_bits;
246
0
        new_hash_table = lre_realloc(s->cr.mem_opaque, NULL,
247
0
                                     sizeof(new_hash_table[0]) * new_hash_size);
248
0
        if (!new_hash_table)
249
0
            return -1;
250
0
        memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size);
251
0
        for(i = 0; i < s->hash_size; i++) {
252
0
            for(p = s->hash_table[i]; p != NULL; p = p_next) {
253
0
                p_next = p->next;
254
0
                h = p->hash >> (32 - new_hash_bits);
255
0
                p->next = new_hash_table[h];
256
0
                new_hash_table[h] = p;
257
0
            }
258
0
        }
259
0
        lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
260
0
        s->hash_bits = new_hash_bits;
261
0
        s->hash_size = new_hash_size;
262
0
        s->hash_table = new_hash_table;
263
0
        h = h0 >> (32 - s->hash_bits);
264
0
    }
265
266
0
    p = lre_realloc(s->cr.mem_opaque, NULL, sizeof(REString) + len * sizeof(buf[0]));
267
0
    if (!p)
268
0
        return -1;
269
0
    p->next = s->hash_table[h];
270
0
    s->hash_table[h] = p;
271
0
    s->n_strings++;
272
0
    p->hash = h0;
273
0
    p->len = len;
274
0
    memcpy(p->buf, buf, sizeof(buf[0]) * len);
275
0
    return 1;
276
0
}
277
278
static int re_string_find(REStringList *s, int len, const uint32_t *buf,
279
                          BOOL add_flag)
280
0
{
281
0
    uint32_t h0;
282
0
    h0 = re_string_hash(len, buf);
283
0
    return re_string_find2(s, len, buf, h0, add_flag);
284
0
}
285
286
/* return -1 if memory error, 0 if OK */
287
static int re_string_add(REStringList *s, int len, const uint32_t *buf)
288
0
{
289
0
    if (len == 1) {
290
0
        return cr_union_interval(&s->cr, buf[0], buf[0]);
291
0
    }
292
0
    if (re_string_find(s, len, buf, TRUE) < 0)
293
0
        return -1;
294
0
    return 0;
295
0
}
296
297
/* a = a op b */
298
static int re_string_list_op(REStringList *a, REStringList *b, int op)
299
1.33k
{
300
1.33k
    int i, ret;
301
1.33k
    REString *p, **pp;
302
303
1.33k
    if (cr_op1(&a->cr, b->cr.points, b->cr.len, op))
304
0
        return -1;
305
306
1.33k
    switch(op) {
307
1.33k
    case CR_OP_UNION:
308
1.33k
        if (b->n_strings != 0) {
309
0
            for(i = 0; i < b->hash_size; i++) {
310
0
                for(p = b->hash_table[i]; p != NULL; p = p->next) {
311
0
                    if (re_string_find2(a, p->len, p->buf, p->hash, TRUE) < 0)
312
0
                        return -1;
313
0
                }
314
0
            }
315
0
        }
316
1.33k
        break;
317
1.33k
    case CR_OP_INTER:
318
0
    case CR_OP_SUB:
319
0
        for(i = 0; i < a->hash_size; i++) {
320
0
            pp = &a->hash_table[i];
321
0
            for(;;) {
322
0
                p = *pp;
323
0
                if (p == NULL)
324
0
                    break;
325
0
                ret = re_string_find2(b, p->len, p->buf, p->hash, FALSE);
326
0
                if (op == CR_OP_SUB)
327
0
                    ret = !ret;
328
0
                if (!ret) {
329
                    /* remove it */
330
0
                    *pp = p->next;
331
0
                    a->n_strings--;
332
0
                    lre_realloc(a->cr.mem_opaque, p, 0);
333
0
                } else {
334
                    /* keep it */
335
0
                    pp = &p->next;
336
0
                }
337
0
            }
338
0
        }
339
0
        break;
340
0
    default:
341
0
        abort();
342
1.33k
    }
343
1.33k
    return 0;
344
1.33k
}
345
346
static int re_string_list_canonicalize(REParseState *s1,
347
                                       REStringList *s, BOOL is_unicode)
348
0
{
349
0
    if (cr_regexp_canonicalize(&s->cr, is_unicode))
350
0
        return -1;
351
0
    if (s->n_strings != 0) {
352
0
        REStringList a_s, *a = &a_s;
353
0
        int i, j;
354
0
        REString *p;
355
        
356
        /* XXX: simplify */
357
0
        re_string_list_init(s1, a);
358
359
0
        a->n_strings = s->n_strings;
360
0
        a->hash_size = s->hash_size;
361
0
        a->hash_bits = s->hash_bits;
362
0
        a->hash_table = s->hash_table;
363
        
364
0
        s->n_strings = 0;
365
0
        s->hash_size = 0;
366
0
        s->hash_bits = 0;
367
0
        s->hash_table = NULL;
368
369
0
        for(i = 0; i < a->hash_size; i++) {
370
0
            for(p = a->hash_table[i]; p != NULL; p = p->next) {
371
0
                for(j = 0; j < p->len; j++) {
372
0
                    p->buf[j] = lre_canonicalize(p->buf[j], is_unicode);
373
0
                }
374
0
                if (re_string_add(s, p->len, p->buf)) {
375
0
                    re_string_list_free(a);
376
0
                    return -1;
377
0
                }
378
0
            }
379
0
        }
380
0
        re_string_list_free(a);
381
0
    }
382
0
    return 0;
383
0
}
384
385
static const uint16_t char_range_d[] = {
386
    1,
387
    0x0030, 0x0039 + 1,
388
};
389
390
/* code point ranges for Zs,Zl or Zp property */
391
static const uint16_t char_range_s[] = {
392
    10,
393
    0x0009, 0x000D + 1,
394
    0x0020, 0x0020 + 1,
395
    0x00A0, 0x00A0 + 1,
396
    0x1680, 0x1680 + 1,
397
    0x2000, 0x200A + 1,
398
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
399
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
400
    0x2028, 0x2029 + 1,
401
    0x202F, 0x202F + 1,
402
    0x205F, 0x205F + 1,
403
    0x3000, 0x3000 + 1,
404
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
405
    0xFEFF, 0xFEFF + 1,
406
};
407
408
static const uint16_t char_range_w[] = {
409
    4,
410
    0x0030, 0x0039 + 1,
411
    0x0041, 0x005A + 1,
412
    0x005F, 0x005F + 1,
413
    0x0061, 0x007A + 1,
414
};
415
416
442k
#define CLASS_RANGE_BASE 0x40000000
417
418
typedef enum {
419
    CHAR_RANGE_d,
420
    CHAR_RANGE_D,
421
    CHAR_RANGE_s,
422
    CHAR_RANGE_S,
423
    CHAR_RANGE_w,
424
    CHAR_RANGE_W,
425
} CharRangeEnum;
426
427
static const uint16_t * const char_range_table[] = {
428
    char_range_d,
429
    char_range_s,
430
    char_range_w,
431
};
432
433
static int cr_init_char_range(REParseState *s, REStringList *cr, uint32_t c)
434
3.93k
{
435
3.93k
    BOOL invert;
436
3.93k
    const uint16_t *c_pt;
437
3.93k
    int len, i;
438
439
3.93k
    invert = c & 1;
440
3.93k
    c_pt = char_range_table[c >> 1];
441
3.93k
    len = *c_pt++;
442
3.93k
    re_string_list_init(s, cr);
443
51.1k
    for(i = 0; i < len * 2; i++) {
444
47.1k
        if (cr_add_point(&cr->cr, c_pt[i]))
445
0
            goto fail;
446
47.1k
    }
447
3.93k
    if (invert) {
448
1.83k
        if (cr_invert(&cr->cr))
449
0
            goto fail;
450
1.83k
    }
451
3.93k
    return 0;
452
0
 fail:
453
0
    re_string_list_free(cr);
454
0
    return -1;
455
3.93k
}
456
457
#ifdef DUMP_REOP
458
static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
459
                                                     int buf_len)
460
{
461
    int pos, len, opcode, bc_len, re_flags, i;
462
    uint32_t val;
463
464
    assert(buf_len >= RE_HEADER_LEN);
465
466
    re_flags = lre_get_flags(buf);
467
    bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
468
    assert(bc_len + RE_HEADER_LEN <= buf_len);
469
    printf("flags: 0x%x capture_count=%d stack_size=%d\n",
470
           re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
471
    if (re_flags & LRE_FLAG_NAMED_GROUPS) {
472
        const char *p;
473
        p = (char *)buf + RE_HEADER_LEN + bc_len;
474
        printf("named groups: ");
475
        for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
476
            if (i != 1)
477
                printf(",");
478
            printf("<%s>", p);
479
            p += strlen(p) + 1;
480
        }
481
        printf("\n");
482
        assert(p == (char *)(buf + buf_len));
483
    }
484
    printf("bytecode_len=%d\n", bc_len);
485
486
    buf += RE_HEADER_LEN;
487
    pos = 0;
488
    while (pos < bc_len) {
489
        printf("%5u: ", pos);
490
        opcode = buf[pos];
491
        len = reopcode_info[opcode].size;
492
        if (opcode >= REOP_COUNT) {
493
            printf(" invalid opcode=0x%02x\n", opcode);
494
            break;
495
        }
496
        if ((pos + len) > bc_len) {
497
            printf(" buffer overflow (opcode=0x%02x)\n", opcode);
498
            break;
499
        }
500
        printf("%s", reopcode_info[opcode].name);
501
        switch(opcode) {
502
        case REOP_char:
503
        case REOP_char_i:
504
            val = get_u16(buf + pos + 1);
505
            if (val >= ' ' && val <= 126)
506
                printf(" '%c'", val);
507
            else
508
                printf(" 0x%04x", val);
509
            break;
510
        case REOP_char32:
511
        case REOP_char32_i:
512
            val = get_u32(buf + pos + 1);
513
            if (val >= ' ' && val <= 126)
514
                printf(" '%c'", val);
515
            else
516
                printf(" 0x%08x", val);
517
            break;
518
        case REOP_goto:
519
        case REOP_split_goto_first:
520
        case REOP_split_next_first:
521
        case REOP_loop:
522
        case REOP_lookahead:
523
        case REOP_negative_lookahead:
524
            val = get_u32(buf + pos + 1);
525
            val += (pos + 5);
526
            printf(" %u", val);
527
            break;
528
        case REOP_simple_greedy_quant:
529
            printf(" %u %u %u %u",
530
                   get_u32(buf + pos + 1) + (pos + 17),
531
                   get_u32(buf + pos + 1 + 4),
532
                   get_u32(buf + pos + 1 + 8),
533
                   get_u32(buf + pos + 1 + 12));
534
            break;
535
        case REOP_save_start:
536
        case REOP_save_end:
537
        case REOP_back_reference:
538
        case REOP_back_reference_i:
539
        case REOP_backward_back_reference:
540
        case REOP_backward_back_reference_i:
541
            printf(" %u", buf[pos + 1]);
542
            break;
543
        case REOP_save_reset:
544
            printf(" %u %u", buf[pos + 1], buf[pos + 2]);
545
            break;
546
        case REOP_push_i32:
547
            val = get_u32(buf + pos + 1);
548
            printf(" %d", val);
549
            break;
550
        case REOP_range:
551
        case REOP_range_i:
552
            {
553
                int n, i;
554
                n = get_u16(buf + pos + 1);
555
                len += n * 4;
556
                for(i = 0; i < n * 2; i++) {
557
                    val = get_u16(buf + pos + 3 + i * 2);
558
                    printf(" 0x%04x", val);
559
                }
560
            }
561
            break;
562
        case REOP_range32:
563
        case REOP_range32_i:
564
            {
565
                int n, i;
566
                n = get_u16(buf + pos + 1);
567
                len += n * 8;
568
                for(i = 0; i < n * 2; i++) {
569
                    val = get_u32(buf + pos + 3 + i * 4);
570
                    printf(" 0x%08x", val);
571
                }
572
            }
573
            break;
574
        default:
575
            break;
576
        }
577
        printf("\n");
578
        pos += len;
579
    }
580
}
581
#endif
582
583
static void re_emit_op(REParseState *s, int op)
584
116k
{
585
116k
    dbuf_putc(&s->byte_code, op);
586
116k
}
587
588
/* return the offset of the u32 value */
589
static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
590
99.7k
{
591
99.7k
    int pos;
592
99.7k
    dbuf_putc(&s->byte_code, op);
593
99.7k
    pos = s->byte_code.size;
594
99.7k
    dbuf_put_u32(&s->byte_code, val);
595
99.7k
    return pos;
596
99.7k
}
597
598
static int re_emit_goto(REParseState *s, int op, uint32_t val)
599
10.0k
{
600
10.0k
    int pos;
601
10.0k
    dbuf_putc(&s->byte_code, op);
602
10.0k
    pos = s->byte_code.size;
603
10.0k
    dbuf_put_u32(&s->byte_code, val - (pos + 4));
604
10.0k
    return pos;
605
10.0k
}
606
607
static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
608
138k
{
609
138k
    dbuf_putc(&s->byte_code, op);
610
138k
    dbuf_putc(&s->byte_code, val);
611
138k
}
612
613
static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
614
335k
{
615
335k
    dbuf_putc(&s->byte_code, op);
616
335k
    dbuf_put_u16(&s->byte_code, val);
617
335k
}
618
619
static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
620
3.04k
{
621
3.04k
    va_list ap;
622
3.04k
    va_start(ap, fmt);
623
3.04k
    vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
624
3.04k
    va_end(ap);
625
3.04k
    return -1;
626
3.04k
}
627
628
static int re_parse_out_of_memory(REParseState *s)
629
25
{
630
25
    return re_parse_error(s, "out of memory");
631
25
}
632
633
/* If allow_overflow is false, return -1 in case of
634
   overflow. Otherwise return INT32_MAX. */
635
static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
636
15.2k
{
637
15.2k
    const uint8_t *p;
638
15.2k
    uint64_t v;
639
15.2k
    int c;
640
641
15.2k
    p = *pp;
642
15.2k
    v = 0;
643
39.9k
    for(;;) {
644
39.9k
        c = *p;
645
39.9k
        if (c < '0' || c > '9')
646
15.0k
            break;
647
24.8k
        v = v * 10 + c - '0';
648
24.8k
        if (v >= INT32_MAX) {
649
1.69k
            if (allow_overflow)
650
1.49k
                v = INT32_MAX;
651
196
            else
652
196
                return -1;
653
1.69k
        }
654
24.7k
        p++;
655
24.7k
    }
656
15.0k
    *pp = p;
657
15.0k
    return v;
658
15.2k
}
659
660
static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
661
21.2k
{
662
21.2k
    const uint8_t *p;
663
21.2k
    p = *pp;
664
21.2k
    if (*p != c)
665
707
        return re_parse_error(s, "expecting '%c'", c);
666
20.5k
    p++;
667
20.5k
    *pp = p;
668
20.5k
    return 0;
669
21.2k
}
670
671
/* Parse an escape sequence, *pp points after the '\':
672
   allow_utf16 value:
673
   0 : no UTF-16 escapes allowed
674
   1 : UTF-16 escapes allowed
675
   2 : UTF-16 escapes allowed and escapes of surrogate pairs are
676
   converted to a unicode character (unicode regexp case).
677
678
   Return the unicode char and update *pp if recognized,
679
   return -1 if malformed escape,
680
   return -2 otherwise. */
681
int lre_parse_escape(const uint8_t **pp, int allow_utf16)
682
86.0k
{
683
86.0k
    const uint8_t *p;
684
86.0k
    uint32_t c;
685
686
86.0k
    p = *pp;
687
86.0k
    c = *p++;
688
86.0k
    switch(c) {
689
2.97k
    case 'b':
690
2.97k
        c = '\b';
691
2.97k
        break;
692
248
    case 'f':
693
248
        c = '\f';
694
248
        break;
695
430
    case 'n':
696
430
        c = '\n';
697
430
        break;
698
350
    case 'r':
699
350
        c = '\r';
700
350
        break;
701
206
    case 't':
702
206
        c = '\t';
703
206
        break;
704
834
    case 'v':
705
834
        c = '\v';
706
834
        break;
707
479
    case 'x':
708
14.1k
    case 'u':
709
14.1k
        {
710
14.1k
            int h, n, i;
711
14.1k
            uint32_t c1;
712
713
14.1k
            if (*p == '{' && allow_utf16) {
714
1.34k
                p++;
715
1.34k
                c = 0;
716
3.15k
                for(;;) {
717
3.15k
                    h = from_hex(*p++);
718
3.15k
                    if (h < 0)
719
900
                        return -1;
720
2.25k
                    c = (c << 4) | h;
721
2.25k
                    if (c > 0x10FFFF)
722
241
                        return -1;
723
2.00k
                    if (*p == '}')
724
202
                        break;
725
2.00k
                }
726
202
                p++;
727
12.8k
            } else {
728
12.8k
                if (c == 'x') {
729
479
                    n = 2;
730
12.3k
                } else {
731
12.3k
                    n = 4;
732
12.3k
                }
733
734
12.8k
                c = 0;
735
51.3k
                for(i = 0; i < n; i++) {
736
43.2k
                    h = from_hex(*p++);
737
43.2k
                    if (h < 0) {
738
4.71k
                        return -1;
739
4.71k
                    }
740
38.5k
                    c = (c << 4) | h;
741
38.5k
                }
742
8.13k
                if (is_hi_surrogate(c) &&
743
5.97k
                    allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
744
                    /* convert an escaped surrogate pair into a
745
                       unicode char */
746
4.97k
                    c1 = 0;
747
15.1k
                    for(i = 0; i < 4; i++) {
748
14.2k
                        h = from_hex(p[2 + i]);
749
14.2k
                        if (h < 0)
750
4.07k
                            break;
751
10.1k
                        c1 = (c1 << 4) | h;
752
10.1k
                    }
753
4.97k
                    if (i == 4 && is_lo_surrogate(c1)) {
754
429
                        p += 6;
755
429
                        c = from_surrogate(c, c1);
756
429
                    }
757
4.97k
                }
758
8.13k
            }
759
14.1k
        }
760
8.33k
        break;
761
8.33k
    case '0': case '1': case '2': case '3':
762
3.55k
    case '4': case '5': case '6': case '7':
763
3.55k
        c -= '0';
764
3.55k
        if (allow_utf16 == 2) {
765
            /* only accept \0 not followed by digit */
766
0
            if (c != 0 || is_digit(*p))
767
0
                return -1;
768
3.55k
        } else {
769
            /* parse a legacy octal sequence */
770
3.55k
            uint32_t v;
771
3.55k
            v = *p - '0';
772
3.55k
            if (v > 7)
773
2.93k
                break;
774
628
            c = (c << 3) | v;
775
628
            p++;
776
628
            if (c >= 32)
777
214
                break;
778
414
            v = *p - '0';
779
414
            if (v > 7)
780
216
                break;
781
198
            c = (c << 3) | v;
782
198
            p++;
783
198
        }
784
198
        break;
785
63.2k
    default:
786
63.2k
        return -2;
787
86.0k
    }
788
16.9k
    *pp = p;
789
16.9k
    return c;
790
86.0k
}
791
792
#ifdef CONFIG_ALL_UNICODE
793
/* XXX: we use the same chars for name and value */
794
static BOOL is_unicode_char(int c)
795
0
{
796
0
    return ((c >= '0' && c <= '9') ||
797
0
            (c >= 'A' && c <= 'Z') ||
798
0
            (c >= 'a' && c <= 'z') ||
799
0
            (c == '_'));
800
0
}
801
802
/* XXX: memory error test */
803
static void seq_prop_cb(void *opaque, const uint32_t *seq, int seq_len)
804
0
{
805
0
    REStringList *sl = opaque;
806
0
    re_string_add(sl, seq_len, seq);
807
0
}
808
809
static int parse_unicode_property(REParseState *s, REStringList *cr,
810
                                  const uint8_t **pp, BOOL is_inv,
811
                                  BOOL allow_sequence_prop)
812
0
{
813
0
    const uint8_t *p;
814
0
    char name[64], value[64];
815
0
    char *q;
816
0
    BOOL script_ext;
817
0
    int ret;
818
819
0
    p = *pp;
820
0
    if (*p != '{')
821
0
        return re_parse_error(s, "expecting '{' after \\p");
822
0
    p++;
823
0
    q = name;
824
0
    while (is_unicode_char(*p)) {
825
0
        if ((q - name) >= sizeof(name) - 1)
826
0
            goto unknown_property_name;
827
0
        *q++ = *p++;
828
0
    }
829
0
    *q = '\0';
830
0
    q = value;
831
0
    if (*p == '=') {
832
0
        p++;
833
0
        while (is_unicode_char(*p)) {
834
0
            if ((q - value) >= sizeof(value) - 1)
835
0
                return re_parse_error(s, "unknown unicode property value");
836
0
            *q++ = *p++;
837
0
        }
838
0
    }
839
0
    *q = '\0';
840
0
    if (*p != '}')
841
0
        return re_parse_error(s, "expecting '}'");
842
0
    p++;
843
    //    printf("name=%s value=%s\n", name, value);
844
845
0
    if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
846
0
        script_ext = FALSE;
847
0
        goto do_script;
848
0
    } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
849
0
        script_ext = TRUE;
850
0
    do_script:
851
0
        re_string_list_init(s, cr);
852
0
        ret = unicode_script(&cr->cr, value, script_ext);
853
0
        if (ret) {
854
0
            re_string_list_free(cr);
855
0
            if (ret == -2)
856
0
                return re_parse_error(s, "unknown unicode script");
857
0
            else
858
0
                goto out_of_memory;
859
0
        }
860
0
    } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
861
0
        re_string_list_init(s, cr);
862
0
        ret = unicode_general_category(&cr->cr, value);
863
0
        if (ret) {
864
0
            re_string_list_free(cr);
865
0
            if (ret == -2)
866
0
                return re_parse_error(s, "unknown unicode general category");
867
0
            else
868
0
                goto out_of_memory;
869
0
        }
870
0
    } else if (value[0] == '\0') {
871
0
        re_string_list_init(s, cr);
872
0
        ret = unicode_general_category(&cr->cr, name);
873
0
        if (ret == -1) {
874
0
            re_string_list_free(cr);
875
0
            goto out_of_memory;
876
0
        }
877
0
        if (ret < 0) {
878
0
            ret = unicode_prop(&cr->cr, name);
879
0
            if (ret == -1) {
880
0
                re_string_list_free(cr);
881
0
                goto out_of_memory;
882
0
            }
883
0
        }
884
0
        if (ret < 0 && !is_inv && allow_sequence_prop) {
885
0
            CharRange cr_tmp;
886
0
            cr_init(&cr_tmp, s->opaque, lre_realloc);
887
0
            ret = unicode_sequence_prop(name, seq_prop_cb, cr, &cr_tmp);
888
0
            cr_free(&cr_tmp);
889
0
            if (ret == -1) {
890
0
                re_string_list_free(cr);
891
0
                goto out_of_memory;
892
0
            }
893
0
        }
894
0
        if (ret < 0)
895
0
            goto unknown_property_name;
896
0
    } else {
897
0
    unknown_property_name:
898
0
        return re_parse_error(s, "unknown unicode property name");
899
0
    }
900
901
    /* the ordering of case folding and inversion  differs with
902
       unicode_sets. 'unicode_sets' ordering is more consistent */
903
    /* XXX: the spec seems incorrect, we do it as the other engines
904
       seem to do it. */
905
0
    if (s->ignore_case && s->unicode_sets) {
906
0
        if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
907
0
            re_string_list_free(cr);
908
0
            goto out_of_memory;
909
0
        }
910
0
    }
911
0
    if (is_inv) {
912
0
        if (cr_invert(&cr->cr)) {
913
0
            re_string_list_free(cr);
914
0
            goto out_of_memory;
915
0
        }
916
0
    }
917
0
    if (s->ignore_case && !s->unicode_sets) {
918
0
        if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
919
0
            re_string_list_free(cr);
920
0
            goto out_of_memory;
921
0
        }
922
0
    }
923
0
    *pp = p;
924
0
    return 0;
925
0
 out_of_memory:
926
0
    return re_parse_out_of_memory(s);
927
0
}
928
#endif /* CONFIG_ALL_UNICODE */
929
930
static int get_class_atom(REParseState *s, REStringList *cr,
931
                          const uint8_t **pp, BOOL inclass);
932
933
static int parse_class_string_disjunction(REParseState *s, REStringList *cr,
934
                                          const uint8_t **pp)
935
0
{
936
0
    const uint8_t *p;
937
0
    DynBuf str;
938
0
    int c;
939
    
940
0
    p = *pp;
941
0
    if (*p != '{')
942
0
        return re_parse_error(s, "expecting '{' after \\q");
943
944
0
    dbuf_init2(&str, s->opaque, lre_realloc);
945
0
    re_string_list_init(s, cr);
946
    
947
0
    p++;
948
0
    for(;;) {
949
0
        str.size = 0;
950
0
        while (*p != '}' && *p != '|') {
951
0
            c = get_class_atom(s, NULL, &p, FALSE);
952
0
            if (c < 0)
953
0
                goto fail;
954
0
            if (dbuf_put_u32(&str, c)) {
955
0
                re_parse_out_of_memory(s);
956
0
                goto fail;
957
0
            }
958
0
        }
959
0
        if (re_string_add(cr, str.size / 4, (uint32_t *)str.buf)) {
960
0
            re_parse_out_of_memory(s);
961
0
            goto fail;
962
0
        }
963
0
        if (*p == '}')
964
0
            break;
965
0
        p++;
966
0
    }
967
0
    if (s->ignore_case) {
968
0
        if (re_string_list_canonicalize(s, cr, TRUE))
969
0
            goto fail;
970
0
    }
971
0
    p++; /* skip the '}' */
972
0
    dbuf_free(&str);
973
0
    *pp = p;
974
0
    return 0;
975
0
 fail:
976
0
    dbuf_free(&str);
977
0
    re_string_list_free(cr);
978
0
    return -1;
979
0
}
980
981
/* return -1 if error otherwise the character or a class range
982
   (CLASS_RANGE_BASE) if cr != NULL. In case of class range, 'cr' is
983
   initialized. Otherwise, it is ignored. */
984
static int get_class_atom(REParseState *s, REStringList *cr,
985
                          const uint8_t **pp, BOOL inclass)
986
434k
{
987
434k
    const uint8_t *p;
988
434k
    uint32_t c;
989
434k
    int ret;
990
991
434k
    p = *pp;
992
993
434k
    c = *p;
994
434k
    switch(c) {
995
22.9k
    case '\\':
996
22.9k
        p++;
997
22.9k
        if (p >= s->buf_end)
998
100
            goto unexpected_end;
999
22.8k
        c = *p++;
1000
22.8k
        switch(c) {
1001
535
        case 'd':
1002
535
            c = CHAR_RANGE_d;
1003
535
            goto class_range;
1004
406
        case 'D':
1005
406
            c = CHAR_RANGE_D;
1006
406
            goto class_range;
1007
914
        case 's':
1008
914
            c = CHAR_RANGE_s;
1009
914
            goto class_range;
1010
866
        case 'S':
1011
866
            c = CHAR_RANGE_S;
1012
866
            goto class_range;
1013
654
        case 'w':
1014
654
            c = CHAR_RANGE_w;
1015
654
            goto class_range;
1016
560
        case 'W':
1017
560
            c = CHAR_RANGE_W;
1018
3.93k
        class_range:
1019
3.93k
            if (!cr)
1020
0
                goto default_escape;
1021
3.93k
            if (cr_init_char_range(s, cr, c))
1022
0
                return -1;
1023
3.93k
            c = CLASS_RANGE_BASE;
1024
3.93k
            break;
1025
2.44k
        case 'c':
1026
2.44k
            c = *p;
1027
2.44k
            if ((c >= 'a' && c <= 'z') ||
1028
2.24k
                (c >= 'A' && c <= 'Z') ||
1029
2.05k
                (((c >= '0' && c <= '9') || c == '_') &&
1030
587
                 inclass && !s->is_unicode)) {   /* Annex B.1.4 */
1031
587
                c &= 0x1f;
1032
587
                p++;
1033
1.86k
            } else if (s->is_unicode) {
1034
0
                goto invalid_escape;
1035
1.86k
            } else {
1036
                /* otherwise return '\' and 'c' */
1037
1.86k
                p--;
1038
1.86k
                c = '\\';
1039
1.86k
            }
1040
2.44k
            break;
1041
2.44k
        case '-':
1042
444
            if (!inclass && s->is_unicode)
1043
0
                goto invalid_escape;
1044
444
            break;
1045
444
        case '^':
1046
429
        case '$':
1047
2.10k
        case '\\':
1048
2.31k
        case '.':
1049
2.51k
        case '*':
1050
2.71k
        case '+':
1051
2.92k
        case '?':
1052
3.19k
        case '(':
1053
3.42k
        case ')':
1054
3.64k
        case '[':
1055
3.85k
        case ']':
1056
4.05k
        case '{':
1057
4.24k
        case '}':
1058
4.44k
        case '|':
1059
4.64k
        case '/':
1060
            /* always valid to escape these characters */
1061
4.64k
            break;
1062
0
#ifdef CONFIG_ALL_UNICODE
1063
195
        case 'p':
1064
405
        case 'P':
1065
405
            if (s->is_unicode && cr) {
1066
0
                if (parse_unicode_property(s, cr, &p, (c == 'P'), s->unicode_sets))
1067
0
                    return -1;
1068
0
                c = CLASS_RANGE_BASE;
1069
0
                break;
1070
0
            }
1071
405
            goto default_escape;
1072
405
#endif
1073
405
        case 'q':
1074
215
            if (s->unicode_sets && cr && inclass) {
1075
0
                if (parse_class_string_disjunction(s, cr, &p))
1076
0
                    return -1;
1077
0
                c = CLASS_RANGE_BASE;
1078
0
                break;
1079
0
            }
1080
215
            goto default_escape;
1081
10.7k
        default:
1082
11.4k
        default_escape:
1083
11.4k
            p--;
1084
11.4k
            ret = lre_parse_escape(&p, s->is_unicode * 2);
1085
11.4k
            if (ret >= 0) {
1086
6.15k
                c = ret;
1087
6.15k
            } else {
1088
5.25k
                if (s->is_unicode) {
1089
0
                invalid_escape:
1090
0
                    return re_parse_error(s, "invalid escape sequence in regular expression");
1091
5.25k
                } else {
1092
                    /* just ignore the '\' */
1093
5.25k
                    goto normal_char;
1094
5.25k
                }
1095
5.25k
            }
1096
6.15k
            break;
1097
22.8k
        }
1098
17.6k
        break;
1099
17.6k
    case '\0':
1100
5.47k
        if (p >= s->buf_end) {
1101
799
        unexpected_end:
1102
799
            return re_parse_error(s, "unexpected end");
1103
699
        }
1104
        /* fall thru */
1105
4.77k
        goto normal_char;
1106
1107
4.77k
    case '&':
1108
1.24k
    case '!':
1109
1.79k
    case '#':
1110
2.09k
    case '$':
1111
2.43k
    case '%':
1112
2.65k
    case '*':
1113
2.88k
    case '+':
1114
8.07k
    case ',':
1115
8.47k
    case '.':
1116
12.1k
    case ':':
1117
12.9k
    case ';':
1118
16.3k
    case '<':
1119
21.7k
    case '=':
1120
23.6k
    case '>':
1121
23.9k
    case '?':
1122
24.6k
    case '@':
1123
29.8k
    case '^':
1124
44.1k
    case '`':
1125
44.8k
    case '~':
1126
44.8k
        if (s->unicode_sets && p[1] == c) {
1127
            /* forbidden double characters */
1128
0
            return re_parse_error(s, "invalid class set operation in regular expression");
1129
0
        }
1130
44.8k
        goto normal_char;
1131
1132
44.8k
    case '(':
1133
579
    case ')':
1134
1.45k
    case '[':
1135
2.29k
    case ']':
1136
8.96k
    case '{':
1137
9.49k
    case '}':
1138
11.2k
    case '/':
1139
20.5k
    case '-':
1140
20.9k
    case '|':
1141
20.9k
        if (s->unicode_sets) {
1142
            /* invalid characters in unicode sets */
1143
0
            return re_parse_error(s, "invalid character in class in regular expression");
1144
0
        }
1145
20.9k
        goto normal_char;
1146
1147
340k
    default:
1148
416k
    normal_char:
1149
        /* normal char */
1150
416k
        if (c >= 128) {
1151
39.8k
            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1152
39.8k
            if ((unsigned)c > 0xffff && !s->is_unicode) {
1153
                /* XXX: should handle non BMP-1 code points */
1154
400
                return re_parse_error(s, "malformed unicode char");
1155
400
            }
1156
376k
        } else {
1157
376k
            p++;
1158
376k
        }
1159
415k
        break;
1160
434k
    }
1161
433k
    *pp = p;
1162
433k
    return c;
1163
434k
}
1164
1165
static int re_emit_range(REParseState *s, const CharRange *cr)
1166
5.03k
{
1167
5.03k
    int len, i;
1168
5.03k
    uint32_t high;
1169
1170
5.03k
    len = (unsigned)cr->len / 2;
1171
5.03k
    if (len >= 65535)
1172
0
        return re_parse_error(s, "too many ranges");
1173
5.03k
    if (len == 0) {
1174
1.12k
        re_emit_op_u32(s, REOP_char32, -1);
1175
3.90k
    } else {
1176
3.90k
        high = cr->points[cr->len - 1];
1177
3.90k
        if (high == UINT32_MAX)
1178
1.82k
            high = cr->points[cr->len - 2];
1179
3.90k
        if (high <= 0xffff) {
1180
            /* can use 16 bit ranges with the conversion that 0xffff =
1181
               infinity */
1182
3.47k
            re_emit_op_u16(s, s->ignore_case ? REOP_range_i : REOP_range, len);
1183
28.4k
            for(i = 0; i < cr->len; i += 2) {
1184
25.0k
                dbuf_put_u16(&s->byte_code, cr->points[i]);
1185
25.0k
                high = cr->points[i + 1] - 1;
1186
25.0k
                if (high == UINT32_MAX - 1)
1187
1.80k
                    high = 0xffff;
1188
25.0k
                dbuf_put_u16(&s->byte_code, high);
1189
25.0k
            }
1190
3.47k
        } else {
1191
435
            re_emit_op_u16(s, s->ignore_case ? REOP_range32_i : REOP_range32, len);
1192
4.51k
            for(i = 0; i < cr->len; i += 2) {
1193
4.08k
                dbuf_put_u32(&s->byte_code, cr->points[i]);
1194
4.08k
                dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
1195
4.08k
            }
1196
435
        }
1197
3.90k
    }
1198
5.03k
    return 0;
1199
5.03k
}
1200
1201
static int re_string_cmp_len(const void *a, const void *b, void *arg)
1202
0
{
1203
0
    REString *p1 = *(REString **)a;
1204
0
    REString *p2 = *(REString **)b;
1205
0
    return (p1->len < p2->len) - (p1->len > p2->len);
1206
0
}
1207
1208
static void re_emit_char(REParseState *s, int c)
1209
331k
{
1210
331k
    if (c <= 0xffff)
1211
331k
        re_emit_op_u16(s, s->ignore_case ? REOP_char_i : REOP_char, c);
1212
0
    else
1213
0
        re_emit_op_u32(s, s->ignore_case ? REOP_char32_i : REOP_char32, c);
1214
331k
}
1215
1216
static int re_emit_string_list(REParseState *s, const REStringList *sl)
1217
5.03k
{
1218
5.03k
    REString **tab, *p;
1219
5.03k
    int i, j, split_pos, last_match_pos, n;
1220
5.03k
    BOOL has_empty_string, is_last;
1221
    
1222
    //    re_string_list_dump("sl", sl);
1223
5.03k
    if (sl->n_strings == 0) {
1224
        /* simple case: only characters */
1225
5.03k
        if (re_emit_range(s, &sl->cr))
1226
0
            return -1;
1227
5.03k
    } else {
1228
        /* at least one string list is present : match the longest ones first */
1229
        /* XXX: add a new op_switch opcode to compile as a trie */
1230
0
        tab = lre_realloc(s->opaque, NULL, sizeof(tab[0]) * sl->n_strings);
1231
0
        if (!tab) {
1232
0
            re_parse_out_of_memory(s);
1233
0
            return -1;
1234
0
        }
1235
0
        has_empty_string = FALSE;
1236
0
        n = 0;
1237
0
        for(i = 0; i < sl->hash_size; i++) {
1238
0
            for(p = sl->hash_table[i]; p != NULL; p = p->next) {
1239
0
                if (p->len == 0) {
1240
0
                    has_empty_string = TRUE;
1241
0
                } else {
1242
0
                    tab[n++] = p;
1243
0
                }
1244
0
            }
1245
0
        }
1246
0
        assert(n <= sl->n_strings);
1247
        
1248
0
        rqsort(tab, n, sizeof(tab[0]), re_string_cmp_len, NULL);
1249
1250
0
        last_match_pos = -1;
1251
0
        for(i = 0; i < n; i++) {
1252
0
            p = tab[i];
1253
0
            is_last = !has_empty_string && sl->cr.len == 0 && i == (n - 1);
1254
0
            if (!is_last)
1255
0
                split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
1256
0
            else
1257
0
                split_pos = 0;
1258
0
            for(j = 0; j < p->len; j++) {
1259
0
                re_emit_char(s, p->buf[j]);
1260
0
            }
1261
0
            if (!is_last) {
1262
0
                last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos);
1263
0
                put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
1264
0
            }
1265
0
        }
1266
1267
0
        if (sl->cr.len != 0) {
1268
            /* char range */
1269
0
            is_last = !has_empty_string;
1270
0
            if (!is_last)
1271
0
                split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
1272
0
            else
1273
0
                split_pos = 0; /* not used */
1274
0
            if (re_emit_range(s, &sl->cr)) {
1275
0
                lre_realloc(s->opaque, tab, 0);
1276
0
                return -1;
1277
0
            }
1278
0
            if (!is_last)
1279
0
                put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
1280
0
        }
1281
1282
        /* patch the 'goto match' */
1283
0
        while (last_match_pos != -1) {
1284
0
            int next_pos = get_u32(s->byte_code.buf + last_match_pos);
1285
0
            put_u32(s->byte_code.buf + last_match_pos, s->byte_code.size - (last_match_pos + 4));
1286
0
            last_match_pos = next_pos;
1287
0
        }
1288
        
1289
0
        lre_realloc(s->opaque, tab, 0);
1290
0
    }
1291
5.03k
    return 0;
1292
5.03k
}
1293
1294
static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp);
1295
1296
static int re_parse_class_set_operand(REParseState *s, REStringList *cr, const uint8_t **pp)
1297
0
{
1298
0
    int c1;
1299
0
    const uint8_t *p = *pp;
1300
    
1301
0
    if (*p == '[') {
1302
0
        if (re_parse_nested_class(s, cr, pp))
1303
0
            return -1;
1304
0
    } else {
1305
0
        c1 = get_class_atom(s, cr, pp, TRUE);
1306
0
        if (c1 < 0)
1307
0
            return -1;
1308
0
        if (c1 < CLASS_RANGE_BASE) {
1309
            /* create a range with a single character */
1310
0
            re_string_list_init(s, cr);
1311
0
            if (s->ignore_case)
1312
0
                c1 = lre_canonicalize(c1, s->is_unicode);
1313
0
            if (cr_union_interval(&cr->cr, c1, c1)) {
1314
0
                re_string_list_free(cr);
1315
0
                return -1;
1316
0
            }
1317
0
        }
1318
0
    }
1319
0
    return 0;
1320
0
}
1321
1322
static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp)
1323
3.50k
{
1324
3.50k
    const uint8_t *p;
1325
3.50k
    uint32_t c1, c2;
1326
3.50k
    int ret;
1327
3.50k
    REStringList cr1_s, *cr1 = &cr1_s;
1328
3.50k
    BOOL invert, is_first;
1329
1330
3.50k
    if (lre_check_stack_overflow(s->opaque, 0))
1331
0
        return re_parse_error(s, "stack overflow");
1332
1333
3.50k
    re_string_list_init(s, cr);
1334
3.50k
    p = *pp;
1335
3.50k
    p++;    /* skip '[' */
1336
1337
3.50k
    invert = FALSE;
1338
3.50k
    if (*p == '^') {
1339
357
        p++;
1340
357
        invert = TRUE;
1341
357
    }
1342
    
1343
    /* handle unions */
1344
3.50k
    is_first = TRUE;
1345
102k
    for(;;) {
1346
102k
        if (*p == ']')
1347
2.70k
            break;
1348
99.3k
        if (*p == '[' && s->unicode_sets) {
1349
0
            if (re_parse_nested_class(s, cr1, &p))
1350
0
                goto fail;
1351
0
            goto class_union;
1352
99.3k
        } else {
1353
99.3k
            c1 = get_class_atom(s, cr1, &p, TRUE);
1354
99.3k
            if ((int)c1 < 0)
1355
765
                goto fail;
1356
98.6k
            if (*p == '-' && p[1] != ']') {
1357
6.22k
                const uint8_t *p0 = p + 1;
1358
6.22k
                if (p[1] == '-' && s->unicode_sets && is_first)
1359
0
                    goto class_atom; /* first character class followed by '--' */
1360
6.22k
                if (c1 >= CLASS_RANGE_BASE) {
1361
350
                    if (s->is_unicode) {
1362
0
                        re_string_list_free(cr1);
1363
0
                        goto invalid_class_range;
1364
0
                    }
1365
                    /* Annex B: match '-' character */
1366
350
                    goto class_atom;
1367
350
                }
1368
5.87k
                c2 = get_class_atom(s, cr1, &p0, TRUE);
1369
5.87k
                if ((int)c2 < 0)
1370
28
                    goto fail;
1371
5.85k
                if (c2 >= CLASS_RANGE_BASE) {
1372
265
                    re_string_list_free(cr1);
1373
265
                    if (s->is_unicode) {
1374
0
                        goto invalid_class_range;
1375
0
                    }
1376
                    /* Annex B: match '-' character */
1377
265
                    goto class_atom;
1378
265
                }
1379
5.58k
                p = p0;
1380
5.58k
                if (c2 < c1) {
1381
10
                invalid_class_range:
1382
10
                    re_parse_error(s, "invalid class range");
1383
10
                    goto fail;
1384
10
                }
1385
5.57k
                if (s->ignore_case) {
1386
5.30k
                    CharRange cr2_s, *cr2 = &cr2_s;
1387
5.30k
                    cr_init(cr2, s->opaque, lre_realloc);
1388
5.30k
                    if (cr_add_interval(cr2, c1, c2 + 1) ||
1389
5.30k
                        cr_regexp_canonicalize(cr2, s->is_unicode) ||
1390
5.30k
                        cr_op1(&cr->cr, cr2->points, cr2->len, CR_OP_UNION)) {
1391
0
                        cr_free(cr2);
1392
0
                        goto memory_error;
1393
0
                    }
1394
5.30k
                    cr_free(cr2);
1395
5.30k
                } else {
1396
274
                    if (cr_union_interval(&cr->cr, c1, c2))
1397
0
                        goto memory_error;
1398
274
                }
1399
5.57k
                is_first = FALSE; /* union operation */
1400
92.3k
            } else {
1401
93.0k
            class_atom:
1402
93.0k
                if (c1 >= CLASS_RANGE_BASE) {
1403
1.33k
                class_union:
1404
1.33k
                    ret = re_string_list_op(cr, cr1, CR_OP_UNION);
1405
1.33k
                    re_string_list_free(cr1);
1406
1.33k
                    if (ret)
1407
0
                        goto memory_error;
1408
91.6k
                } else {
1409
91.6k
                    if (s->ignore_case)
1410
81.9k
                        c1 = lre_canonicalize(c1, s->is_unicode);
1411
91.6k
                    if (cr_union_interval(&cr->cr, c1, c1))
1412
0
                        goto memory_error;
1413
91.6k
                }
1414
93.0k
            }
1415
98.6k
        }
1416
98.5k
        if (s->unicode_sets && is_first) {
1417
0
            if (*p == '&' && p[1] == '&' && p[2] != '&') {
1418
                /* handle '&&' */
1419
0
                for(;;) {
1420
0
                    if (*p == ']') {
1421
0
                        break;
1422
0
                    } else if (*p == '&' && p[1] == '&' && p[2] != '&') {
1423
0
                        p += 2;
1424
0
                    } else {
1425
0
                        goto invalid_operation;
1426
0
                    }
1427
0
                    if (re_parse_class_set_operand(s, cr1, &p))
1428
0
                        goto fail;
1429
0
                    ret = re_string_list_op(cr, cr1, CR_OP_INTER);
1430
0
                    re_string_list_free(cr1);
1431
0
                    if (ret)
1432
0
                        goto memory_error;
1433
0
                }
1434
0
            } else if (*p == '-' && p[1] == '-') {
1435
                /* handle '--' */
1436
0
                for(;;) {
1437
0
                    if (*p == ']') {
1438
0
                        break;
1439
0
                    } else if (*p == '-' && p[1] == '-') {
1440
0
                        p += 2;
1441
0
                    } else {
1442
0
                    invalid_operation:
1443
0
                        re_parse_error(s, "invalid operation in regular expression");
1444
0
                        goto fail;
1445
0
                    }
1446
0
                    if (re_parse_class_set_operand(s, cr1, &p))
1447
0
                        goto fail;
1448
0
                    ret = re_string_list_op(cr, cr1, CR_OP_SUB);
1449
0
                    re_string_list_free(cr1);
1450
0
                    if (ret)
1451
0
                        goto memory_error;
1452
0
                }
1453
0
            }
1454
0
        }
1455
98.5k
        is_first = FALSE;
1456
98.5k
    }
1457
1458
2.70k
    p++;    /* skip ']' */
1459
2.70k
    *pp = p;
1460
2.70k
    if (invert) {
1461
        /* XXX: add may_contain_string syntax check to be fully
1462
           compliant. The test here accepts more input than the
1463
           spec. */
1464
347
        if (cr->n_strings != 0) {
1465
0
            re_parse_error(s, "negated character class with strings in regular expression debugger eval code");
1466
0
            goto fail;
1467
0
        }
1468
347
        if (cr_invert(&cr->cr))
1469
0
            goto memory_error;
1470
347
    }
1471
2.70k
    return 0;
1472
0
 memory_error:
1473
0
    re_parse_out_of_memory(s);
1474
803
 fail:
1475
803
    re_string_list_free(cr);
1476
803
    return -1;
1477
0
}
1478
1479
static int re_parse_char_class(REParseState *s, const uint8_t **pp)
1480
3.50k
{
1481
3.50k
    REStringList cr_s, *cr = &cr_s;
1482
1483
3.50k
    if (re_parse_nested_class(s, cr, pp))
1484
803
        return -1;
1485
2.70k
    if (re_emit_string_list(s, cr))
1486
0
        goto fail;
1487
2.70k
    re_string_list_free(cr);
1488
2.70k
    return 0;
1489
0
 fail:
1490
0
    re_string_list_free(cr);
1491
0
    return -1;
1492
2.70k
}
1493
1494
/* Return:
1495
   - true if the opcodes may not advance the char pointer
1496
   - false if the opcodes always advance the char pointer
1497
*/
1498
static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
1499
12.6k
{
1500
12.6k
    int pos, opcode, len;
1501
12.6k
    uint32_t val;
1502
12.6k
    BOOL ret;
1503
1504
12.6k
    ret = TRUE;
1505
12.6k
    pos = 0;
1506
88.5k
    while (pos < bc_buf_len) {
1507
84.6k
        opcode = bc_buf[pos];
1508
84.6k
        len = reopcode_info[opcode].size;
1509
84.6k
        switch(opcode) {
1510
637
        case REOP_range:
1511
1.08k
        case REOP_range_i:
1512
1.08k
            val = get_u16(bc_buf + pos + 1);
1513
1.08k
            len += val * 4;
1514
1.08k
            goto simple_char;
1515
397
        case REOP_range32:
1516
655
        case REOP_range32_i:
1517
655
            val = get_u16(bc_buf + pos + 1);
1518
655
            len += val * 8;
1519
655
            goto simple_char;
1520
5.46k
        case REOP_char:
1521
6.81k
        case REOP_char_i:
1522
7.25k
        case REOP_char32:
1523
7.25k
        case REOP_char32_i:
1524
7.95k
        case REOP_dot:
1525
8.19k
        case REOP_any:
1526
9.93k
        simple_char:
1527
9.93k
            ret = FALSE;
1528
9.93k
            break;
1529
620
        case REOP_line_start:
1530
969
        case REOP_line_start_m:
1531
1.42k
        case REOP_line_end:
1532
1.82k
        case REOP_line_end_m:
1533
7.39k
        case REOP_push_i32:
1534
7.39k
        case REOP_push_char_pos:
1535
7.39k
        case REOP_drop:
1536
8.04k
        case REOP_word_boundary:
1537
8.47k
        case REOP_word_boundary_i:
1538
8.71k
        case REOP_not_word_boundary:
1539
9.10k
        case REOP_not_word_boundary_i:
1540
12.4k
        case REOP_prev:
1541
            /* no effect */
1542
12.4k
            break;
1543
45.1k
        case REOP_save_start:
1544
48.6k
        case REOP_save_end:
1545
52.2k
        case REOP_save_reset:
1546
52.5k
        case REOP_back_reference:
1547
52.7k
        case REOP_back_reference_i:
1548
52.9k
        case REOP_backward_back_reference:
1549
53.3k
        case REOP_backward_back_reference_i:
1550
53.3k
            break;
1551
8.78k
        default:
1552
            /* safe behavior: we cannot predict the outcome */
1553
8.78k
            return TRUE;
1554
84.6k
        }
1555
75.8k
        pos += len;
1556
75.8k
    }
1557
3.91k
    return ret;
1558
12.6k
}
1559
1560
/* return -1 if a simple quantifier cannot be used. Otherwise return
1561
   the number of characters in the atom. */
1562
static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
1563
13.9k
{
1564
13.9k
    int pos, opcode, len, count;
1565
13.9k
    uint32_t val;
1566
1567
13.9k
    count = 0;
1568
13.9k
    pos = 0;
1569
90.8k
    while (pos < bc_buf_len) {
1570
87.2k
        opcode = bc_buf[pos];
1571
87.2k
        len = reopcode_info[opcode].size;
1572
87.2k
        switch(opcode) {
1573
458
        case REOP_range:
1574
899
        case REOP_range_i:
1575
899
            val = get_u16(bc_buf + pos + 1);
1576
899
            len += val * 4;
1577
899
            goto simple_char;
1578
265
        case REOP_range32:
1579
477
        case REOP_range32_i:
1580
477
            val = get_u16(bc_buf + pos + 1);
1581
477
            len += val * 8;
1582
477
            goto simple_char;
1583
69.5k
        case REOP_char:
1584
72.2k
        case REOP_char_i:
1585
72.5k
        case REOP_char32:
1586
72.5k
        case REOP_char32_i:
1587
73.3k
        case REOP_dot:
1588
73.5k
        case REOP_any:
1589
74.9k
        simple_char:
1590
74.9k
            count++;
1591
74.9k
            break;
1592
357
        case REOP_line_start:
1593
579
        case REOP_line_start_m:
1594
798
        case REOP_line_end:
1595
1.01k
        case REOP_line_end_m:
1596
1.21k
        case REOP_word_boundary:
1597
1.42k
        case REOP_word_boundary_i:
1598
1.62k
        case REOP_not_word_boundary:
1599
1.83k
        case REOP_not_word_boundary_i:
1600
1.83k
            break;
1601
10.4k
        default:
1602
10.4k
            return -1;
1603
87.2k
        }
1604
76.8k
        pos += len;
1605
76.8k
    }
1606
3.52k
    return count;
1607
13.9k
}
1608
1609
/* '*pp' is the first char after '<' */
1610
static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
1611
21.0k
{
1612
21.0k
    const uint8_t *p, *p1;
1613
21.0k
    uint32_t c, d;
1614
21.0k
    char *q;
1615
1616
21.0k
    p = *pp;
1617
21.0k
    q = buf;
1618
66.5k
    for(;;) {
1619
66.5k
        c = *p;
1620
66.5k
        if (c == '\\') {
1621
12.3k
            p++;
1622
12.3k
            if (*p != 'u')
1623
486
                return -1;
1624
11.9k
            c = lre_parse_escape(&p, 2); // accept surrogate pairs
1625
54.1k
        } else if (c == '>') {
1626
8.32k
            break;
1627
45.8k
        } else if (c >= 128) {
1628
3.87k
            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1629
3.87k
            if (is_hi_surrogate(c)) {
1630
426
                d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
1631
426
                if (is_lo_surrogate(d)) {
1632
201
                    c = from_surrogate(c, d);
1633
201
                    p = p1;
1634
201
                }
1635
426
            }
1636
41.9k
        } else {
1637
41.9k
            p++;
1638
41.9k
        }
1639
57.7k
        if (c > 0x10FFFF)
1640
5.38k
            return -1;
1641
52.3k
        if (q == buf) {
1642
15.3k
            if (!lre_js_is_ident_first(c))
1643
5.55k
                return -1;
1644
36.9k
        } else {
1645
36.9k
            if (!lre_js_is_ident_next(c))
1646
1.11k
                return -1;
1647
36.9k
        }
1648
45.6k
        if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
1649
195
            return -1;
1650
45.4k
        if (c < 128) {
1651
40.7k
            *q++ = c;
1652
40.7k
        } else {
1653
4.74k
            q += unicode_to_utf8((uint8_t*)q, c);
1654
4.74k
        }
1655
45.4k
    }
1656
8.32k
    if (q == buf)
1657
213
        return -1;
1658
8.11k
    *q = '\0';
1659
8.11k
    p++;
1660
8.11k
    *pp = p;
1661
8.11k
    return 0;
1662
8.32k
}
1663
1664
/* if capture_name = NULL: return the number of captures + 1.
1665
   Otherwise, return the capture index corresponding to capture_name
1666
   or -1 if none */
1667
static int re_parse_captures(REParseState *s, int *phas_named_captures,
1668
                             const char *capture_name)
1669
5.21k
{
1670
5.21k
    const uint8_t *p;
1671
5.21k
    int capture_index;
1672
5.21k
    char name[TMP_BUF_SIZE];
1673
1674
5.21k
    capture_index = 1;
1675
5.21k
    *phas_named_captures = 0;
1676
1.12M
    for (p = s->buf_start; p < s->buf_end; p++) {
1677
1.12M
        switch (*p) {
1678
162k
        case '(':
1679
162k
            if (p[1] == '?') {
1680
95.3k
                if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
1681
15.7k
                    *phas_named_captures = 1;
1682
                    /* potential named capture */
1683
15.7k
                    if (capture_name) {
1684
14.9k
                        p += 3;
1685
14.9k
                        if (re_parse_group_name(name, sizeof(name), &p) == 0) {
1686
3.41k
                            if (!strcmp(name, capture_name))
1687
2.75k
                                return capture_index;
1688
3.41k
                        }
1689
14.9k
                    }
1690
13.0k
                    capture_index++;
1691
13.0k
                    if (capture_index >= CAPTURE_COUNT_MAX)
1692
4
                        goto done;
1693
13.0k
                }
1694
95.3k
            } else {
1695
66.9k
                capture_index++;
1696
66.9k
                if (capture_index >= CAPTURE_COUNT_MAX)
1697
208
                    goto done;
1698
66.9k
            }
1699
159k
            break;
1700
159k
        case '\\':
1701
146k
            p++;
1702
146k
            break;
1703
1.17k
        case '[':
1704
6.81k
            for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
1705
5.63k
                if (*p == '\\')
1706
3.28k
                    p++;
1707
5.63k
            }
1708
1.17k
            break;
1709
1.12M
        }
1710
1.12M
    }
1711
2.46k
 done:
1712
2.46k
    if (capture_name)
1713
1.32k
        return -1;
1714
1.14k
    else
1715
1.14k
        return capture_index;
1716
2.46k
}
1717
1718
static int re_count_captures(REParseState *s)
1719
4.75k
{
1720
4.75k
    if (s->total_capture_count < 0) {
1721
1.14k
        s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1722
1.14k
                                                   NULL);
1723
1.14k
    }
1724
4.75k
    return s->total_capture_count;
1725
4.75k
}
1726
1727
static BOOL re_has_named_captures(REParseState *s)
1728
2.71k
{
1729
2.71k
    if (s->has_named_captures < 0)
1730
771
        re_count_captures(s);
1731
2.71k
    return s->has_named_captures;
1732
2.71k
}
1733
1734
static int find_group_name(REParseState *s, const char *name)
1735
4.69k
{
1736
4.69k
    const char *p, *buf_end;
1737
4.69k
    size_t len, name_len;
1738
4.69k
    int capture_index;
1739
1740
4.69k
    p = (char *)s->group_names.buf;
1741
4.69k
    if (!p) return -1;
1742
985
    buf_end = (char *)s->group_names.buf + s->group_names.size;
1743
985
    name_len = strlen(name);
1744
985
    capture_index = 1;
1745
9.99k
    while (p < buf_end) {
1746
9.21k
        len = strlen(p);
1747
9.21k
        if (len == name_len && memcmp(name, p, name_len) == 0)
1748
205
            return capture_index;
1749
9.00k
        p += len + 1;
1750
9.00k
        capture_index++;
1751
9.00k
    }
1752
780
    return -1;
1753
985
}
1754
1755
static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1756
1757
static int re_parse_modifiers(REParseState *s, const uint8_t **pp)
1758
3.51k
{
1759
3.51k
    const uint8_t *p = *pp;
1760
3.51k
    int mask = 0;
1761
3.51k
    int val;
1762
1763
6.30k
    for(;;) {
1764
6.30k
        if (*p == 'i') {
1765
1.35k
            val = LRE_FLAG_IGNORECASE;
1766
4.94k
        } else if (*p == 'm') {
1767
566
            val = LRE_FLAG_MULTILINE;
1768
4.38k
        } else if (*p == 's') {
1769
880
            val = LRE_FLAG_DOTALL;
1770
3.50k
        } else {
1771
3.50k
            break;
1772
3.50k
        }
1773
2.80k
        if (mask & val)
1774
10
            return re_parse_error(s, "duplicate modifier: '%c'", *p);
1775
2.79k
        mask |= val;
1776
2.79k
        p++;
1777
2.79k
    }
1778
3.50k
    *pp = p;
1779
3.50k
    return mask;
1780
3.51k
}
1781
1782
static BOOL update_modifier(BOOL val, int add_mask, int remove_mask,
1783
                            int mask)
1784
7.59k
{
1785
7.59k
    if (add_mask & mask)
1786
1.85k
        val = TRUE;
1787
7.59k
    if (remove_mask & mask)
1788
856
        val = FALSE;
1789
7.59k
    return val;
1790
7.59k
}
1791
1792
static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1793
612k
{
1794
612k
    const uint8_t *p;
1795
612k
    int c, last_atom_start, quant_min, quant_max, last_capture_count;
1796
612k
    BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1797
612k
    REStringList cr_s, *cr = &cr_s;
1798
1799
612k
    last_atom_start = -1;
1800
612k
    last_capture_count = 0;
1801
612k
    p = s->buf_ptr;
1802
612k
    c = *p;
1803
612k
    switch(c) {
1804
723
    case '^':
1805
723
        p++;
1806
723
        re_emit_op(s, s->multi_line ? REOP_line_start_m : REOP_line_start);
1807
723
        break;
1808
1.41k
    case '$':
1809
1.41k
        p++;
1810
1.41k
        re_emit_op(s, s->multi_line ? REOP_line_end_m : REOP_line_end);
1811
1.41k
        break;
1812
2.78k
    case '.':
1813
2.78k
        p++;
1814
2.78k
        last_atom_start = s->byte_code.size;
1815
2.78k
        last_capture_count = s->capture_count;
1816
2.78k
        if (is_backward_dir)
1817
506
            re_emit_op(s, REOP_prev);
1818
2.78k
        re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1819
2.78k
        if (is_backward_dir)
1820
506
            re_emit_op(s, REOP_prev);
1821
2.78k
        break;
1822
6.64k
    case '{':
1823
6.64k
        if (s->is_unicode) {
1824
0
            return re_parse_error(s, "syntax error");
1825
6.64k
        } else if (!is_digit(p[1])) {
1826
            /* Annex B: we accept '{' not followed by digits as a
1827
               normal atom */
1828
4.11k
            goto parse_class_atom;
1829
4.11k
        } else {
1830
2.52k
            const uint8_t *p1 = p + 1;
1831
            /* Annex B: error if it is like a repetition count */
1832
2.52k
            parse_digits(&p1, TRUE);
1833
2.52k
            if (*p1 == ',') {
1834
1.52k
                p1++;
1835
1.52k
                if (is_digit(*p1)) {
1836
745
                    parse_digits(&p1, TRUE);
1837
745
                }
1838
1.52k
            }
1839
2.52k
            if (*p1 != '}') {
1840
2.52k
                goto parse_class_atom;
1841
2.52k
            }
1842
2.52k
        }
1843
        /* fall thru */
1844
9
    case '*':
1845
20
    case '+':
1846
26
    case '?':
1847
26
        return re_parse_error(s, "nothing to repeat");
1848
263k
    case '(':
1849
263k
        if (p[1] == '?') {
1850
223k
            if (p[2] == ':') {
1851
212k
                p += 3;
1852
212k
                last_atom_start = s->byte_code.size;
1853
212k
                last_capture_count = s->capture_count;
1854
212k
                s->buf_ptr = p;
1855
212k
                if (re_parse_disjunction(s, is_backward_dir))
1856
211k
                    return -1;
1857
1.02k
                p = s->buf_ptr;
1858
1.02k
                if (re_parse_expect(s, &p, ')'))
1859
26
                    return -1;
1860
10.7k
            } else if (p[2] == 'i' || p[2] == 'm' || p[2] == 's' || p[2] == '-') {
1861
2.61k
                BOOL saved_ignore_case, saved_multi_line, saved_dotall;
1862
2.61k
                int add_mask, remove_mask;
1863
2.61k
                p += 2;
1864
2.61k
                remove_mask = 0;
1865
2.61k
                add_mask = re_parse_modifiers(s, &p);
1866
2.61k
                if (add_mask < 0)
1867
6
                    return -1;
1868
2.60k
                if (*p == '-') {
1869
900
                    p++;
1870
900
                    remove_mask = re_parse_modifiers(s, &p);
1871
900
                    if (remove_mask < 0)
1872
4
                        return -1;
1873
900
                }
1874
2.60k
                if ((add_mask == 0 && remove_mask == 0) ||
1875
2.58k
                    (add_mask & remove_mask) != 0) {
1876
15
                    return re_parse_error(s, "invalid modifiers");
1877
15
                }
1878
2.58k
                if (re_parse_expect(s, &p, ':'))
1879
52
                    return -1;
1880
2.53k
                saved_ignore_case = s->ignore_case;
1881
2.53k
                saved_multi_line = s->multi_line;
1882
2.53k
                saved_dotall = s->dotall;
1883
2.53k
                s->ignore_case = update_modifier(s->ignore_case, add_mask, remove_mask, LRE_FLAG_IGNORECASE);
1884
2.53k
                s->multi_line = update_modifier(s->multi_line, add_mask, remove_mask, LRE_FLAG_MULTILINE);
1885
2.53k
                s->dotall = update_modifier(s->dotall, add_mask, remove_mask, LRE_FLAG_DOTALL);
1886
1887
2.53k
                last_atom_start = s->byte_code.size;
1888
2.53k
                last_capture_count = s->capture_count;
1889
2.53k
                s->buf_ptr = p;
1890
2.53k
                if (re_parse_disjunction(s, is_backward_dir))
1891
1.77k
                    return -1;
1892
761
                p = s->buf_ptr;
1893
761
                if (re_parse_expect(s, &p, ')'))
1894
182
                    return -1;
1895
579
                s->ignore_case = saved_ignore_case;
1896
579
                s->multi_line = saved_multi_line;
1897
579
                s->dotall = saved_dotall;
1898
8.11k
            } else if ((p[2] == '=' || p[2] == '!')) {
1899
5.62k
                is_neg = (p[2] == '!');
1900
5.62k
                is_backward_lookahead = FALSE;
1901
5.62k
                p += 3;
1902
5.62k
                goto lookahead;
1903
5.62k
            } else if (p[2] == '<' &&
1904
2.39k
                       (p[3] == '=' || p[3] == '!')) {
1905
1.37k
                int pos;
1906
1.37k
                is_neg = (p[3] == '!');
1907
1.37k
                is_backward_lookahead = TRUE;
1908
1.37k
                p += 4;
1909
                /* lookahead */
1910
7.00k
            lookahead:
1911
                /* Annex B allows lookahead to be used as an atom for
1912
                   the quantifiers */
1913
7.00k
                if (!s->is_unicode && !is_backward_lookahead)  {
1914
5.62k
                    last_atom_start = s->byte_code.size;
1915
5.62k
                    last_capture_count = s->capture_count;
1916
5.62k
                }
1917
7.00k
                pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
1918
7.00k
                s->buf_ptr = p;
1919
7.00k
                if (re_parse_disjunction(s, is_backward_lookahead))
1920
6.09k
                    return -1;
1921
910
                p = s->buf_ptr;
1922
910
                if (re_parse_expect(s, &p, ')'))
1923
140
                    return -1;
1924
770
                re_emit_op(s, REOP_match);
1925
                /* jump after the 'match' after the lookahead is successful */
1926
770
                if (dbuf_error(&s->byte_code))
1927
4
                    return -1;
1928
766
                put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
1929
1.11k
            } else if (p[2] == '<') {
1930
1.02k
                p += 3;
1931
1.02k
                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1932
1.02k
                                        &p)) {
1933
598
                    return re_parse_error(s, "invalid group name");
1934
598
                }
1935
425
                if (find_group_name(s, s->u.tmp_buf) > 0) {
1936
5
                    return re_parse_error(s, "duplicate group name");
1937
5
                }
1938
                /* group name with a trailing zero */
1939
420
                dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1940
420
                         strlen(s->u.tmp_buf) + 1);
1941
420
                s->has_named_captures = 1;
1942
420
                goto parse_capture;
1943
425
            } else {
1944
87
                return re_parse_error(s, "invalid group");
1945
87
            }
1946
223k
        } else {
1947
40.0k
            int capture_index;
1948
40.0k
            p++;
1949
            /* capture without group name */
1950
40.0k
            dbuf_putc(&s->group_names, 0);
1951
40.4k
        parse_capture:
1952
40.4k
            if (s->capture_count >= CAPTURE_COUNT_MAX)
1953
18
                return re_parse_error(s, "too many captures");
1954
40.4k
            last_atom_start = s->byte_code.size;
1955
40.4k
            last_capture_count = s->capture_count;
1956
40.4k
            capture_index = s->capture_count++;
1957
40.4k
            re_emit_op_u8(s, REOP_save_start + is_backward_dir,
1958
40.4k
                          capture_index);
1959
1960
40.4k
            s->buf_ptr = p;
1961
40.4k
            if (re_parse_disjunction(s, is_backward_dir))
1962
27.0k
                return -1;
1963
13.3k
            p = s->buf_ptr;
1964
1965
13.3k
            re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
1966
13.3k
                          capture_index);
1967
1968
13.3k
            if (re_parse_expect(s, &p, ')'))
1969
307
                return -1;
1970
13.3k
        }
1971
15.3k
        break;
1972
25.8k
    case '\\':
1973
25.8k
        switch(p[1]) {
1974
597
        case 'b':
1975
1.06k
        case 'B':
1976
1.06k
            if (p[1] != 'b') {
1977
471
                re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary);
1978
597
            } else {
1979
597
                re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary);
1980
597
            }
1981
1.06k
            p += 2;
1982
1.06k
            break;
1983
5.66k
        case 'k':
1984
5.66k
            {
1985
5.66k
                const uint8_t *p1;
1986
5.66k
                int dummy_res;
1987
1988
5.66k
                p1 = p;
1989
5.66k
                if (p1[2] != '<') {
1990
                    /* annex B: we tolerate invalid group names in non
1991
                       unicode mode if there is no named capture
1992
                       definition */
1993
601
                    if (s->is_unicode || re_has_named_captures(s))
1994
27
                        return re_parse_error(s, "expecting group name");
1995
574
                    else
1996
574
                        goto parse_class_atom;
1997
601
                }
1998
5.06k
                p1 += 3;
1999
5.06k
                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
2000
5.06k
                                        &p1)) {
2001
793
                    if (s->is_unicode || re_has_named_captures(s))
2002
6
                        return re_parse_error(s, "invalid group name");
2003
787
                    else
2004
787
                        goto parse_class_atom;
2005
793
                }
2006
4.27k
                c = find_group_name(s, s->u.tmp_buf);
2007
4.27k
                if (c < 0) {
2008
                    /* no capture name parsed before, try to look
2009
                       after (inefficient, but hopefully not common */
2010
4.07k
                    c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
2011
4.07k
                    if (c < 0) {
2012
1.32k
                        if (s->is_unicode || re_has_named_captures(s))
2013
262
                            return re_parse_error(s, "group name not defined");
2014
1.05k
                        else
2015
1.05k
                            goto parse_class_atom;
2016
1.32k
                    }
2017
4.07k
                }
2018
2.95k
                p = p1;
2019
2.95k
            }
2020
0
            goto emit_back_reference;
2021
1.47k
        case '0':
2022
1.47k
            p += 2;
2023
1.47k
            c = 0;
2024
1.47k
            if (s->is_unicode) {
2025
0
                if (is_digit(*p)) {
2026
0
                    return re_parse_error(s, "invalid decimal escape in regular expression");
2027
0
                }
2028
1.47k
            } else {
2029
                /* Annex B.1.4: accept legacy octal */
2030
1.47k
                if (*p >= '0' && *p <= '7') {
2031
935
                    c = *p++ - '0';
2032
935
                    if (*p >= '0' && *p <= '7') {
2033
392
                        c = (c << 3) + *p++ - '0';
2034
392
                    }
2035
935
                }
2036
1.47k
            }
2037
1.47k
            goto normal_char;
2038
2.96k
        case '1': case '2': case '3': case '4':
2039
4.65k
        case '5': case '6': case '7': case '8':
2040
4.86k
        case '9':
2041
4.86k
            {
2042
4.86k
                const uint8_t *q = ++p;
2043
2044
4.86k
                c = parse_digits(&p, FALSE);
2045
4.86k
                if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
2046
3.50k
                    if (!s->is_unicode) {
2047
                        /* Annex B.1.4: accept legacy octal */
2048
3.50k
                        p = q;
2049
3.50k
                        if (*p <= '7') {
2050
2.95k
                            c = 0;
2051
2.95k
                            if (*p <= '3')
2052
1.21k
                                c = *p++ - '0';
2053
2.95k
                            if (*p >= '0' && *p <= '7') {
2054
1.84k
                                c = (c << 3) + *p++ - '0';
2055
1.84k
                                if (*p >= '0' && *p <= '7') {
2056
530
                                    c = (c << 3) + *p++ - '0';
2057
530
                                }
2058
1.84k
                            }
2059
2.95k
                        } else {
2060
547
                            c = *p++;
2061
547
                        }
2062
3.50k
                        goto normal_char;
2063
3.50k
                    }
2064
0
                    return re_parse_error(s, "back reference out of range in regular expression");
2065
3.50k
                }
2066
4.31k
            emit_back_reference:
2067
4.31k
                last_atom_start = s->byte_code.size;
2068
4.31k
                last_capture_count = s->capture_count;
2069
                
2070
4.31k
                re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, c);
2071
4.31k
            }
2072
0
            break;
2073
12.7k
        default:
2074
12.7k
            goto parse_class_atom;
2075
25.8k
        }
2076
5.38k
        break;
2077
5.38k
    case '[':
2078
3.50k
        last_atom_start = s->byte_code.size;
2079
3.50k
        last_capture_count = s->capture_count;
2080
3.50k
        if (is_backward_dir)
2081
260
            re_emit_op(s, REOP_prev);
2082
3.50k
        if (re_parse_char_class(s, &p))
2083
803
            return -1;
2084
2.70k
        if (is_backward_dir)
2085
253
            re_emit_op(s, REOP_prev);
2086
2.70k
        break;
2087
849
    case ']':
2088
1.37k
    case '}':
2089
1.37k
        if (s->is_unicode)
2090
0
            return re_parse_error(s, "syntax error");
2091
1.37k
        goto parse_class_atom;
2092
306k
    default:
2093
329k
    parse_class_atom:
2094
329k
        c = get_class_atom(s, cr, &p, FALSE);
2095
329k
        if ((int)c < 0)
2096
406
            return -1;
2097
333k
    normal_char:
2098
333k
        last_atom_start = s->byte_code.size;
2099
333k
        last_capture_count = s->capture_count;
2100
333k
        if (is_backward_dir)
2101
3.95k
            re_emit_op(s, REOP_prev);
2102
333k
        if (c >= CLASS_RANGE_BASE) {
2103
2.33k
            int ret;
2104
2.33k
            ret = re_emit_string_list(s, cr);
2105
2.33k
            re_string_list_free(cr);
2106
2.33k
            if (ret)
2107
0
                return -1;
2108
331k
        } else {
2109
331k
            if (s->ignore_case)
2110
11.7k
                c = lre_canonicalize(c, s->is_unicode);
2111
331k
            re_emit_char(s, c);
2112
331k
        }
2113
333k
        if (is_backward_dir)
2114
3.95k
            re_emit_op(s, REOP_prev);
2115
333k
        break;
2116
612k
    }
2117
2118
    /* quantifier */
2119
362k
    if (last_atom_start >= 0) {
2120
358k
        c = *p;
2121
358k
        switch(c) {
2122
2.53k
        case '*':
2123
2.53k
            p++;
2124
2.53k
            quant_min = 0;
2125
2.53k
            quant_max = INT32_MAX;
2126
2.53k
            goto quantifier;
2127
6.52k
        case '+':
2128
6.52k
            p++;
2129
6.52k
            quant_min = 1;
2130
6.52k
            quant_max = INT32_MAX;
2131
6.52k
            goto quantifier;
2132
4.01k
        case '?':
2133
4.01k
            p++;
2134
4.01k
            quant_min = 0;
2135
4.01k
            quant_max = 1;
2136
4.01k
            goto quantifier;
2137
9.02k
        case '{':
2138
9.02k
            {
2139
9.02k
                const uint8_t *p1 = p;
2140
                /* As an extension (see ES6 annex B), we accept '{' not
2141
                   followed by digits as a normal atom */
2142
9.02k
                if (!is_digit(p[1])) {
2143
4.01k
                    if (s->is_unicode)
2144
0
                        goto invalid_quant_count;
2145
4.01k
                    break;
2146
4.01k
                }
2147
5.00k
                p++;
2148
5.00k
                quant_min = parse_digits(&p, TRUE);
2149
5.00k
                quant_max = quant_min;
2150
5.00k
                if (*p == ',') {
2151
3.10k
                    p++;
2152
3.10k
                    if (is_digit(*p)) {
2153
2.13k
                        quant_max = parse_digits(&p, TRUE);
2154
2.13k
                        if (quant_max < quant_min) {
2155
2
                        invalid_quant_count:
2156
2
                            return re_parse_error(s, "invalid repetition count");
2157
2
                        }
2158
2.13k
                    } else {
2159
967
                        quant_max = INT32_MAX; /* infinity */
2160
967
                    }
2161
3.10k
                }
2162
5.00k
                if (*p != '}' && !s->is_unicode) {
2163
                    /* Annex B: normal atom if invalid '{' syntax */
2164
2.32k
                    p = p1;
2165
2.32k
                    break;
2166
2.32k
                }
2167
2.67k
                if (re_parse_expect(s, &p, '}'))
2168
0
                    return -1;
2169
2.67k
            }
2170
15.7k
        quantifier:
2171
15.7k
            greedy = TRUE;
2172
15.7k
            if (*p == '?') {
2173
1.52k
                p++;
2174
1.52k
                greedy = FALSE;
2175
1.52k
            }
2176
15.7k
            if (last_atom_start < 0) {
2177
0
                return re_parse_error(s, "nothing to repeat");
2178
0
            }
2179
15.7k
            if (greedy) {
2180
14.2k
                int len, pos;
2181
2182
14.2k
                if (quant_max > 0) {
2183
                    /* specific optimization for simple quantifiers */
2184
14.0k
                    if (dbuf_error(&s->byte_code))
2185
10
                        goto out_of_memory;
2186
13.9k
                    len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
2187
13.9k
                                                 s->byte_code.size - last_atom_start);
2188
13.9k
                    if (len > 0) {
2189
3.03k
                        re_emit_op(s, REOP_match);
2190
2191
3.03k
                        if (dbuf_insert(&s->byte_code, last_atom_start, 17))
2192
1
                            goto out_of_memory;
2193
3.03k
                        pos = last_atom_start;
2194
3.03k
                        s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
2195
3.03k
                        put_u32(&s->byte_code.buf[pos],
2196
3.03k
                                s->byte_code.size - last_atom_start - 17);
2197
3.03k
                        pos += 4;
2198
3.03k
                        put_u32(&s->byte_code.buf[pos], quant_min);
2199
3.03k
                        pos += 4;
2200
3.03k
                        put_u32(&s->byte_code.buf[pos], quant_max);
2201
3.03k
                        pos += 4;
2202
3.03k
                        put_u32(&s->byte_code.buf[pos], len);
2203
3.03k
                        pos += 4;
2204
3.03k
                        goto done;
2205
3.03k
                    }
2206
13.9k
                }
2207
2208
11.1k
                if (dbuf_error(&s->byte_code))
2209
1
                    goto out_of_memory;
2210
11.1k
            }
2211
            /* the spec tells that if there is no advance when
2212
               running the atom after the first quant_min times,
2213
               then there is no match. We remove this test when we
2214
               are sure the atom always advances the position. */
2215
12.6k
            add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
2216
12.6k
                                                           s->byte_code.size - last_atom_start);
2217
2218
12.6k
            {
2219
12.6k
                int len, pos;
2220
12.6k
                len = s->byte_code.size - last_atom_start;
2221
12.6k
                if (quant_min == 0) {
2222
                    /* need to reset the capture in case the atom is
2223
                       not executed */
2224
5.43k
                    if (last_capture_count != s->capture_count) {
2225
3.72k
                        if (dbuf_insert(&s->byte_code, last_atom_start, 3))
2226
0
                            goto out_of_memory;
2227
3.72k
                        s->byte_code.buf[last_atom_start++] = REOP_save_reset;
2228
3.72k
                        s->byte_code.buf[last_atom_start++] = last_capture_count;
2229
3.72k
                        s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
2230
3.72k
                    }
2231
5.43k
                    if (quant_max == 0) {
2232
215
                        s->byte_code.size = last_atom_start;
2233
5.21k
                    } else if (quant_max == 1 || quant_max == INT32_MAX) {
2234
4.44k
                        BOOL has_goto = (quant_max == INT32_MAX);
2235
4.44k
                        if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
2236
1
                            goto out_of_memory;
2237
4.43k
                        s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
2238
4.43k
                            greedy;
2239
4.43k
                        put_u32(s->byte_code.buf + last_atom_start + 1,
2240
4.43k
                                len + 5 * has_goto + add_zero_advance_check * 2);
2241
4.43k
                        if (add_zero_advance_check) {
2242
3.74k
                            s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
2243
3.74k
                            re_emit_op(s, REOP_check_advance);
2244
3.74k
                        }
2245
4.43k
                        if (has_goto)
2246
1.80k
                            re_emit_goto(s, REOP_goto, last_atom_start);
2247
4.43k
                    } else {
2248
778
                        if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
2249
1
                            goto out_of_memory;
2250
777
                        pos = last_atom_start;
2251
777
                        s->byte_code.buf[pos++] = REOP_push_i32;
2252
777
                        put_u32(s->byte_code.buf + pos, quant_max);
2253
777
                        pos += 4;
2254
777
                        s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
2255
777
                        put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
2256
777
                        pos += 4;
2257
777
                        if (add_zero_advance_check) {
2258
527
                            s->byte_code.buf[pos++] = REOP_push_char_pos;
2259
527
                            re_emit_op(s, REOP_check_advance);
2260
527
                        }
2261
777
                        re_emit_goto(s, REOP_loop, last_atom_start + 5);
2262
777
                        re_emit_op(s, REOP_drop);
2263
777
                    }
2264
7.26k
                } else if (quant_min == 1 && quant_max == INT32_MAX &&
2265
5.67k
                           !add_zero_advance_check) {
2266
518
                    re_emit_goto(s, REOP_split_next_first - greedy,
2267
518
                                 last_atom_start);
2268
6.74k
                } else {
2269
6.74k
                    if (quant_min == 1) {
2270
                        /* nothing to add */
2271
5.74k
                    } else {
2272
1.00k
                        if (dbuf_insert(&s->byte_code, last_atom_start, 5))
2273
1
                            goto out_of_memory;
2274
1.00k
                        s->byte_code.buf[last_atom_start] = REOP_push_i32;
2275
1.00k
                        put_u32(s->byte_code.buf + last_atom_start + 1,
2276
1.00k
                                quant_min);
2277
1.00k
                        last_atom_start += 5;
2278
1.00k
                        re_emit_goto(s, REOP_loop, last_atom_start);
2279
1.00k
                        re_emit_op(s, REOP_drop);
2280
1.00k
                    }
2281
6.74k
                    if (quant_max == INT32_MAX) {
2282
5.36k
                        pos = s->byte_code.size;
2283
5.36k
                        re_emit_op_u32(s, REOP_split_goto_first + greedy,
2284
5.36k
                                       len + 5 + add_zero_advance_check * 2);
2285
5.36k
                        if (add_zero_advance_check)
2286
5.17k
                            re_emit_op(s, REOP_push_char_pos);
2287
                        /* copy the atom */
2288
5.36k
                        dbuf_put_self(&s->byte_code, last_atom_start, len);
2289
5.36k
                        if (add_zero_advance_check)
2290
5.17k
                            re_emit_op(s, REOP_check_advance);
2291
5.36k
                        re_emit_goto(s, REOP_goto, pos);
2292
5.36k
                    } else if (quant_max > quant_min) {
2293
628
                        re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
2294
628
                        pos = s->byte_code.size;
2295
628
                        re_emit_op_u32(s, REOP_split_goto_first + greedy,
2296
628
                                       len + 5 + add_zero_advance_check * 2);
2297
628
                        if (add_zero_advance_check)
2298
264
                            re_emit_op(s, REOP_push_char_pos);
2299
                        /* copy the atom */
2300
628
                        dbuf_put_self(&s->byte_code, last_atom_start, len);
2301
628
                        if (add_zero_advance_check)
2302
264
                            re_emit_op(s, REOP_check_advance);
2303
628
                        re_emit_goto(s, REOP_loop, pos);
2304
628
                        re_emit_op(s, REOP_drop);
2305
628
                    }
2306
6.74k
                }
2307
12.6k
                last_atom_start = -1;
2308
12.6k
            }
2309
0
            break;
2310
336k
        default:
2311
336k
            break;
2312
358k
        }
2313
358k
    }
2314
362k
 done:
2315
362k
    s->buf_ptr = p;
2316
362k
    return 0;
2317
15
 out_of_memory:
2318
15
    return re_parse_out_of_memory(s);
2319
362k
}
2320
2321
static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
2322
306k
{
2323
306k
    const uint8_t *p;
2324
306k
    int ret;
2325
306k
    size_t start, term_start, end, term_size;
2326
2327
306k
    start = s->byte_code.size;
2328
668k
    for(;;) {
2329
668k
        p = s->buf_ptr;
2330
668k
        if (p >= s->buf_end)
2331
39.0k
            break;
2332
629k
        if (*p == '|' || *p == ')')
2333
17.4k
            break;
2334
612k
        term_start = s->byte_code.size;
2335
612k
        ret = re_parse_term(s, is_backward_dir);
2336
612k
        if (ret)
2337
249k
            return ret;
2338
362k
        if (is_backward_dir) {
2339
            /* reverse the order of the terms (XXX: inefficient, but
2340
               speed is not really critical here) */
2341
5.49k
            end = s->byte_code.size;
2342
5.49k
            term_size = end - term_start;
2343
5.49k
            if (dbuf_realloc(&s->byte_code, end + term_size))
2344
4
                return -1;
2345
5.48k
            memmove(s->byte_code.buf + start + term_size,
2346
5.48k
                    s->byte_code.buf + start,
2347
5.48k
                    end - start);
2348
5.48k
            memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
2349
5.48k
                   term_size);
2350
5.48k
        }
2351
362k
    }
2352
56.5k
    return 0;
2353
306k
}
2354
2355
static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
2356
304k
{
2357
304k
    int start, len, pos;
2358
2359
304k
    if (lre_check_stack_overflow(s->opaque, 0))
2360
1
        return re_parse_error(s, "stack overflow");
2361
2362
304k
    start = s->byte_code.size;
2363
304k
    if (re_parse_alternative(s, is_backward_dir))
2364
249k
        return -1;
2365
56.5k
    while (*s->buf_ptr == '|') {
2366
2.05k
        s->buf_ptr++;
2367
2368
2.05k
        len = s->byte_code.size - start;
2369
2370
        /* insert a split before the first alternative */
2371
2.05k
        if (dbuf_insert(&s->byte_code, start, 5)) {
2372
1
            return re_parse_out_of_memory(s);
2373
1
        }
2374
2.05k
        s->byte_code.buf[start] = REOP_split_next_first;
2375
2.05k
        put_u32(s->byte_code.buf + start + 1, len + 5);
2376
2377
2.05k
        pos = re_emit_op_u32(s, REOP_goto, 0);
2378
2379
2.05k
        if (re_parse_alternative(s, is_backward_dir))
2380
397
            return -1;
2381
2382
        /* patch the goto */
2383
1.65k
        len = s->byte_code.size - (pos + 4);
2384
1.65k
        put_u32(s->byte_code.buf + pos, len);
2385
1.65k
    }
2386
54.5k
    return 0;
2387
54.9k
}
2388
2389
/* the control flow is recursive so the analysis can be linear */
2390
static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
2391
38.4k
{
2392
38.4k
    int stack_size, stack_size_max, pos, opcode, len;
2393
38.4k
    uint32_t val;
2394
2395
38.4k
    stack_size = 0;
2396
38.4k
    stack_size_max = 0;
2397
38.4k
    bc_buf += RE_HEADER_LEN;
2398
38.4k
    bc_buf_len -= RE_HEADER_LEN;
2399
38.4k
    pos = 0;
2400
1.89G
    while (pos < bc_buf_len) {
2401
1.89G
        opcode = bc_buf[pos];
2402
1.89G
        len = reopcode_info[opcode].size;
2403
1.89G
        assert(opcode < REOP_COUNT);
2404
1.89G
        assert((pos + len) <= bc_buf_len);
2405
1.89G
        switch(opcode) {
2406
8.86M
        case REOP_push_i32:
2407
140M
        case REOP_push_char_pos:
2408
140M
            stack_size++;
2409
140M
            if (stack_size > stack_size_max) {
2410
6.82k
                if (stack_size > STACK_SIZE_MAX)
2411
1
                    return -1;
2412
6.82k
                stack_size_max = stack_size;
2413
6.82k
            }
2414
140M
            break;
2415
140M
        case REOP_drop:
2416
140M
        case REOP_check_advance:
2417
140M
            assert(stack_size > 0);
2418
140M
            stack_size--;
2419
140M
            break;
2420
144M
        case REOP_range:
2421
144M
        case REOP_range_i:
2422
144M
            val = get_u16(bc_buf + pos + 1);
2423
144M
            len += val * 4;
2424
144M
            break;
2425
1.65k
        case REOP_range32:
2426
2.95k
        case REOP_range32_i:
2427
2.95k
            val = get_u16(bc_buf + pos + 1);
2428
2.95k
            len += val * 8;
2429
2.95k
            break;
2430
1.89G
        }
2431
1.89G
        pos += len;
2432
1.89G
    }
2433
38.4k
    return stack_size_max;
2434
38.4k
}
2435
2436
static void *lre_bytecode_realloc(void *opaque, void *ptr, size_t size)
2437
315k
{
2438
315k
    if (size > (INT32_MAX / 2)) {
2439
        /* the bytecode cannot be larger than 2G. Leave some slack to 
2440
           avoid some overflows. */
2441
51
        return NULL;
2442
315k
    } else {
2443
315k
        return lre_realloc(opaque, ptr, size);
2444
315k
    }
2445
315k
}
2446
2447
/* 'buf' must be a zero terminated UTF-8 string of length buf_len.
2448
   Return NULL if error and allocate an error message in *perror_msg,
2449
   otherwise the compiled bytecode and its length in plen.
2450
*/
2451
uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
2452
                     const char *buf, size_t buf_len, int re_flags,
2453
                     void *opaque)
2454
41.4k
{
2455
41.4k
    REParseState s_s, *s = &s_s;
2456
41.4k
    int stack_size;
2457
41.4k
    BOOL is_sticky;
2458
2459
41.4k
    memset(s, 0, sizeof(*s));
2460
41.4k
    s->opaque = opaque;
2461
41.4k
    s->buf_ptr = (const uint8_t *)buf;
2462
41.4k
    s->buf_end = s->buf_ptr + buf_len;
2463
41.4k
    s->buf_start = s->buf_ptr;
2464
41.4k
    s->re_flags = re_flags;
2465
41.4k
    s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
2466
41.4k
    is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
2467
41.4k
    s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
2468
41.4k
    s->multi_line = ((re_flags & LRE_FLAG_MULTILINE) != 0);
2469
41.4k
    s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
2470
41.4k
    s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0);
2471
41.4k
    s->capture_count = 1;
2472
41.4k
    s->total_capture_count = -1;
2473
41.4k
    s->has_named_captures = -1;
2474
2475
41.4k
    dbuf_init2(&s->byte_code, opaque, lre_bytecode_realloc);
2476
41.4k
    dbuf_init2(&s->group_names, opaque, lre_realloc);
2477
2478
41.4k
    dbuf_put_u16(&s->byte_code, re_flags); /* first element is the flags */
2479
41.4k
    dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
2480
41.4k
    dbuf_putc(&s->byte_code, 0); /* stack size */
2481
41.4k
    dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
2482
2483
41.4k
    if (!is_sticky) {
2484
        /* iterate thru all positions (about the same as .*?( ... ) )
2485
           .  We do it without an explicit loop so that lock step
2486
           thread execution will be possible in an optimized
2487
           implementation */
2488
41.4k
        re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
2489
41.4k
        re_emit_op(s, REOP_any);
2490
41.4k
        re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
2491
41.4k
    }
2492
41.4k
    re_emit_op_u8(s, REOP_save_start, 0);
2493
2494
41.4k
    if (re_parse_disjunction(s, FALSE)) {
2495
3.05k
    error:
2496
3.05k
        dbuf_free(&s->byte_code);
2497
3.05k
        dbuf_free(&s->group_names);
2498
3.05k
        pstrcpy(error_msg, error_msg_size, s->u.error_msg);
2499
3.05k
        *plen = 0;
2500
3.05k
        return NULL;
2501
2.99k
    }
2502
2503
38.4k
    re_emit_op_u8(s, REOP_save_end, 0);
2504
2505
38.4k
    re_emit_op(s, REOP_match);
2506
2507
38.4k
    if (*s->buf_ptr != '\0') {
2508
43
        re_parse_error(s, "extraneous characters at the end");
2509
43
        goto error;
2510
43
    }
2511
2512
38.4k
    if (dbuf_error(&s->byte_code)) {
2513
9
        re_parse_out_of_memory(s);
2514
9
        goto error;
2515
9
    }
2516
2517
38.4k
    stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
2518
38.4k
    if (stack_size < 0) {
2519
1
        re_parse_error(s, "too many imbricated quantifiers");
2520
1
        goto error;
2521
1
    }
2522
2523
38.4k
    s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
2524
38.4k
    s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
2525
38.4k
    put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
2526
38.4k
            s->byte_code.size - RE_HEADER_LEN);
2527
2528
    /* add the named groups if needed */
2529
38.4k
    if (s->group_names.size > (s->capture_count - 1)) {
2530
22
        dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
2531
22
        put_u16(s->byte_code.buf + RE_HEADER_FLAGS,
2532
22
                lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS);
2533
22
    }
2534
38.4k
    dbuf_free(&s->group_names);
2535
2536
#ifdef DUMP_REOP
2537
    lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
2538
#endif
2539
2540
38.4k
    error_msg[0] = '\0';
2541
38.4k
    *plen = s->byte_code.size;
2542
38.4k
    return s->byte_code.buf;
2543
38.4k
}
2544
2545
static BOOL is_line_terminator(uint32_t c)
2546
5.93M
{
2547
5.93M
    return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
2548
5.93M
}
2549
2550
static BOOL is_word_char(uint32_t c)
2551
4.42k
{
2552
4.42k
    return ((c >= '0' && c <= '9') ||
2553
3.86k
            (c >= 'a' && c <= 'z') ||
2554
3.39k
            (c >= 'A' && c <= 'Z') ||
2555
2.89k
            (c == '_'));
2556
4.42k
}
2557
2558
#define GET_CHAR(c, cptr, cbuf_end, cbuf_type)                          \
2559
178M
    do {                                                                \
2560
178M
        if (cbuf_type == 0) {                                           \
2561
174M
            c = *cptr++;                                                \
2562
174M
        } else {                                                        \
2563
3.94M
            const uint16_t *_p = (const uint16_t *)cptr;                \
2564
3.94M
            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
2565
3.94M
            c = *_p++;                                                  \
2566
3.94M
            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
2567
6
                if (_p < _end && is_lo_surrogate(*_p)) {                \
2568
0
                    c = from_surrogate(c, *_p++);                       \
2569
0
                }                                                       \
2570
6
            }                                                           \
2571
3.94M
            cptr = (const void *)_p;                                    \
2572
3.94M
        }                                                               \
2573
178M
    } while (0)
2574
2575
#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type)                         \
2576
3.02k
    do {                                                                \
2577
3.02k
        if (cbuf_type == 0) {                                           \
2578
3.02k
            c = cptr[0];                                                \
2579
3.02k
        } else {                                                        \
2580
0
            const uint16_t *_p = (const uint16_t *)cptr;                \
2581
0
            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
2582
0
            c = *_p++;                                                  \
2583
0
            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
2584
0
                if (_p < _end && is_lo_surrogate(*_p)) {                \
2585
0
                    c = from_surrogate(c, *_p);                         \
2586
0
                }                                                       \
2587
0
            }                                                           \
2588
0
        }                                                               \
2589
3.02k
    } while (0)
2590
2591
#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                  \
2592
2.99k
    do {                                                                \
2593
2.99k
        if (cbuf_type == 0) {                                           \
2594
2.99k
            c = cptr[-1];                                               \
2595
2.99k
        } else {                                                        \
2596
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2597
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2598
0
            c = *_p;                                                    \
2599
0
            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
2600
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2601
0
                    c = from_surrogate(*--_p, c);                       \
2602
0
                }                                                       \
2603
0
            }                                                           \
2604
0
        }                                                               \
2605
2.99k
    } while (0)
2606
2607
#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                   \
2608
15.8M
    do {                                                                \
2609
15.8M
        if (cbuf_type == 0) {                                           \
2610
15.8M
            cptr--;                                                     \
2611
15.8M
            c = cptr[0];                                                \
2612
15.8M
        } else {                                                        \
2613
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2614
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2615
0
            c = *_p;                                                    \
2616
0
            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
2617
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2618
0
                    c = from_surrogate(*--_p, c);                       \
2619
0
                }                                                       \
2620
0
            }                                                           \
2621
0
            cptr = (const void *)_p;                                    \
2622
0
        }                                                               \
2623
15.8M
    } while (0)
2624
2625
#define PREV_CHAR(cptr, cbuf_start, cbuf_type)                          \
2626
95.5k
    do {                                                                \
2627
95.5k
        if (cbuf_type == 0) {                                           \
2628
95.5k
            cptr--;                                                     \
2629
95.5k
        } else {                                                        \
2630
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2631
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2632
0
            if (is_lo_surrogate(*_p) && cbuf_type == 2) {               \
2633
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2634
0
                    --_p;                                               \
2635
0
                }                                                       \
2636
0
            }                                                           \
2637
0
            cptr = (const void *)_p;                                    \
2638
0
        }                                                               \
2639
95.5k
    } while (0)
2640
2641
typedef uintptr_t StackInt;
2642
2643
typedef enum {
2644
    RE_EXEC_STATE_SPLIT,
2645
    RE_EXEC_STATE_LOOKAHEAD,
2646
    RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
2647
    RE_EXEC_STATE_GREEDY_QUANT,
2648
} REExecStateEnum;
2649
2650
typedef struct REExecState {
2651
    REExecStateEnum type : 8;
2652
    uint8_t stack_len;
2653
    size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
2654
    const uint8_t *cptr;
2655
    const uint8_t *pc;
2656
    void *buf[0];
2657
} REExecState;
2658
2659
typedef struct {
2660
    const uint8_t *cbuf;
2661
    const uint8_t *cbuf_end;
2662
    /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
2663
    int cbuf_type;
2664
    int capture_count;
2665
    int stack_size_max;
2666
    BOOL is_unicode;
2667
    int interrupt_counter;
2668
    void *opaque; /* used for stack overflow check */
2669
2670
    size_t state_size;
2671
    uint8_t *state_stack;
2672
    size_t state_stack_size;
2673
    size_t state_stack_len;
2674
} REExecContext;
2675
2676
static int push_state(REExecContext *s,
2677
                      uint8_t **capture,
2678
                      StackInt *stack, size_t stack_len,
2679
                      const uint8_t *pc, const uint8_t *cptr,
2680
                      REExecStateEnum type, size_t count)
2681
90.8M
{
2682
90.8M
    REExecState *rs;
2683
90.8M
    uint8_t *new_stack;
2684
90.8M
    size_t new_size, i, n;
2685
90.8M
    StackInt *stack_buf;
2686
2687
90.8M
    if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
2688
        /* reallocate the stack */
2689
1.66M
        new_size = s->state_stack_size * 3 / 2;
2690
1.66M
        if (new_size < 8)
2691
1.66M
            new_size = 8;
2692
1.66M
        new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
2693
1.66M
        if (!new_stack)
2694
1
            return -1;
2695
1.66M
        s->state_stack_size = new_size;
2696
1.66M
        s->state_stack = new_stack;
2697
1.66M
    }
2698
90.8M
    rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
2699
90.8M
    s->state_stack_len++;
2700
90.8M
    rs->type = type;
2701
90.8M
    rs->count = count;
2702
90.8M
    rs->stack_len = stack_len;
2703
90.8M
    rs->cptr = cptr;
2704
90.8M
    rs->pc = pc;
2705
90.8M
    n = 2 * s->capture_count;
2706
354M
    for(i = 0; i < n; i++)
2707
263M
        rs->buf[i] = capture[i];
2708
90.8M
    stack_buf = (StackInt *)(rs->buf + n);
2709
104M
    for(i = 0; i < stack_len; i++)
2710
13.1M
        stack_buf[i] = stack[i];
2711
90.8M
    return 0;
2712
90.8M
}
2713
2714
static int lre_poll_timeout(REExecContext *s)
2715
179M
{
2716
179M
    if (unlikely(--s->interrupt_counter <= 0)) {
2717
208
        s->interrupt_counter = INTERRUPT_COUNTER_INIT;
2718
208
        if (lre_check_timeout(s->opaque))
2719
108
            return LRE_RET_TIMEOUT;
2720
208
    }
2721
179M
    return 0;
2722
179M
}
2723
2724
/* return 1 if match, 0 if not match or < 0 if error. */
2725
static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2726
                                   StackInt *stack, int stack_len,
2727
                                   const uint8_t *pc, const uint8_t *cptr,
2728
                                   BOOL no_recurse)
2729
4.99M
{
2730
4.99M
    int opcode, ret;
2731
4.99M
    int cbuf_type;
2732
4.99M
    uint32_t val, c;
2733
4.99M
    const uint8_t *cbuf_end;
2734
2735
4.99M
    cbuf_type = s->cbuf_type;
2736
4.99M
    cbuf_end = s->cbuf_end;
2737
2738
460M
    for(;;) {
2739
        //        printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
2740
460M
        opcode = *pc++;
2741
460M
        switch(opcode) {
2742
1.91M
        case REOP_match:
2743
1.91M
            {
2744
1.91M
                REExecState *rs;
2745
1.91M
                if (no_recurse)
2746
201k
                    return (intptr_t)cptr;
2747
1.71M
                ret = 1;
2748
1.71M
                goto recurse;
2749
89.2M
            no_match:
2750
89.2M
                if (no_recurse)
2751
96.8k
                    return 0;
2752
89.1M
                ret = 0;
2753
90.8M
            recurse:
2754
92.5M
                for(;;) {
2755
92.5M
                    if (lre_poll_timeout(s))
2756
62
                        return LRE_RET_TIMEOUT;
2757
92.5M
                    if (s->state_stack_len == 0)
2758
4.69M
                        return ret;
2759
87.8M
                    rs = (REExecState *)(s->state_stack +
2760
87.8M
                                         (s->state_stack_len - 1) * s->state_size);
2761
87.8M
                    if (rs->type == RE_EXEC_STATE_SPLIT) {
2762
87.6M
                        if (!ret) {
2763
86.0M
                        pop_state:
2764
86.0M
                            memcpy(capture, rs->buf,
2765
86.0M
                                   sizeof(capture[0]) * 2 * s->capture_count);
2766
86.0M
                        pop_state1:
2767
86.0M
                            pc = rs->pc;
2768
86.0M
                            cptr = rs->cptr;
2769
86.0M
                            stack_len = rs->stack_len;
2770
86.0M
                            memcpy(stack, rs->buf + 2 * s->capture_count,
2771
86.0M
                                   stack_len * sizeof(stack[0]));
2772
86.0M
                            s->state_stack_len--;
2773
86.0M
                            break;
2774
86.0M
                        }
2775
87.6M
                    } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
2776
92.7k
                        if (!ret) {
2777
92.2k
                            uint32_t char_count, i;
2778
92.2k
                            memcpy(capture, rs->buf,
2779
92.2k
                                   sizeof(capture[0]) * 2 * s->capture_count);
2780
92.2k
                            stack_len = rs->stack_len;
2781
92.2k
                            memcpy(stack, rs->buf + 2 * s->capture_count,
2782
92.2k
                                   stack_len * sizeof(stack[0]));
2783
92.2k
                            pc = rs->pc;
2784
92.2k
                            cptr = rs->cptr;
2785
                            /* go backward */
2786
92.2k
                            char_count = get_u32(pc + 12);
2787
186k
                            for(i = 0; i < char_count; i++) {
2788
94.2k
                                PREV_CHAR(cptr, s->cbuf, cbuf_type);
2789
94.2k
                            }
2790
92.2k
                            pc = (pc + 16) + (int)get_u32(pc);
2791
92.2k
                            rs->cptr = cptr;
2792
92.2k
                            rs->count--;
2793
92.2k
                            if (rs->count == 0) {
2794
43.2k
                                s->state_stack_len--;
2795
43.2k
                            }
2796
92.2k
                            break;
2797
92.2k
                        }
2798
92.7k
                    } else {
2799
84.8k
                        ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
2800
40.5k
                               (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
2801
84.8k
                        if (ret) {
2802
                            /* keep the capture in case of positive lookahead */
2803
56.8k
                            if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
2804
44.3k
                                goto pop_state1;
2805
12.5k
                            else
2806
12.5k
                                goto pop_state;
2807
56.8k
                        }
2808
84.8k
                    }
2809
1.69M
                    s->state_stack_len--;
2810
1.69M
                }
2811
90.8M
            }
2812
86.1M
            break;
2813
86.1M
        case REOP_char32:
2814
288
        case REOP_char32_i:
2815
288
            val = get_u32(pc);
2816
288
            pc += 4;
2817
288
            goto test_char;
2818
87.4M
        case REOP_char:
2819
87.4M
        case REOP_char_i:
2820
87.4M
            val = get_u16(pc);
2821
87.4M
            pc += 2;
2822
87.4M
        test_char:
2823
87.4M
            if (cptr >= cbuf_end)
2824
385k
                goto no_match;
2825
87.0M
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2826
87.0M
            if (opcode == REOP_char_i || opcode == REOP_char32_i) {
2827
1.46k
                c = lre_canonicalize(c, s->is_unicode);
2828
1.46k
            }
2829
87.0M
            if (val != c)
2830
85.3M
                goto no_match;
2831
1.65M
            break;
2832
89.6M
        case REOP_split_goto_first:
2833
90.6M
        case REOP_split_next_first:
2834
90.6M
            {
2835
90.6M
                const uint8_t *pc1;
2836
2837
90.6M
                val = get_u32(pc);
2838
90.6M
                pc += 4;
2839
90.6M
                if (opcode == REOP_split_next_first) {
2840
1.00M
                    pc1 = pc + (int)val;
2841
89.6M
                } else {
2842
89.6M
                    pc1 = pc;
2843
89.6M
                    pc = pc + (int)val;
2844
89.6M
                }
2845
90.6M
                ret = push_state(s, capture, stack, stack_len,
2846
90.6M
                                 pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
2847
90.6M
                if (ret < 0)
2848
1
                    return LRE_RET_MEMORY_ERROR;
2849
90.6M
                break;
2850
90.6M
            }
2851
90.6M
        case REOP_lookahead:
2852
84.8k
        case REOP_negative_lookahead:
2853
84.8k
            val = get_u32(pc);
2854
84.8k
            pc += 4;
2855
84.8k
            ret = push_state(s, capture, stack, stack_len,
2856
84.8k
                             pc + (int)val, cptr,
2857
84.8k
                             RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
2858
84.8k
                             0);
2859
84.8k
            if (ret < 0)
2860
0
                return LRE_RET_MEMORY_ERROR;
2861
84.8k
            break;
2862
2863
85.7M
        case REOP_goto:
2864
85.7M
            val = get_u32(pc);
2865
85.7M
            pc += 4 + (int)val;
2866
85.7M
            if (lre_poll_timeout(s))
2867
9
                return LRE_RET_TIMEOUT;
2868
85.7M
            break;
2869
85.7M
        case REOP_line_start:
2870
22.2k
        case REOP_line_start_m:
2871
22.2k
            if (cptr == s->cbuf)
2872
21.3k
                break;
2873
941
            if (opcode == REOP_line_start)
2874
227
                goto no_match;
2875
714
            PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2876
714
            if (!is_line_terminator(c))
2877
299
                goto no_match;
2878
415
            break;
2879
5.23k
        case REOP_line_end:
2880
6.57k
        case REOP_line_end_m:
2881
6.57k
            if (cptr == cbuf_end)
2882
5.48k
                break;
2883
1.08k
            if (opcode == REOP_line_end)
2884
204
                goto no_match;
2885
878
            PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2886
878
            if (!is_line_terminator(c))
2887
449
                goto no_match;
2888
429
            break;
2889
5.98M
        case REOP_dot:
2890
5.98M
            if (cptr == cbuf_end)
2891
43.2k
                goto no_match;
2892
5.93M
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2893
5.93M
            if (is_line_terminator(c))
2894
3.02M
                goto no_match;
2895
2.90M
            break;
2896
85.2M
        case REOP_any:
2897
85.2M
            if (cptr == cbuf_end)
2898
17.9k
                goto no_match;
2899
85.2M
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2900
85.2M
            break;
2901
93.9M
        case REOP_save_start:
2902
100M
        case REOP_save_end:
2903
100M
            val = *pc++;
2904
100M
            assert(val < s->capture_count);
2905
100M
            capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
2906
100M
            break;
2907
456k
        case REOP_save_reset:
2908
456k
            {
2909
456k
                uint32_t val2;
2910
456k
                val = pc[0];
2911
456k
                val2 = pc[1];
2912
456k
                pc += 2;
2913
456k
                assert(val2 < s->capture_count);
2914
3.60M
                while (val <= val2) {
2915
3.14M
                    capture[2 * val] = NULL;
2916
3.14M
                    capture[2 * val + 1] = NULL;
2917
3.14M
                    val++;
2918
3.14M
                }
2919
456k
            }
2920
456k
            break;
2921
26.8k
        case REOP_push_i32:
2922
26.8k
            val = get_u32(pc);
2923
26.8k
            pc += 4;
2924
26.8k
            stack[stack_len++] = val;
2925
26.8k
            break;
2926
387k
        case REOP_drop:
2927
387k
            stack_len--;
2928
387k
            break;
2929
970k
        case REOP_loop:
2930
970k
            val = get_u32(pc);
2931
970k
            pc += 4;
2932
970k
            if (--stack[stack_len - 1] != 0) {
2933
596k
                pc += (int)val;
2934
596k
                if (lre_poll_timeout(s))
2935
27
                    return LRE_RET_TIMEOUT;
2936
596k
            }
2937
970k
            break;
2938
970k
        case REOP_push_char_pos:
2939
600k
            stack[stack_len++] = (uintptr_t)cptr;
2940
600k
            break;
2941
730k
        case REOP_check_advance:
2942
730k
            if (stack[--stack_len] == (uintptr_t)cptr)
2943
307k
                goto no_match;
2944
423k
            break;
2945
423k
        case REOP_word_boundary:
2946
2.70k
        case REOP_word_boundary_i:
2947
7.08k
        case REOP_not_word_boundary:
2948
7.62k
        case REOP_not_word_boundary_i:
2949
7.62k
            {
2950
7.62k
                BOOL v1, v2;
2951
7.62k
                int ignore_case = (opcode == REOP_word_boundary_i || opcode == REOP_not_word_boundary_i);
2952
7.62k
                BOOL is_boundary = (opcode == REOP_word_boundary || opcode == REOP_word_boundary_i);
2953
                /* char before */
2954
7.62k
                if (cptr == s->cbuf) {
2955
5.34k
                    v1 = FALSE;
2956
5.34k
                } else {
2957
2.27k
                    PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2958
2.27k
                    if (ignore_case)
2959
507
                        c = lre_canonicalize(c, s->is_unicode);
2960
2.27k
                    v1 = is_word_char(c);
2961
2.27k
                }
2962
                /* current char */
2963
7.62k
                if (cptr >= cbuf_end) {
2964
5.47k
                    v2 = FALSE;
2965
5.47k
                } else {
2966
2.14k
                    PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2967
2.14k
                    if (ignore_case)
2968
406
                        c = lre_canonicalize(c, s->is_unicode);
2969
2.14k
                    v2 = is_word_char(c);
2970
2.14k
                }
2971
7.62k
                if (v1 ^ v2 ^ is_boundary)
2972
4.07k
                    goto no_match;
2973
7.62k
            }
2974
3.55k
            break;
2975
5.38k
        case REOP_back_reference:
2976
5.98k
        case REOP_back_reference_i:
2977
44.0k
        case REOP_backward_back_reference:
2978
44.5k
        case REOP_backward_back_reference_i:
2979
44.5k
            {
2980
44.5k
                const uint8_t *cptr1, *cptr1_end, *cptr1_start;
2981
44.5k
                uint32_t c1, c2;
2982
2983
44.5k
                val = *pc++;
2984
44.5k
                if (val >= s->capture_count)
2985
238
                    goto no_match;
2986
44.3k
                cptr1_start = capture[2 * val];
2987
44.3k
                cptr1_end = capture[2 * val + 1];
2988
44.3k
                if (!cptr1_start || !cptr1_end)
2989
16.0k
                    break;
2990
28.2k
                if (opcode == REOP_back_reference ||
2991
25.7k
                    opcode == REOP_back_reference_i) {
2992
3.02k
                    cptr1 = cptr1_start;
2993
3.26k
                    while (cptr1 < cptr1_end) {
2994
2.61k
                        if (cptr >= cbuf_end)
2995
1.49k
                            goto no_match;
2996
1.12k
                        GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
2997
1.12k
                        GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
2998
1.12k
                        if (opcode == REOP_back_reference_i) {
2999
284
                            c1 = lre_canonicalize(c1, s->is_unicode);
3000
284
                            c2 = lre_canonicalize(c2, s->is_unicode);
3001
284
                        }
3002
1.12k
                        if (c1 != c2)
3003
877
                            goto no_match;
3004
1.12k
                    }
3005
25.2k
                } else {
3006
25.2k
                    cptr1 = cptr1_end;
3007
7.95M
                    while (cptr1 > cptr1_start) {
3008
7.93M
                        if (cptr == s->cbuf)
3009
5.12k
                            goto no_match;
3010
7.93M
                        GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
3011
7.93M
                        GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
3012
7.93M
                        if (opcode == REOP_backward_back_reference_i) {
3013
768
                            c1 = lre_canonicalize(c1, s->is_unicode);
3014
768
                            c2 = lre_canonicalize(c2, s->is_unicode);
3015
768
                        }
3016
7.93M
                        if (c1 != c2)
3017
7.20k
                            goto no_match;
3018
7.93M
                    }
3019
25.2k
                }
3020
28.2k
            }
3021
13.5k
            break;
3022
15.1k
        case REOP_range:
3023
17.1k
        case REOP_range_i:
3024
17.1k
            {
3025
17.1k
                int n;
3026
17.1k
                uint32_t low, high, idx_min, idx_max, idx;
3027
3028
17.1k
                n = get_u16(pc); /* n must be >= 1 */
3029
17.1k
                pc += 2;
3030
17.1k
                if (cptr >= cbuf_end)
3031
12.8k
                    goto no_match;
3032
4.33k
                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3033
4.33k
                if (opcode == REOP_range_i) {
3034
998
                    c = lre_canonicalize(c, s->is_unicode);
3035
998
                }
3036
4.33k
                idx_min = 0;
3037
4.33k
                low = get_u16(pc + 0 * 4);
3038
4.33k
                if (c < low)
3039
240
                    goto no_match;
3040
4.09k
                idx_max = n - 1;
3041
4.09k
                high = get_u16(pc + idx_max * 4 + 2);
3042
                /* 0xffff in for last value means +infinity */
3043
4.09k
                if (unlikely(c >= 0xffff) && high == 0xffff)
3044
0
                    goto range_match;
3045
4.09k
                if (c > high)
3046
258
                    goto no_match;
3047
11.1k
                while (idx_min <= idx_max) {
3048
10.9k
                    idx = (idx_min + idx_max) / 2;
3049
10.9k
                    low = get_u16(pc + idx * 4);
3050
10.9k
                    high = get_u16(pc + idx * 4 + 2);
3051
10.9k
                    if (c < low)
3052
4.26k
                        idx_max = idx - 1;
3053
6.67k
                    else if (c > high)
3054
3.10k
                        idx_min = idx + 1;
3055
3.57k
                    else
3056
3.57k
                        goto range_match;
3057
10.9k
                }
3058
256
                goto no_match;
3059
3.57k
            range_match:
3060
3.57k
                pc += 4 * n;
3061
3.57k
            }
3062
0
            break;
3063
1.85k
        case REOP_range32:
3064
2.43k
        case REOP_range32_i:
3065
2.43k
            {
3066
2.43k
                int n;
3067
2.43k
                uint32_t low, high, idx_min, idx_max, idx;
3068
3069
2.43k
                n = get_u16(pc); /* n must be >= 1 */
3070
2.43k
                pc += 2;
3071
2.43k
                if (cptr >= cbuf_end)
3072
963
                    goto no_match;
3073
1.47k
                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3074
1.47k
                if (opcode == REOP_range32_i) {
3075
482
                    c = lre_canonicalize(c, s->is_unicode);
3076
482
                }
3077
1.47k
                idx_min = 0;
3078
1.47k
                low = get_u32(pc + 0 * 8);
3079
1.47k
                if (c < low)
3080
743
                    goto no_match;
3081
733
                idx_max = n - 1;
3082
733
                high = get_u32(pc + idx_max * 8 + 4);
3083
733
                if (c > high)
3084
0
                    goto no_match;
3085
1.65k
                while (idx_min <= idx_max) {
3086
1.32k
                    idx = (idx_min + idx_max) / 2;
3087
1.32k
                    low = get_u32(pc + idx * 8);
3088
1.32k
                    high = get_u32(pc + idx * 8 + 4);
3089
1.32k
                    if (c < low)
3090
504
                        idx_max = idx - 1;
3091
821
                    else if (c > high)
3092
420
                        idx_min = idx + 1;
3093
401
                    else
3094
401
                        goto range32_match;
3095
1.32k
                }
3096
332
                goto no_match;
3097
401
            range32_match:
3098
401
                pc += 8 * n;
3099
401
            }
3100
0
            break;
3101
1.68k
        case REOP_prev:
3102
            /* go to the previous char */
3103
1.68k
            if (cptr == s->cbuf)
3104
465
                goto no_match;
3105
1.21k
            PREV_CHAR(cptr, s->cbuf, cbuf_type);
3106
1.21k
            break;
3107
180k
        case REOP_simple_greedy_quant:
3108
180k
            {
3109
180k
                uint32_t next_pos, quant_min, quant_max;
3110
180k
                size_t q;
3111
180k
                intptr_t res;
3112
180k
                const uint8_t *pc1;
3113
3114
180k
                next_pos = get_u32(pc);
3115
180k
                quant_min = get_u32(pc + 4);
3116
180k
                quant_max = get_u32(pc + 8);
3117
180k
                pc += 16;
3118
180k
                pc1 = pc;
3119
180k
                pc += (int)next_pos;
3120
3121
180k
                q = 0;
3122
297k
                for(;;) {
3123
297k
                    if (lre_poll_timeout(s))
3124
10
                        return LRE_RET_TIMEOUT;
3125
297k
                    res = lre_exec_backtrack(s, capture, stack, stack_len,
3126
297k
                                             pc1, cptr, TRUE);
3127
297k
                    if (res == LRE_RET_MEMORY_ERROR ||
3128
297k
                        res == LRE_RET_TIMEOUT)
3129
0
                        return res;
3130
297k
                    if (!res)
3131
96.8k
                        break;
3132
201k
                    cptr = (uint8_t *)res;
3133
201k
                    q++;
3134
201k
                    if (q >= quant_max && quant_max != INT32_MAX)
3135
83.9k
                        break;
3136
201k
                }
3137
180k
                if (q < quant_min)
3138
5.32k
                    goto no_match;
3139
175k
                if (q > quant_min) {
3140
                    /* will examine all matches down to quant_min */
3141
93.5k
                    ret = push_state(s, capture, stack, stack_len,
3142
93.5k
                                     pc1 - 16, cptr,
3143
93.5k
                                     RE_EXEC_STATE_GREEDY_QUANT,
3144
93.5k
                                     q - quant_min);
3145
93.5k
                    if (ret < 0)
3146
0
                        return LRE_RET_MEMORY_ERROR;
3147
93.5k
                }
3148
175k
            }
3149
175k
            break;
3150
175k
        default:
3151
0
            abort();
3152
460M
        }
3153
460M
    }
3154
4.99M
}
3155
3156
/* Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the
3157
   starting position of the match and must be such as 0 <= cindex <=
3158
   clen. */
3159
int lre_exec(uint8_t **capture,
3160
             const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
3161
             int cbuf_type, void *opaque)
3162
4.69M
{
3163
4.69M
    REExecContext s_s, *s = &s_s;
3164
4.69M
    int re_flags, i, alloca_size, ret;
3165
4.69M
    StackInt *stack_buf;
3166
4.69M
    const uint8_t *cptr;
3167
3168
4.69M
    re_flags = lre_get_flags(bc_buf);
3169
4.69M
    s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
3170
4.69M
    s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
3171
4.69M
    s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
3172
4.69M
    s->cbuf = cbuf;
3173
4.69M
    s->cbuf_end = cbuf + (clen << cbuf_type);
3174
4.69M
    s->cbuf_type = cbuf_type;
3175
4.69M
    if (s->cbuf_type == 1 && s->is_unicode)
3176
3.02M
        s->cbuf_type = 2;
3177
4.69M
    s->interrupt_counter = INTERRUPT_COUNTER_INIT;
3178
4.69M
    s->opaque = opaque;
3179
3180
4.69M
    s->state_size = sizeof(REExecState) +
3181
4.69M
        s->capture_count * sizeof(capture[0]) * 2 +
3182
4.69M
        s->stack_size_max * sizeof(stack_buf[0]);
3183
4.69M
    s->state_stack = NULL;
3184
4.69M
    s->state_stack_len = 0;
3185
4.69M
    s->state_stack_size = 0;
3186
3187
14.0M
    for(i = 0; i < s->capture_count * 2; i++)
3188
9.40M
        capture[i] = NULL;
3189
4.69M
    alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
3190
4.69M
    stack_buf = alloca(alloca_size);
3191
3192
4.69M
    cptr = cbuf + (cindex << cbuf_type);
3193
4.69M
    if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
3194
3.02M
        const uint16_t *p = (const uint16_t *)cptr;
3195
3.02M
        if (is_lo_surrogate(*p) && is_hi_surrogate(p[-1])) {
3196
0
            cptr = (const uint8_t *)(p - 1);
3197
0
        }
3198
3.02M
    }
3199
3200
4.69M
    ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
3201
4.69M
                             cptr, FALSE);
3202
4.69M
    lre_realloc(s->opaque, s->state_stack, 0);
3203
4.69M
    return ret;
3204
4.69M
}
3205
3206
int lre_get_capture_count(const uint8_t *bc_buf)
3207
3.07M
{
3208
3.07M
    return bc_buf[RE_HEADER_CAPTURE_COUNT];
3209
3.07M
}
3210
3211
int lre_get_flags(const uint8_t *bc_buf)
3212
10.4M
{
3213
10.4M
    return get_u16(bc_buf + RE_HEADER_FLAGS);
3214
10.4M
}
3215
3216
/* Return NULL if no group names. Otherwise, return a pointer to
3217
   'capture_count - 1' zero terminated UTF-8 strings. */
3218
const char *lre_get_groupnames(const uint8_t *bc_buf)
3219
33.6k
{
3220
33.6k
    uint32_t re_bytecode_len;
3221
33.6k
    if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
3222
33.6k
        return NULL;
3223
0
    re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
3224
0
    return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
3225
33.6k
}
3226
3227
#ifdef TEST
3228
3229
BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
3230
{
3231
    return FALSE;
3232
}
3233
3234
void *lre_realloc(void *opaque, void *ptr, size_t size)
3235
{
3236
    return realloc(ptr, size);
3237
}
3238
3239
int main(int argc, char **argv)
3240
{
3241
    int len, flags, ret, i;
3242
    uint8_t *bc;
3243
    char error_msg[64];
3244
    uint8_t *capture[CAPTURE_COUNT_MAX * 2];
3245
    const char *input;
3246
    int input_len, capture_count;
3247
3248
    if (argc < 4) {
3249
        printf("usage: %s regexp flags input\n", argv[0]);
3250
        return 1;
3251
    }
3252
    flags = atoi(argv[2]);
3253
    bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
3254
                     strlen(argv[1]), flags, NULL);
3255
    if (!bc) {
3256
        fprintf(stderr, "error: %s\n", error_msg);
3257
        exit(1);
3258
    }
3259
3260
    input = argv[3];
3261
    input_len = strlen(input);
3262
3263
    ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
3264
    printf("ret=%d\n", ret);
3265
    if (ret == 1) {
3266
        capture_count = lre_get_capture_count(bc);
3267
        for(i = 0; i < 2 * capture_count; i++) {
3268
            uint8_t *ptr;
3269
            ptr = capture[i];
3270
            printf("%d: ", i);
3271
            if (!ptr)
3272
                printf("<nil>");
3273
            else
3274
                printf("%u", (int)(ptr - (uint8_t *)input));
3275
            printf("\n");
3276
        }
3277
    }
3278
    return 0;
3279
}
3280
#endif