Coverage Report

Created: 2026-02-26 07:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/quickjs/libregexp.c
Line
Count
Source
1
/*
2
 * Regular Expression Engine
3
 *
4
 * Copyright (c) 2017-2018 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24
#include <stdlib.h>
25
#include <stdio.h>
26
#include <stdarg.h>
27
#include <inttypes.h>
28
#include <string.h>
29
#include <assert.h>
30
31
#include "cutils.h"
32
#include "libregexp.h"
33
#include "libunicode.h"
34
35
/*
36
  TODO:
37
  - remove REOP_char_i and REOP_range_i by precomputing the case folding.
38
  - add specific opcodes for simple unicode property tests so that the
39
    generated bytecode is smaller.
40
  - Add a lock step execution mode (=linear time execution guaranteed)
41
    when the regular expression is "simple" i.e. no backreference nor
42
    complicated lookahead. The opcodes are designed for this execution
43
    model.
44
*/
45
46
#if defined(TEST) 
47
#define DUMP_REOP
48
#endif
49
//#define DUMP_REOP
50
//#define DUMP_EXEC
51
52
typedef enum {
53
#define DEF(id, size) REOP_ ## id,
54
#include "libregexp-opcode.h"
55
#undef DEF
56
    REOP_COUNT,
57
} REOPCodeEnum;
58
59
0
#define CAPTURE_COUNT_MAX 255
60
0
#define REGISTER_COUNT_MAX 255
61
/* must be large enough to have a negligible runtime cost and small
62
   enough to call the interrupt callback often. */
63
0
#define INTERRUPT_COUNTER_INIT 10000
64
65
/* unicode code points */
66
0
#define CP_LS   0x2028
67
0
#define CP_PS   0x2029
68
69
#define TMP_BUF_SIZE 128
70
71
typedef struct {
72
    DynBuf byte_code;
73
    const uint8_t *buf_ptr;
74
    const uint8_t *buf_end;
75
    const uint8_t *buf_start;
76
    int re_flags;
77
    BOOL is_unicode;
78
    BOOL unicode_sets; /* if set, is_unicode is also set */
79
    BOOL ignore_case;
80
    BOOL multi_line;
81
    BOOL dotall;
82
    uint8_t group_name_scope;
83
    int capture_count;
84
    int total_capture_count; /* -1 = not computed yet */
85
    int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
86
    void *opaque;
87
    DynBuf group_names;
88
    union {
89
        char error_msg[TMP_BUF_SIZE];
90
        char tmp_buf[TMP_BUF_SIZE];
91
    } u;
92
} REParseState;
93
94
typedef struct {
95
#ifdef DUMP_REOP
96
    const char *name;
97
#endif
98
    uint8_t size;
99
} REOpCode;
100
101
static const REOpCode reopcode_info[REOP_COUNT] = {
102
#ifdef DUMP_REOP
103
#define DEF(id, size) { #id, size },
104
#else
105
#define DEF(id, size) { size },
106
#endif
107
#include "libregexp-opcode.h"
108
#undef DEF
109
};
110
111
0
#define RE_HEADER_FLAGS          0
112
0
#define RE_HEADER_CAPTURE_COUNT  2
113
0
#define RE_HEADER_REGISTER_COUNT 3
114
0
#define RE_HEADER_BYTECODE_LEN   4
115
116
0
#define RE_HEADER_LEN 8
117
118
0
static inline int is_digit(int c) {
119
0
    return c >= '0' && c <= '9';
120
0
}
121
122
/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
123
static int dbuf_insert(DynBuf *s, int pos, int len)
124
0
{
125
0
    if (dbuf_claim(s, len))
126
0
        return -1;
127
0
    memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
128
0
    s->size += len;
129
0
    return 0;
130
0
}
131
132
typedef struct REString {
133
    struct REString *next;
134
    uint32_t hash;
135
    uint32_t len;
136
    uint32_t buf[];
137
} REString;
138
139
typedef struct {
140
    /* the string list is the union of 'char_range' and of the strings
141
       in hash_table[]. The strings in hash_table[] have a length !=
142
       1. */
143
    CharRange cr;
144
    uint32_t n_strings;
145
    uint32_t hash_size;
146
    int hash_bits;
147
    REString **hash_table;
148
} REStringList;
149
150
static uint32_t re_string_hash(int len, const uint32_t *buf)
151
0
{
152
0
    int i;
153
0
    uint32_t h;
154
0
    h = 1;
155
0
    for(i = 0; i < len; i++)
156
0
        h = h * 263 + buf[i];
157
0
    return h * 0x61C88647;
158
0
}
159
160
static void re_string_list_init(REParseState *s1, REStringList *s)
161
0
{
162
0
    cr_init(&s->cr, s1->opaque, lre_realloc);
163
0
    s->n_strings = 0;
164
0
    s->hash_size = 0;
165
0
    s->hash_bits = 0;
166
0
    s->hash_table = NULL;
167
0
}
168
169
static void re_string_list_free(REStringList *s)
170
0
{
171
0
    REString *p, *p_next;
172
0
    int i;
173
0
    for(i = 0; i < s->hash_size; i++) {
174
0
        for(p = s->hash_table[i]; p != NULL; p = p_next) {
175
0
            p_next = p->next;
176
0
            lre_realloc(s->cr.mem_opaque, p, 0);
177
0
        }
178
0
    }
179
0
    lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
180
181
0
    cr_free(&s->cr);
182
0
}
183
184
static void lre_print_char(int c, BOOL is_range)
185
0
{
186
0
    if (c == '\'' || c == '\\' ||
187
0
        (is_range && (c == '-' || c == ']'))) {
188
0
        printf("\\%c", c);
189
0
    } else if (c >= ' ' && c <= 126) {
190
0
        printf("%c", c);
191
0
    } else {
192
0
        printf("\\u{%04x}", c);
193
0
    }
194
0
}
195
196
static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s)
197
0
{
198
0
    REString *p;
199
0
    const CharRange *cr;
200
0
    int i, j, k;
201
0
202
0
    printf("%s:\n", str);
203
0
    printf("  ranges: [");
204
0
    cr = &s->cr;
205
0
    for(i = 0; i < cr->len; i += 2) {
206
0
        lre_print_char(cr->points[i], TRUE);
207
0
        if (cr->points[i] != cr->points[i + 1] - 1) {
208
0
            printf("-");
209
0
            lre_print_char(cr->points[i + 1] - 1, TRUE);
210
0
        }
211
0
    }
212
0
    printf("]\n");
213
0
    
214
0
    j = 0;
215
0
    for(i = 0; i < s->hash_size; i++) {
216
0
        for(p = s->hash_table[i]; p != NULL; p = p->next) {
217
0
            printf("  %d/%d: '", j, s->n_strings);
218
0
            for(k = 0; k < p->len; k++) {
219
0
                lre_print_char(p->buf[k], FALSE);
220
0
            }
221
0
            printf("'\n");
222
0
            j++;
223
0
        }
224
0
    }
225
0
}
226
227
static int re_string_find2(REStringList *s, int len, const uint32_t *buf,
228
                           uint32_t h0, BOOL add_flag)
229
0
{
230
0
    uint32_t h = 0; /* avoid warning */
231
0
    REString *p;
232
0
    if (s->n_strings != 0) {
233
0
        h = h0 >> (32 - s->hash_bits);
234
0
        for(p = s->hash_table[h]; p != NULL; p = p->next) {
235
0
            if (p->hash == h0 && p->len == len &&
236
0
                !memcmp(p->buf, buf, len * sizeof(buf[0]))) {
237
0
                return 1;
238
0
            }
239
0
        }
240
0
    }
241
    /* not found */
242
0
    if (!add_flag)
243
0
        return 0;
244
    /* increase the size of the hash table if needed */
245
0
    if (unlikely((s->n_strings + 1) > s->hash_size)) {
246
0
        REString **new_hash_table, *p_next;
247
0
        int new_hash_bits, i;
248
0
        uint32_t new_hash_size;
249
0
        new_hash_bits = max_int(s->hash_bits + 1, 4);
250
0
        new_hash_size = 1 << new_hash_bits;
251
0
        new_hash_table = lre_realloc(s->cr.mem_opaque, NULL,
252
0
                                     sizeof(new_hash_table[0]) * new_hash_size);
253
0
        if (!new_hash_table)
254
0
            return -1;
255
0
        memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size);
256
0
        for(i = 0; i < s->hash_size; i++) {
257
0
            for(p = s->hash_table[i]; p != NULL; p = p_next) {
258
0
                p_next = p->next;
259
0
                h = p->hash >> (32 - new_hash_bits);
260
0
                p->next = new_hash_table[h];
261
0
                new_hash_table[h] = p;
262
0
            }
263
0
        }
264
0
        lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
265
0
        s->hash_bits = new_hash_bits;
266
0
        s->hash_size = new_hash_size;
267
0
        s->hash_table = new_hash_table;
268
0
        h = h0 >> (32 - s->hash_bits);
269
0
    }
270
271
0
    p = lre_realloc(s->cr.mem_opaque, NULL, sizeof(REString) + len * sizeof(buf[0]));
272
0
    if (!p)
273
0
        return -1;
274
0
    p->next = s->hash_table[h];
275
0
    s->hash_table[h] = p;
276
0
    s->n_strings++;
277
0
    p->hash = h0;
278
0
    p->len = len;
279
0
    memcpy(p->buf, buf, sizeof(buf[0]) * len);
280
0
    return 1;
281
0
}
282
283
static int re_string_find(REStringList *s, int len, const uint32_t *buf,
284
                          BOOL add_flag)
285
0
{
286
0
    uint32_t h0;
287
0
    h0 = re_string_hash(len, buf);
288
0
    return re_string_find2(s, len, buf, h0, add_flag);
289
0
}
290
291
/* return -1 if memory error, 0 if OK */
292
static int re_string_add(REStringList *s, int len, const uint32_t *buf)
293
0
{
294
0
    if (len == 1) {
295
0
        return cr_union_interval(&s->cr, buf[0], buf[0]);
296
0
    }
297
0
    if (re_string_find(s, len, buf, TRUE) < 0)
298
0
        return -1;
299
0
    return 0;
300
0
}
301
302
/* a = a op b */
303
static int re_string_list_op(REStringList *a, REStringList *b, int op)
304
0
{
305
0
    int i, ret;
306
0
    REString *p, **pp;
307
308
0
    if (cr_op1(&a->cr, b->cr.points, b->cr.len, op))
309
0
        return -1;
310
311
0
    switch(op) {
312
0
    case CR_OP_UNION:
313
0
        if (b->n_strings != 0) {
314
0
            for(i = 0; i < b->hash_size; i++) {
315
0
                for(p = b->hash_table[i]; p != NULL; p = p->next) {
316
0
                    if (re_string_find2(a, p->len, p->buf, p->hash, TRUE) < 0)
317
0
                        return -1;
318
0
                }
319
0
            }
320
0
        }
321
0
        break;
322
0
    case CR_OP_INTER:
323
0
    case CR_OP_SUB:
324
0
        for(i = 0; i < a->hash_size; i++) {
325
0
            pp = &a->hash_table[i];
326
0
            for(;;) {
327
0
                p = *pp;
328
0
                if (p == NULL)
329
0
                    break;
330
0
                ret = re_string_find2(b, p->len, p->buf, p->hash, FALSE);
331
0
                if (op == CR_OP_SUB)
332
0
                    ret = !ret;
333
0
                if (!ret) {
334
                    /* remove it */
335
0
                    *pp = p->next;
336
0
                    a->n_strings--;
337
0
                    lre_realloc(a->cr.mem_opaque, p, 0);
338
0
                } else {
339
                    /* keep it */
340
0
                    pp = &p->next;
341
0
                }
342
0
            }
343
0
        }
344
0
        break;
345
0
    default:
346
0
        abort();
347
0
    }
348
0
    return 0;
349
0
}
350
351
static int re_string_list_canonicalize(REParseState *s1,
352
                                       REStringList *s, BOOL is_unicode)
353
0
{
354
0
    if (cr_regexp_canonicalize(&s->cr, is_unicode))
355
0
        return -1;
356
0
    if (s->n_strings != 0) {
357
0
        REStringList a_s, *a = &a_s;
358
0
        int i, j;
359
0
        REString *p;
360
        
361
        /* XXX: simplify */
362
0
        re_string_list_init(s1, a);
363
364
0
        a->n_strings = s->n_strings;
365
0
        a->hash_size = s->hash_size;
366
0
        a->hash_bits = s->hash_bits;
367
0
        a->hash_table = s->hash_table;
368
        
369
0
        s->n_strings = 0;
370
0
        s->hash_size = 0;
371
0
        s->hash_bits = 0;
372
0
        s->hash_table = NULL;
373
374
0
        for(i = 0; i < a->hash_size; i++) {
375
0
            for(p = a->hash_table[i]; p != NULL; p = p->next) {
376
0
                for(j = 0; j < p->len; j++) {
377
0
                    p->buf[j] = lre_canonicalize(p->buf[j], is_unicode);
378
0
                }
379
0
                if (re_string_add(s, p->len, p->buf)) {
380
0
                    re_string_list_free(a);
381
0
                    return -1;
382
0
                }
383
0
            }
384
0
        }
385
0
        re_string_list_free(a);
386
0
    }
387
0
    return 0;
388
0
}
389
390
static const uint16_t char_range_d[] = {
391
    1,
392
    0x0030, 0x0039 + 1,
393
};
394
395
/* code point ranges for Zs,Zl or Zp property */
396
static const uint16_t char_range_s[] = {
397
    10,
398
    0x0009, 0x000D + 1,
399
    0x0020, 0x0020 + 1,
400
    0x00A0, 0x00A0 + 1,
401
    0x1680, 0x1680 + 1,
402
    0x2000, 0x200A + 1,
403
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
404
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
405
    0x2028, 0x2029 + 1,
406
    0x202F, 0x202F + 1,
407
    0x205F, 0x205F + 1,
408
    0x3000, 0x3000 + 1,
409
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
410
    0xFEFF, 0xFEFF + 1,
411
};
412
413
static const uint16_t char_range_w[] = {
414
    4,
415
    0x0030, 0x0039 + 1,
416
    0x0041, 0x005A + 1,
417
    0x005F, 0x005F + 1,
418
    0x0061, 0x007A + 1,
419
};
420
421
0
#define CLASS_RANGE_BASE 0x40000000
422
423
typedef enum {
424
    CHAR_RANGE_d,
425
    CHAR_RANGE_D,
426
    CHAR_RANGE_s,
427
    CHAR_RANGE_S,
428
    CHAR_RANGE_w,
429
    CHAR_RANGE_W,
430
} CharRangeEnum;
431
432
static const uint16_t * const char_range_table[] = {
433
    char_range_d,
434
    char_range_s,
435
    char_range_w,
436
};
437
438
static int cr_init_char_range(REParseState *s, REStringList *cr, uint32_t c)
439
0
{
440
0
    BOOL invert;
441
0
    const uint16_t *c_pt;
442
0
    int len, i;
443
444
0
    invert = c & 1;
445
0
    c_pt = char_range_table[c >> 1];
446
0
    len = *c_pt++;
447
0
    re_string_list_init(s, cr);
448
0
    for(i = 0; i < len * 2; i++) {
449
0
        if (cr_add_point(&cr->cr, c_pt[i]))
450
0
            goto fail;
451
0
    }
452
0
    if (invert) {
453
0
        if (cr_invert(&cr->cr))
454
0
            goto fail;
455
0
    }
456
0
    return 0;
457
0
 fail:
458
0
    re_string_list_free(cr);
459
0
    return -1;
460
0
}
461
462
#ifdef DUMP_REOP
463
static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
464
                                                     int buf_len)
465
{
466
    int pos, len, opcode, bc_len, re_flags, i;
467
    uint32_t val, val2;
468
469
    assert(buf_len >= RE_HEADER_LEN);
470
471
    re_flags = lre_get_flags(buf);
472
    bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
473
    assert(bc_len + RE_HEADER_LEN <= buf_len);
474
    printf("flags: 0x%x capture_count=%d reg_count=%d\n",
475
           re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_REGISTER_COUNT]);
476
    if (re_flags & LRE_FLAG_NAMED_GROUPS) {
477
        const char *p;
478
        p = (char *)buf + RE_HEADER_LEN + bc_len;
479
        printf("named groups: ");
480
        for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
481
            if (i != 1)
482
                printf(",");
483
            printf("<%s>", p);
484
            p += strlen(p) + LRE_GROUP_NAME_TRAILER_LEN;
485
        }
486
        printf("\n");
487
        assert(p == (char *)(buf + buf_len));
488
    }
489
    printf("bytecode_len=%d\n", bc_len);
490
491
    buf += RE_HEADER_LEN;
492
    pos = 0;
493
    while (pos < bc_len) {
494
        printf("%5u: ", pos);
495
        opcode = buf[pos];
496
        len = reopcode_info[opcode].size;
497
        if (opcode >= REOP_COUNT) {
498
            printf(" invalid opcode=0x%02x\n", opcode);
499
            break;
500
        }
501
        if ((pos + len) > bc_len) {
502
            printf(" buffer overflow (opcode=0x%02x)\n", opcode);
503
            break;
504
        }
505
        printf("%s", reopcode_info[opcode].name);
506
        switch(opcode) {
507
        case REOP_char:
508
        case REOP_char_i:
509
            val = get_u16(buf + pos + 1);
510
            if (val >= ' ' && val <= 126)
511
                printf(" '%c'", val);
512
            else
513
                printf(" 0x%04x", val);
514
            break;
515
        case REOP_char32:
516
        case REOP_char32_i:
517
            val = get_u32(buf + pos + 1);
518
            if (val >= ' ' && val <= 126)
519
                printf(" '%c'", val);
520
            else
521
                printf(" 0x%08x", val);
522
            break;
523
        case REOP_goto:
524
        case REOP_split_goto_first:
525
        case REOP_split_next_first:
526
        case REOP_lookahead:
527
        case REOP_negative_lookahead:
528
            val = get_u32(buf + pos + 1);
529
            val += (pos + 5);
530
            printf(" %u", val);
531
            break;
532
        case REOP_loop:
533
            val2 = buf[pos + 1];
534
            val = get_u32(buf + pos + 2);
535
            val += (pos + 6);
536
            printf(" r%u, %u", val2, val);
537
            break;
538
        case REOP_loop_split_goto_first:
539
        case REOP_loop_split_next_first:
540
        case REOP_loop_check_adv_split_goto_first:
541
        case REOP_loop_check_adv_split_next_first:
542
            {
543
                uint32_t limit;
544
                val2 = buf[pos + 1];
545
                limit = get_u32(buf + pos + 2);
546
                val = get_u32(buf + pos + 6);
547
                val += (pos + 10);
548
                printf(" r%u, %u, %u", val2, limit, val);
549
            }
550
            break;
551
        case REOP_save_start:
552
        case REOP_save_end:
553
            printf(" %u", buf[pos + 1]);
554
            break;
555
        case REOP_back_reference:
556
        case REOP_back_reference_i:
557
        case REOP_backward_back_reference:
558
        case REOP_backward_back_reference_i:
559
            {
560
                int n, i;
561
                n = buf[pos + 1];
562
                len += n;
563
                for(i = 0; i < n; i++) {
564
                    if (i != 0)
565
                        printf(",");
566
                    printf(" %u", buf[pos + 2 + i]);
567
                }
568
            }
569
            break;
570
        case REOP_save_reset:
571
            printf(" %u %u", buf[pos + 1], buf[pos + 2]);
572
            break;
573
        case REOP_set_i32:
574
            val = buf[pos + 1];
575
            val2 = get_u32(buf + pos + 2);
576
            printf(" r%u, %d", val, val2);
577
            break;
578
        case REOP_set_char_pos:
579
        case REOP_check_advance:
580
            val = buf[pos + 1];
581
            printf(" r%u", val);
582
            break;
583
        case REOP_range:
584
        case REOP_range_i:
585
            {
586
                int n, i;
587
                n = get_u16(buf + pos + 1);
588
                len += n * 4;
589
                for(i = 0; i < n * 2; i++) {
590
                    val = get_u16(buf + pos + 3 + i * 2);
591
                    printf(" 0x%04x", val);
592
                }
593
            }
594
            break;
595
        case REOP_range32:
596
        case REOP_range32_i:
597
            {
598
                int n, i;
599
                n = get_u16(buf + pos + 1);
600
                len += n * 8;
601
                for(i = 0; i < n * 2; i++) {
602
                    val = get_u32(buf + pos + 3 + i * 4);
603
                    printf(" 0x%08x", val);
604
                }
605
            }
606
            break;
607
        default:
608
            break;
609
        }
610
        printf("\n");
611
        pos += len;
612
    }
613
}
614
#endif
615
616
static void re_emit_op(REParseState *s, int op)
617
0
{
618
0
    dbuf_putc(&s->byte_code, op);
619
0
}
620
621
/* return the offset of the u32 value */
622
static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
623
0
{
624
0
    int pos;
625
0
    dbuf_putc(&s->byte_code, op);
626
0
    pos = s->byte_code.size;
627
0
    dbuf_put_u32(&s->byte_code, val);
628
0
    return pos;
629
0
}
630
631
static int re_emit_goto(REParseState *s, int op, uint32_t val)
632
0
{
633
0
    int pos;
634
0
    dbuf_putc(&s->byte_code, op);
635
0
    pos = s->byte_code.size;
636
0
    dbuf_put_u32(&s->byte_code, val - (pos + 4));
637
0
    return pos;
638
0
}
639
640
static int re_emit_goto_u8(REParseState *s, int op, uint32_t arg, uint32_t val)
641
0
{
642
0
    int pos;
643
0
    dbuf_putc(&s->byte_code, op);
644
0
    dbuf_putc(&s->byte_code, arg);
645
0
    pos = s->byte_code.size;
646
0
    dbuf_put_u32(&s->byte_code, val - (pos + 4));
647
0
    return pos;
648
0
}
649
650
static int re_emit_goto_u8_u32(REParseState *s, int op, uint32_t arg0, uint32_t arg1, uint32_t val)
651
0
{
652
0
    int pos;
653
0
    dbuf_putc(&s->byte_code, op);
654
0
    dbuf_putc(&s->byte_code, arg0);
655
0
    dbuf_put_u32(&s->byte_code, arg1);
656
0
    pos = s->byte_code.size;
657
0
    dbuf_put_u32(&s->byte_code, val - (pos + 4));
658
0
    return pos;
659
0
}
660
661
static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
662
0
{
663
0
    dbuf_putc(&s->byte_code, op);
664
0
    dbuf_putc(&s->byte_code, val);
665
0
}
666
667
static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
668
0
{
669
0
    dbuf_putc(&s->byte_code, op);
670
0
    dbuf_put_u16(&s->byte_code, val);
671
0
}
672
673
static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
674
0
{
675
0
    va_list ap;
676
0
    va_start(ap, fmt);
677
0
    vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
678
0
    va_end(ap);
679
0
    return -1;
680
0
}
681
682
static int re_parse_out_of_memory(REParseState *s)
683
0
{
684
0
    return re_parse_error(s, "out of memory");
685
0
}
686
687
/* If allow_overflow is false, return -1 in case of
688
   overflow. Otherwise return INT32_MAX. */
689
static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
690
0
{
691
0
    const uint8_t *p;
692
0
    uint64_t v;
693
0
    int c;
694
695
0
    p = *pp;
696
0
    v = 0;
697
0
    for(;;) {
698
0
        c = *p;
699
0
        if (c < '0' || c > '9')
700
0
            break;
701
0
        v = v * 10 + c - '0';
702
0
        if (v >= INT32_MAX) {
703
0
            if (allow_overflow)
704
0
                v = INT32_MAX;
705
0
            else
706
0
                return -1;
707
0
        }
708
0
        p++;
709
0
    }
710
0
    *pp = p;
711
0
    return v;
712
0
}
713
714
static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
715
0
{
716
0
    const uint8_t *p;
717
0
    p = *pp;
718
0
    if (*p != c)
719
0
        return re_parse_error(s, "expecting '%c'", c);
720
0
    p++;
721
0
    *pp = p;
722
0
    return 0;
723
0
}
724
725
/* Parse an escape sequence, *pp points after the '\':
726
   allow_utf16 value:
727
   0 : no UTF-16 escapes allowed
728
   1 : UTF-16 escapes allowed
729
   2 : UTF-16 escapes allowed and escapes of surrogate pairs are
730
   converted to a unicode character (unicode regexp case).
731
732
   Return the unicode char and update *pp if recognized,
733
   return -1 if malformed escape,
734
   return -2 otherwise. */
735
int lre_parse_escape(const uint8_t **pp, int allow_utf16)
736
0
{
737
0
    const uint8_t *p;
738
0
    uint32_t c;
739
740
0
    p = *pp;
741
0
    c = *p++;
742
0
    switch(c) {
743
0
    case 'b':
744
0
        c = '\b';
745
0
        break;
746
0
    case 'f':
747
0
        c = '\f';
748
0
        break;
749
0
    case 'n':
750
0
        c = '\n';
751
0
        break;
752
0
    case 'r':
753
0
        c = '\r';
754
0
        break;
755
0
    case 't':
756
0
        c = '\t';
757
0
        break;
758
0
    case 'v':
759
0
        c = '\v';
760
0
        break;
761
0
    case 'x':
762
0
        {
763
0
            int h0, h1;
764
765
0
            h0 = from_hex(*p++);
766
0
            if (h0 < 0)
767
0
                return -1;
768
0
            h1 = from_hex(*p++);
769
0
            if (h1 < 0)
770
0
                return -1;
771
0
            c = (h0 << 4) | h1;
772
0
        }
773
0
        break;
774
0
    case 'u':
775
0
        {
776
0
            int h, i;
777
0
            uint32_t c1;
778
779
0
            if (*p == '{' && allow_utf16) {
780
0
                p++;
781
0
                c = 0;
782
0
                for(;;) {
783
0
                    h = from_hex(*p++);
784
0
                    if (h < 0)
785
0
                        return -1;
786
0
                    c = (c << 4) | h;
787
0
                    if (c > 0x10FFFF)
788
0
                        return -1;
789
0
                    if (*p == '}')
790
0
                        break;
791
0
                }
792
0
                p++;
793
0
            } else {
794
0
                c = 0;
795
0
                for(i = 0; i < 4; i++) {
796
0
                    h = from_hex(*p++);
797
0
                    if (h < 0) {
798
0
                        return -1;
799
0
                    }
800
0
                    c = (c << 4) | h;
801
0
                }
802
0
                if (is_hi_surrogate(c) &&
803
0
                    allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
804
                    /* convert an escaped surrogate pair into a
805
                       unicode char */
806
0
                    c1 = 0;
807
0
                    for(i = 0; i < 4; i++) {
808
0
                        h = from_hex(p[2 + i]);
809
0
                        if (h < 0)
810
0
                            break;
811
0
                        c1 = (c1 << 4) | h;
812
0
                    }
813
0
                    if (i == 4 && is_lo_surrogate(c1)) {
814
0
                        p += 6;
815
0
                        c = from_surrogate(c, c1);
816
0
                    }
817
0
                }
818
0
            }
819
0
        }
820
0
        break;
821
0
    case '0': case '1': case '2': case '3':
822
0
    case '4': case '5': case '6': case '7':
823
0
        c -= '0';
824
0
        if (allow_utf16 == 2) {
825
            /* only accept \0 not followed by digit */
826
0
            if (c != 0 || is_digit(*p))
827
0
                return -1;
828
0
        } else {
829
            /* parse a legacy octal sequence */
830
0
            uint32_t v;
831
0
            v = *p - '0';
832
0
            if (v > 7)
833
0
                break;
834
0
            c = (c << 3) | v;
835
0
            p++;
836
0
            if (c >= 32)
837
0
                break;
838
0
            v = *p - '0';
839
0
            if (v > 7)
840
0
                break;
841
0
            c = (c << 3) | v;
842
0
            p++;
843
0
        }
844
0
        break;
845
0
    default:
846
0
        return -2;
847
0
    }
848
0
    *pp = p;
849
0
    return c;
850
0
}
851
852
#ifdef CONFIG_ALL_UNICODE
853
/* XXX: we use the same chars for name and value */
854
static BOOL is_unicode_char(int c)
855
0
{
856
0
    return ((c >= '0' && c <= '9') ||
857
0
            (c >= 'A' && c <= 'Z') ||
858
0
            (c >= 'a' && c <= 'z') ||
859
0
            (c == '_'));
860
0
}
861
862
/* XXX: memory error test */
863
static void seq_prop_cb(void *opaque, const uint32_t *seq, int seq_len)
864
0
{
865
0
    REStringList *sl = opaque;
866
0
    re_string_add(sl, seq_len, seq);
867
0
}
868
869
static int parse_unicode_property(REParseState *s, REStringList *cr,
870
                                  const uint8_t **pp, BOOL is_inv,
871
                                  BOOL allow_sequence_prop)
872
0
{
873
0
    const uint8_t *p;
874
0
    char name[64], value[64];
875
0
    char *q;
876
0
    BOOL script_ext;
877
0
    int ret;
878
879
0
    p = *pp;
880
0
    if (*p != '{')
881
0
        return re_parse_error(s, "expecting '{' after \\p");
882
0
    p++;
883
0
    q = name;
884
0
    while (is_unicode_char(*p)) {
885
0
        if ((q - name) >= sizeof(name) - 1)
886
0
            goto unknown_property_name;
887
0
        *q++ = *p++;
888
0
    }
889
0
    *q = '\0';
890
0
    q = value;
891
0
    if (*p == '=') {
892
0
        p++;
893
0
        while (is_unicode_char(*p)) {
894
0
            if ((q - value) >= sizeof(value) - 1)
895
0
                return re_parse_error(s, "unknown unicode property value");
896
0
            *q++ = *p++;
897
0
        }
898
0
    }
899
0
    *q = '\0';
900
0
    if (*p != '}')
901
0
        return re_parse_error(s, "expecting '}'");
902
0
    p++;
903
    //    printf("name=%s value=%s\n", name, value);
904
905
0
    if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
906
0
        script_ext = FALSE;
907
0
        goto do_script;
908
0
    } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
909
0
        script_ext = TRUE;
910
0
    do_script:
911
0
        re_string_list_init(s, cr);
912
0
        ret = unicode_script(&cr->cr, value, script_ext);
913
0
        if (ret) {
914
0
            re_string_list_free(cr);
915
0
            if (ret == -2)
916
0
                return re_parse_error(s, "unknown unicode script");
917
0
            else
918
0
                goto out_of_memory;
919
0
        }
920
0
    } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
921
0
        re_string_list_init(s, cr);
922
0
        ret = unicode_general_category(&cr->cr, value);
923
0
        if (ret) {
924
0
            re_string_list_free(cr);
925
0
            if (ret == -2)
926
0
                return re_parse_error(s, "unknown unicode general category");
927
0
            else
928
0
                goto out_of_memory;
929
0
        }
930
0
    } else if (value[0] == '\0') {
931
0
        re_string_list_init(s, cr);
932
0
        ret = unicode_general_category(&cr->cr, name);
933
0
        if (ret == -1) {
934
0
            re_string_list_free(cr);
935
0
            goto out_of_memory;
936
0
        }
937
0
        if (ret < 0) {
938
0
            ret = unicode_prop(&cr->cr, name);
939
0
            if (ret == -1) {
940
0
                re_string_list_free(cr);
941
0
                goto out_of_memory;
942
0
            }
943
0
        }
944
0
        if (ret < 0 && !is_inv && allow_sequence_prop) {
945
0
            CharRange cr_tmp;
946
0
            cr_init(&cr_tmp, s->opaque, lre_realloc);
947
0
            ret = unicode_sequence_prop(name, seq_prop_cb, cr, &cr_tmp);
948
0
            cr_free(&cr_tmp);
949
0
            if (ret == -1) {
950
0
                re_string_list_free(cr);
951
0
                goto out_of_memory;
952
0
            }
953
0
        }
954
0
        if (ret < 0)
955
0
            goto unknown_property_name;
956
0
    } else {
957
0
    unknown_property_name:
958
0
        return re_parse_error(s, "unknown unicode property name");
959
0
    }
960
961
    /* the ordering of case folding and inversion  differs with
962
       unicode_sets. 'unicode_sets' ordering is more consistent */
963
    /* XXX: the spec seems incorrect, we do it as the other engines
964
       seem to do it. */
965
0
    if (s->ignore_case && s->unicode_sets) {
966
0
        if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
967
0
            re_string_list_free(cr);
968
0
            goto out_of_memory;
969
0
        }
970
0
    }
971
0
    if (is_inv) {
972
0
        if (cr_invert(&cr->cr)) {
973
0
            re_string_list_free(cr);
974
0
            goto out_of_memory;
975
0
        }
976
0
    }
977
0
    if (s->ignore_case && !s->unicode_sets) {
978
0
        if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
979
0
            re_string_list_free(cr);
980
0
            goto out_of_memory;
981
0
        }
982
0
    }
983
0
    *pp = p;
984
0
    return 0;
985
0
 out_of_memory:
986
0
    return re_parse_out_of_memory(s);
987
0
}
988
#endif /* CONFIG_ALL_UNICODE */
989
990
static int get_class_atom(REParseState *s, REStringList *cr,
991
                          const uint8_t **pp, BOOL inclass);
992
993
static int parse_class_string_disjunction(REParseState *s, REStringList *cr,
994
                                          const uint8_t **pp)
995
0
{
996
0
    const uint8_t *p;
997
0
    DynBuf str;
998
0
    int c;
999
    
1000
0
    p = *pp;
1001
0
    if (*p != '{')
1002
0
        return re_parse_error(s, "expecting '{' after \\q");
1003
1004
0
    dbuf_init2(&str, s->opaque, lre_realloc);
1005
0
    re_string_list_init(s, cr);
1006
    
1007
0
    p++;
1008
0
    for(;;) {
1009
0
        str.size = 0;
1010
0
        while (*p != '}' && *p != '|') {
1011
0
            c = get_class_atom(s, NULL, &p, FALSE);
1012
0
            if (c < 0)
1013
0
                goto fail;
1014
0
            if (dbuf_put_u32(&str, c)) {
1015
0
                re_parse_out_of_memory(s);
1016
0
                goto fail;
1017
0
            }
1018
0
        }
1019
0
        if (re_string_add(cr, str.size / 4, (uint32_t *)str.buf)) {
1020
0
            re_parse_out_of_memory(s);
1021
0
            goto fail;
1022
0
        }
1023
0
        if (*p == '}')
1024
0
            break;
1025
0
        p++;
1026
0
    }
1027
0
    if (s->ignore_case) {
1028
0
        if (re_string_list_canonicalize(s, cr, TRUE))
1029
0
            goto fail;
1030
0
    }
1031
0
    p++; /* skip the '}' */
1032
0
    dbuf_free(&str);
1033
0
    *pp = p;
1034
0
    return 0;
1035
0
 fail:
1036
0
    dbuf_free(&str);
1037
0
    re_string_list_free(cr);
1038
0
    return -1;
1039
0
}
1040
1041
/* return -1 if error otherwise the character or a class range
1042
   (CLASS_RANGE_BASE) if cr != NULL. In case of class range, 'cr' is
1043
   initialized. Otherwise, it is ignored. */
1044
static int get_class_atom(REParseState *s, REStringList *cr,
1045
                          const uint8_t **pp, BOOL inclass)
1046
0
{
1047
0
    const uint8_t *p;
1048
0
    uint32_t c;
1049
0
    int ret;
1050
1051
0
    p = *pp;
1052
1053
0
    c = *p;
1054
0
    switch(c) {
1055
0
    case '\\':
1056
0
        p++;
1057
0
        if (p >= s->buf_end)
1058
0
            goto unexpected_end;
1059
0
        c = *p++;
1060
0
        switch(c) {
1061
0
        case 'd':
1062
0
            c = CHAR_RANGE_d;
1063
0
            goto class_range;
1064
0
        case 'D':
1065
0
            c = CHAR_RANGE_D;
1066
0
            goto class_range;
1067
0
        case 's':
1068
0
            c = CHAR_RANGE_s;
1069
0
            goto class_range;
1070
0
        case 'S':
1071
0
            c = CHAR_RANGE_S;
1072
0
            goto class_range;
1073
0
        case 'w':
1074
0
            c = CHAR_RANGE_w;
1075
0
            goto class_range;
1076
0
        case 'W':
1077
0
            c = CHAR_RANGE_W;
1078
0
        class_range:
1079
0
            if (!cr)
1080
0
                goto default_escape;
1081
0
            if (cr_init_char_range(s, cr, c))
1082
0
                return -1;
1083
0
            c += CLASS_RANGE_BASE;
1084
0
            break;
1085
0
        case 'c':
1086
0
            c = *p;
1087
0
            if ((c >= 'a' && c <= 'z') ||
1088
0
                (c >= 'A' && c <= 'Z') ||
1089
0
                (((c >= '0' && c <= '9') || c == '_') &&
1090
0
                 inclass && !s->is_unicode)) {   /* Annex B.1.4 */
1091
0
                c &= 0x1f;
1092
0
                p++;
1093
0
            } else if (s->is_unicode) {
1094
0
                goto invalid_escape;
1095
0
            } else {
1096
                /* otherwise return '\' and 'c' */
1097
0
                p--;
1098
0
                c = '\\';
1099
0
            }
1100
0
            break;
1101
0
        case '-':
1102
0
            if (!inclass && s->is_unicode)
1103
0
                goto invalid_escape;
1104
0
            break;
1105
0
        case '^':
1106
0
        case '$':
1107
0
        case '\\':
1108
0
        case '.':
1109
0
        case '*':
1110
0
        case '+':
1111
0
        case '?':
1112
0
        case '(':
1113
0
        case ')':
1114
0
        case '[':
1115
0
        case ']':
1116
0
        case '{':
1117
0
        case '}':
1118
0
        case '|':
1119
0
        case '/':
1120
            /* always valid to escape these characters */
1121
0
            break;
1122
0
#ifdef CONFIG_ALL_UNICODE
1123
0
        case 'p':
1124
0
        case 'P':
1125
0
            if (s->is_unicode && cr) {
1126
0
                if (parse_unicode_property(s, cr, &p, (c == 'P'), s->unicode_sets))
1127
0
                    return -1;
1128
0
                c = CLASS_RANGE_BASE;
1129
0
                break;
1130
0
            }
1131
0
            goto default_escape;
1132
0
#endif
1133
0
        case 'q':
1134
0
            if (s->unicode_sets && cr && inclass) {
1135
0
                if (parse_class_string_disjunction(s, cr, &p))
1136
0
                    return -1;
1137
0
                c = CLASS_RANGE_BASE;
1138
0
                break;
1139
0
            }
1140
0
            goto default_escape;
1141
0
        default:
1142
0
        default_escape:
1143
0
            p--;
1144
0
            ret = lre_parse_escape(&p, s->is_unicode * 2);
1145
0
            if (ret >= 0) {
1146
0
                c = ret;
1147
0
            } else {
1148
0
                if (s->is_unicode) {
1149
0
                invalid_escape:
1150
0
                    return re_parse_error(s, "invalid escape sequence in regular expression");
1151
0
                } else {
1152
                    /* just ignore the '\' */
1153
0
                    goto normal_char;
1154
0
                }
1155
0
            }
1156
0
            break;
1157
0
        }
1158
0
        break;
1159
0
    case '\0':
1160
0
        if (p >= s->buf_end) {
1161
0
        unexpected_end:
1162
0
            return re_parse_error(s, "unexpected end");
1163
0
        }
1164
        /* fall thru */
1165
0
        goto normal_char;
1166
1167
0
    case '&':
1168
0
    case '!':
1169
0
    case '#':
1170
0
    case '$':
1171
0
    case '%':
1172
0
    case '*':
1173
0
    case '+':
1174
0
    case ',':
1175
0
    case '.':
1176
0
    case ':':
1177
0
    case ';':
1178
0
    case '<':
1179
0
    case '=':
1180
0
    case '>':
1181
0
    case '?':
1182
0
    case '@':
1183
0
    case '^':
1184
0
    case '`':
1185
0
    case '~':
1186
0
        if (s->unicode_sets && p[1] == c) {
1187
            /* forbidden double characters */
1188
0
            return re_parse_error(s, "invalid class set operation in regular expression");
1189
0
        }
1190
0
        goto normal_char;
1191
1192
0
    case '(':
1193
0
    case ')':
1194
0
    case '[':
1195
0
    case ']':
1196
0
    case '{':
1197
0
    case '}':
1198
0
    case '/':
1199
0
    case '-':
1200
0
    case '|':
1201
0
        if (s->unicode_sets) {
1202
            /* invalid characters in unicode sets */
1203
0
            return re_parse_error(s, "invalid character in class in regular expression");
1204
0
        }
1205
0
        goto normal_char;
1206
1207
0
    default:
1208
0
    normal_char:
1209
        /* normal char */
1210
0
        if (c >= 128) {
1211
0
            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1212
0
            if ((unsigned)c > 0xffff && !s->is_unicode) {
1213
                /* XXX: should handle non BMP-1 code points */
1214
0
                return re_parse_error(s, "malformed unicode char");
1215
0
            }
1216
0
        } else {
1217
0
            p++;
1218
0
        }
1219
0
        break;
1220
0
    }
1221
0
    *pp = p;
1222
0
    return c;
1223
0
}
1224
1225
static int re_emit_range(REParseState *s, const CharRange *cr)
1226
0
{
1227
0
    int len, i;
1228
0
    uint32_t high;
1229
1230
0
    len = (unsigned)cr->len / 2;
1231
0
    if (len >= 65535)
1232
0
        return re_parse_error(s, "too many ranges");
1233
0
    if (len == 0) {
1234
0
        re_emit_op_u32(s, REOP_char32, -1);
1235
0
    } else {
1236
0
        high = cr->points[cr->len - 1];
1237
0
        if (high == UINT32_MAX)
1238
0
            high = cr->points[cr->len - 2];
1239
0
        if (high <= 0xffff) {
1240
            /* can use 16 bit ranges with the conversion that 0xffff =
1241
               infinity */
1242
0
            re_emit_op_u16(s, s->ignore_case ? REOP_range_i : REOP_range, len);
1243
0
            for(i = 0; i < cr->len; i += 2) {
1244
0
                dbuf_put_u16(&s->byte_code, cr->points[i]);
1245
0
                high = cr->points[i + 1] - 1;
1246
0
                if (high == UINT32_MAX - 1)
1247
0
                    high = 0xffff;
1248
0
                dbuf_put_u16(&s->byte_code, high);
1249
0
            }
1250
0
        } else {
1251
0
            re_emit_op_u16(s, s->ignore_case ? REOP_range32_i : REOP_range32, len);
1252
0
            for(i = 0; i < cr->len; i += 2) {
1253
0
                dbuf_put_u32(&s->byte_code, cr->points[i]);
1254
0
                dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
1255
0
            }
1256
0
        }
1257
0
    }
1258
0
    return 0;
1259
0
}
1260
1261
static int re_string_cmp_len(const void *a, const void *b, void *arg)
1262
0
{
1263
0
    REString *p1 = *(REString **)a;
1264
0
    REString *p2 = *(REString **)b;
1265
0
    return (p1->len < p2->len) - (p1->len > p2->len);
1266
0
}
1267
1268
static void re_emit_char(REParseState *s, int c)
1269
0
{
1270
0
    if (c <= 0xffff)
1271
0
        re_emit_op_u16(s, s->ignore_case ? REOP_char_i : REOP_char, c);
1272
0
    else
1273
0
        re_emit_op_u32(s, s->ignore_case ? REOP_char32_i : REOP_char32, c);
1274
0
}
1275
1276
static int re_emit_string_list(REParseState *s, const REStringList *sl)
1277
0
{
1278
0
    REString **tab, *p;
1279
0
    int i, j, split_pos, last_match_pos, n;
1280
0
    BOOL has_empty_string, is_last;
1281
    
1282
    //    re_string_list_dump("sl", sl);
1283
0
    if (sl->n_strings == 0) {
1284
        /* simple case: only characters */
1285
0
        if (re_emit_range(s, &sl->cr))
1286
0
            return -1;
1287
0
    } else {
1288
        /* at least one string list is present : match the longest ones first */
1289
        /* XXX: add a new op_switch opcode to compile as a trie */
1290
0
        tab = lre_realloc(s->opaque, NULL, sizeof(tab[0]) * sl->n_strings);
1291
0
        if (!tab) {
1292
0
            re_parse_out_of_memory(s);
1293
0
            return -1;
1294
0
        }
1295
0
        has_empty_string = FALSE;
1296
0
        n = 0;
1297
0
        for(i = 0; i < sl->hash_size; i++) {
1298
0
            for(p = sl->hash_table[i]; p != NULL; p = p->next) {
1299
0
                if (p->len == 0) {
1300
0
                    has_empty_string = TRUE;
1301
0
                } else {
1302
0
                    tab[n++] = p;
1303
0
                }
1304
0
            }
1305
0
        }
1306
0
        assert(n <= sl->n_strings);
1307
        
1308
0
        rqsort(tab, n, sizeof(tab[0]), re_string_cmp_len, NULL);
1309
1310
0
        last_match_pos = -1;
1311
0
        for(i = 0; i < n; i++) {
1312
0
            p = tab[i];
1313
0
            is_last = !has_empty_string && sl->cr.len == 0 && i == (n - 1);
1314
0
            if (!is_last)
1315
0
                split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
1316
0
            else
1317
0
                split_pos = 0;
1318
0
            for(j = 0; j < p->len; j++) {
1319
0
                re_emit_char(s, p->buf[j]);
1320
0
            }
1321
0
            if (!is_last) {
1322
0
                last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos);
1323
0
                put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
1324
0
            }
1325
0
        }
1326
1327
0
        if (sl->cr.len != 0) {
1328
            /* char range */
1329
0
            is_last = !has_empty_string;
1330
0
            if (!is_last)
1331
0
                split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
1332
0
            else
1333
0
                split_pos = 0; /* not used */
1334
0
            if (re_emit_range(s, &sl->cr)) {
1335
0
                lre_realloc(s->opaque, tab, 0);
1336
0
                return -1;
1337
0
            }
1338
0
            if (!is_last)
1339
0
                put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
1340
0
        }
1341
1342
        /* patch the 'goto match' */
1343
0
        while (last_match_pos != -1) {
1344
0
            int next_pos = get_u32(s->byte_code.buf + last_match_pos);
1345
0
            put_u32(s->byte_code.buf + last_match_pos, s->byte_code.size - (last_match_pos + 4));
1346
0
            last_match_pos = next_pos;
1347
0
        }
1348
        
1349
0
        lre_realloc(s->opaque, tab, 0);
1350
0
    }
1351
0
    return 0;
1352
0
}
1353
1354
static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp);
1355
1356
static int re_parse_class_set_operand(REParseState *s, REStringList *cr, const uint8_t **pp)
1357
0
{
1358
0
    int c1;
1359
0
    const uint8_t *p = *pp;
1360
    
1361
0
    if (*p == '[') {
1362
0
        if (re_parse_nested_class(s, cr, pp))
1363
0
            return -1;
1364
0
    } else {
1365
0
        c1 = get_class_atom(s, cr, pp, TRUE);
1366
0
        if (c1 < 0)
1367
0
            return -1;
1368
0
        if (c1 < CLASS_RANGE_BASE) {
1369
            /* create a range with a single character */
1370
0
            re_string_list_init(s, cr);
1371
0
            if (s->ignore_case)
1372
0
                c1 = lre_canonicalize(c1, s->is_unicode);
1373
0
            if (cr_union_interval(&cr->cr, c1, c1)) {
1374
0
                re_string_list_free(cr);
1375
0
                return -1;
1376
0
            }
1377
0
        }
1378
0
    }
1379
0
    return 0;
1380
0
}
1381
1382
static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp)
1383
0
{
1384
0
    const uint8_t *p;
1385
0
    uint32_t c1, c2;
1386
0
    int ret;
1387
0
    REStringList cr1_s, *cr1 = &cr1_s;
1388
0
    BOOL invert, is_first;
1389
1390
0
    if (lre_check_stack_overflow(s->opaque, 0))
1391
0
        return re_parse_error(s, "stack overflow");
1392
1393
0
    re_string_list_init(s, cr);
1394
0
    p = *pp;
1395
0
    p++;    /* skip '[' */
1396
1397
0
    invert = FALSE;
1398
0
    if (*p == '^') {
1399
0
        p++;
1400
0
        invert = TRUE;
1401
0
    }
1402
    
1403
    /* handle unions */
1404
0
    is_first = TRUE;
1405
0
    for(;;) {
1406
0
        if (*p == ']')
1407
0
            break;
1408
0
        if (*p == '[' && s->unicode_sets) {
1409
0
            if (re_parse_nested_class(s, cr1, &p))
1410
0
                goto fail;
1411
0
            goto class_union;
1412
0
        } else {
1413
0
            c1 = get_class_atom(s, cr1, &p, TRUE);
1414
0
            if ((int)c1 < 0)
1415
0
                goto fail;
1416
0
            if (*p == '-' && p[1] != ']') {
1417
0
                const uint8_t *p0 = p + 1;
1418
0
                if (p[1] == '-' && s->unicode_sets && is_first)
1419
0
                    goto class_atom; /* first character class followed by '--' */
1420
0
                if (c1 >= CLASS_RANGE_BASE) {
1421
0
                    if (s->is_unicode) {
1422
0
                        re_string_list_free(cr1);
1423
0
                        goto invalid_class_range;
1424
0
                    }
1425
                    /* Annex B: match '-' character */
1426
0
                    goto class_atom;
1427
0
                }
1428
0
                c2 = get_class_atom(s, cr1, &p0, TRUE);
1429
0
                if ((int)c2 < 0)
1430
0
                    goto fail;
1431
0
                if (c2 >= CLASS_RANGE_BASE) {
1432
0
                    re_string_list_free(cr1);
1433
0
                    if (s->is_unicode) {
1434
0
                        goto invalid_class_range;
1435
0
                    }
1436
                    /* Annex B: match '-' character */
1437
0
                    goto class_atom;
1438
0
                }
1439
0
                p = p0;
1440
0
                if (c2 < c1) {
1441
0
                invalid_class_range:
1442
0
                    re_parse_error(s, "invalid class range");
1443
0
                    goto fail;
1444
0
                }
1445
0
                if (s->ignore_case) {
1446
0
                    CharRange cr2_s, *cr2 = &cr2_s;
1447
0
                    cr_init(cr2, s->opaque, lre_realloc);
1448
0
                    if (cr_add_interval(cr2, c1, c2 + 1) ||
1449
0
                        cr_regexp_canonicalize(cr2, s->is_unicode) ||
1450
0
                        cr_op1(&cr->cr, cr2->points, cr2->len, CR_OP_UNION)) {
1451
0
                        cr_free(cr2);
1452
0
                        goto memory_error;
1453
0
                    }
1454
0
                    cr_free(cr2);
1455
0
                } else {
1456
0
                    if (cr_union_interval(&cr->cr, c1, c2))
1457
0
                        goto memory_error;
1458
0
                }
1459
0
                is_first = FALSE; /* union operation */
1460
0
            } else {
1461
0
            class_atom:
1462
0
                if (c1 >= CLASS_RANGE_BASE) {
1463
0
                class_union:
1464
0
                    ret = re_string_list_op(cr, cr1, CR_OP_UNION);
1465
0
                    re_string_list_free(cr1);
1466
0
                    if (ret)
1467
0
                        goto memory_error;
1468
0
                } else {
1469
0
                    if (s->ignore_case)
1470
0
                        c1 = lre_canonicalize(c1, s->is_unicode);
1471
0
                    if (cr_union_interval(&cr->cr, c1, c1))
1472
0
                        goto memory_error;
1473
0
                }
1474
0
            }
1475
0
        }
1476
0
        if (s->unicode_sets && is_first) {
1477
0
            if (*p == '&' && p[1] == '&' && p[2] != '&') {
1478
                /* handle '&&' */
1479
0
                for(;;) {
1480
0
                    if (*p == ']') {
1481
0
                        break;
1482
0
                    } else if (*p == '&' && p[1] == '&' && p[2] != '&') {
1483
0
                        p += 2;
1484
0
                    } else {
1485
0
                        goto invalid_operation;
1486
0
                    }
1487
0
                    if (re_parse_class_set_operand(s, cr1, &p))
1488
0
                        goto fail;
1489
0
                    ret = re_string_list_op(cr, cr1, CR_OP_INTER);
1490
0
                    re_string_list_free(cr1);
1491
0
                    if (ret)
1492
0
                        goto memory_error;
1493
0
                }
1494
0
            } else if (*p == '-' && p[1] == '-') {
1495
                /* handle '--' */
1496
0
                for(;;) {
1497
0
                    if (*p == ']') {
1498
0
                        break;
1499
0
                    } else if (*p == '-' && p[1] == '-') {
1500
0
                        p += 2;
1501
0
                    } else {
1502
0
                    invalid_operation:
1503
0
                        re_parse_error(s, "invalid operation in regular expression");
1504
0
                        goto fail;
1505
0
                    }
1506
0
                    if (re_parse_class_set_operand(s, cr1, &p))
1507
0
                        goto fail;
1508
0
                    ret = re_string_list_op(cr, cr1, CR_OP_SUB);
1509
0
                    re_string_list_free(cr1);
1510
0
                    if (ret)
1511
0
                        goto memory_error;
1512
0
                }
1513
0
            }
1514
0
        }
1515
0
        is_first = FALSE;
1516
0
    }
1517
1518
0
    p++;    /* skip ']' */
1519
0
    *pp = p;
1520
0
    if (invert) {
1521
        /* XXX: add may_contain_string syntax check to be fully
1522
           compliant. The test here accepts more input than the
1523
           spec. */
1524
0
        if (cr->n_strings != 0) {
1525
0
            re_parse_error(s, "negated character class with strings in regular expression debugger eval code");
1526
0
            goto fail;
1527
0
        }
1528
0
        if (cr_invert(&cr->cr))
1529
0
            goto memory_error;
1530
0
    }
1531
0
    return 0;
1532
0
 memory_error:
1533
0
    re_parse_out_of_memory(s);
1534
0
 fail:
1535
0
    re_string_list_free(cr);
1536
0
    return -1;
1537
0
}
1538
1539
static int re_parse_char_class(REParseState *s, const uint8_t **pp)
1540
0
{
1541
0
    REStringList cr_s, *cr = &cr_s;
1542
1543
0
    if (re_parse_nested_class(s, cr, pp))
1544
0
        return -1;
1545
0
    if (re_emit_string_list(s, cr))
1546
0
        goto fail;
1547
0
    re_string_list_free(cr);
1548
0
    return 0;
1549
0
 fail:
1550
0
    re_string_list_free(cr);
1551
0
    return -1;
1552
0
}
1553
1554
/* need_check_adv: false if the opcodes always advance the char pointer
1555
   need_capture_init: true if all the captures in the atom are not set
1556
*/
1557
static BOOL re_need_check_adv_and_capture_init(BOOL *pneed_capture_init,
1558
                                               const uint8_t *bc_buf, int bc_buf_len)
1559
0
{
1560
0
    int pos, opcode, len;
1561
0
    uint32_t val;
1562
0
    BOOL need_check_adv, need_capture_init;
1563
1564
0
    need_check_adv = TRUE;
1565
0
    need_capture_init = FALSE;
1566
0
    pos = 0;
1567
0
    while (pos < bc_buf_len) {
1568
0
        opcode = bc_buf[pos];
1569
0
        len = reopcode_info[opcode].size;
1570
0
        switch(opcode) {
1571
0
        case REOP_range:
1572
0
        case REOP_range_i:
1573
0
            val = get_u16(bc_buf + pos + 1);
1574
0
            len += val * 4;
1575
0
            need_check_adv = FALSE;
1576
0
            break;
1577
0
        case REOP_range32:
1578
0
        case REOP_range32_i:
1579
0
            val = get_u16(bc_buf + pos + 1);
1580
0
            len += val * 8;
1581
0
            need_check_adv = FALSE;
1582
0
            break;
1583
0
        case REOP_char:
1584
0
        case REOP_char_i:
1585
0
        case REOP_char32:
1586
0
        case REOP_char32_i:
1587
0
        case REOP_dot:
1588
0
        case REOP_any:
1589
0
        case REOP_space:
1590
0
        case REOP_not_space:
1591
0
            need_check_adv = FALSE;
1592
0
            break;
1593
0
        case REOP_line_start:
1594
0
        case REOP_line_start_m:
1595
0
        case REOP_line_end:
1596
0
        case REOP_line_end_m:
1597
0
        case REOP_set_i32:
1598
0
        case REOP_set_char_pos:
1599
0
        case REOP_word_boundary:
1600
0
        case REOP_word_boundary_i:
1601
0
        case REOP_not_word_boundary:
1602
0
        case REOP_not_word_boundary_i:
1603
0
        case REOP_prev:
1604
            /* no effect */
1605
0
            break;
1606
0
        case REOP_save_start:
1607
0
        case REOP_save_end:
1608
0
        case REOP_save_reset:
1609
0
            break;
1610
0
        case REOP_back_reference:
1611
0
        case REOP_back_reference_i:
1612
0
        case REOP_backward_back_reference:
1613
0
        case REOP_backward_back_reference_i:
1614
0
            val = bc_buf[pos + 1];
1615
0
            len += val;
1616
0
            need_capture_init = TRUE;
1617
0
            break;
1618
0
        default:
1619
            /* safe behavior: we cannot predict the outcome */
1620
0
            need_capture_init = TRUE;
1621
0
            goto done;
1622
0
        }
1623
0
        pos += len;
1624
0
    }
1625
0
 done:
1626
0
    *pneed_capture_init = need_capture_init;
1627
0
    return need_check_adv;
1628
0
}
1629
1630
/* '*pp' is the first char after '<' */
1631
static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
1632
0
{
1633
0
    const uint8_t *p, *p1;
1634
0
    uint32_t c, d;
1635
0
    char *q;
1636
1637
0
    p = *pp;
1638
0
    q = buf;
1639
0
    for(;;) {
1640
0
        c = *p;
1641
0
        if (c == '\\') {
1642
0
            p++;
1643
0
            if (*p != 'u')
1644
0
                return -1;
1645
0
            c = lre_parse_escape(&p, 2); // accept surrogate pairs
1646
0
        } else if (c == '>') {
1647
0
            break;
1648
0
        } else if (c >= 128) {
1649
0
            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1650
0
            if (is_hi_surrogate(c)) {
1651
0
                d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
1652
0
                if (is_lo_surrogate(d)) {
1653
0
                    c = from_surrogate(c, d);
1654
0
                    p = p1;
1655
0
                }
1656
0
            }
1657
0
        } else {
1658
0
            p++;
1659
0
        }
1660
0
        if (c > 0x10FFFF)
1661
0
            return -1;
1662
0
        if (q == buf) {
1663
0
            if (!lre_js_is_ident_first(c))
1664
0
                return -1;
1665
0
        } else {
1666
0
            if (!lre_js_is_ident_next(c))
1667
0
                return -1;
1668
0
        }
1669
0
        if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
1670
0
            return -1;
1671
0
        if (c < 128) {
1672
0
            *q++ = c;
1673
0
        } else {
1674
0
            q += unicode_to_utf8((uint8_t*)q, c);
1675
0
        }
1676
0
    }
1677
0
    if (q == buf)
1678
0
        return -1;
1679
0
    *q = '\0';
1680
0
    p++;
1681
0
    *pp = p;
1682
0
    return 0;
1683
0
}
1684
1685
/* if capture_name = NULL: return the number of captures + 1.
1686
   Otherwise, return the number of matching capture groups  */
1687
static int re_parse_captures(REParseState *s, int *phas_named_captures,
1688
                             const char *capture_name, BOOL emit_group_index)
1689
0
{
1690
0
    const uint8_t *p;
1691
0
    int capture_index, n;
1692
0
    char name[TMP_BUF_SIZE];
1693
1694
0
    capture_index = 1;
1695
0
    n = 0;
1696
0
    *phas_named_captures = 0;
1697
0
    for (p = s->buf_start; p < s->buf_end; p++) {
1698
0
        switch (*p) {
1699
0
        case '(':
1700
0
            if (p[1] == '?') {
1701
0
                if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
1702
0
                    *phas_named_captures = 1;
1703
                    /* potential named capture */
1704
0
                    if (capture_name) {
1705
0
                        p += 3;
1706
0
                        if (re_parse_group_name(name, sizeof(name), &p) == 0) {
1707
0
                            if (!strcmp(name, capture_name)) {
1708
0
                                if (emit_group_index)
1709
0
                                    dbuf_putc(&s->byte_code, capture_index);
1710
0
                                n++;
1711
0
                            }
1712
0
                        }
1713
0
                    }
1714
0
                    capture_index++;
1715
0
                    if (capture_index >= CAPTURE_COUNT_MAX)
1716
0
                        goto done;
1717
0
                }
1718
0
            } else {
1719
0
                capture_index++;
1720
0
                if (capture_index >= CAPTURE_COUNT_MAX)
1721
0
                    goto done;
1722
0
            }
1723
0
            break;
1724
0
        case '\\':
1725
0
            p++;
1726
0
            break;
1727
0
        case '[':
1728
0
            for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
1729
0
                if (*p == '\\')
1730
0
                    p++;
1731
0
            }
1732
0
            break;
1733
0
        }
1734
0
    }
1735
0
 done:
1736
0
    if (capture_name) {
1737
0
        return n;
1738
0
    } else {
1739
0
        return capture_index;
1740
0
    }
1741
0
}
1742
1743
static int re_count_captures(REParseState *s)
1744
0
{
1745
0
    if (s->total_capture_count < 0) {
1746
0
        s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1747
0
                                                   NULL, FALSE);
1748
0
    }
1749
0
    return s->total_capture_count;
1750
0
}
1751
1752
static BOOL re_has_named_captures(REParseState *s)
1753
0
{
1754
0
    if (s->has_named_captures < 0)
1755
0
        re_count_captures(s);
1756
0
    return s->has_named_captures;
1757
0
}
1758
1759
static int find_group_name(REParseState *s, const char *name, BOOL emit_group_index)
1760
0
{
1761
0
    const char *p, *buf_end;
1762
0
    size_t len, name_len;
1763
0
    int capture_index, n;
1764
1765
0
    p = (char *)s->group_names.buf;
1766
0
    if (!p)
1767
0
        return 0;
1768
0
    buf_end = (char *)s->group_names.buf + s->group_names.size;
1769
0
    name_len = strlen(name);
1770
0
    capture_index = 1;
1771
0
    n = 0;
1772
0
    while (p < buf_end) {
1773
0
        len = strlen(p);
1774
0
        if (len == name_len && memcmp(name, p, name_len) == 0) {
1775
0
            if (emit_group_index)
1776
0
                dbuf_putc(&s->byte_code, capture_index);
1777
0
            n++;
1778
0
        }
1779
0
        p += len + LRE_GROUP_NAME_TRAILER_LEN;
1780
0
        capture_index++;
1781
0
    }
1782
0
    return n;
1783
0
}
1784
1785
static BOOL is_duplicate_group_name(REParseState *s, const char *name, int scope)
1786
0
{
1787
0
    const char *p, *buf_end;
1788
0
    size_t len, name_len;
1789
0
    int scope1;
1790
    
1791
0
    p = (char *)s->group_names.buf;
1792
0
    if (!p)
1793
0
        return 0;
1794
0
    buf_end = (char *)s->group_names.buf + s->group_names.size;
1795
0
    name_len = strlen(name);
1796
0
    while (p < buf_end) {
1797
0
        len = strlen(p);
1798
0
        if (len == name_len && memcmp(name, p, name_len) == 0) {
1799
0
            scope1 = (uint8_t)p[len + 1];
1800
0
            if (scope == scope1)
1801
0
                return TRUE;
1802
0
        }
1803
0
        p += len + LRE_GROUP_NAME_TRAILER_LEN;
1804
0
    }
1805
0
    return FALSE;
1806
0
}
1807
1808
static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1809
1810
static int re_parse_modifiers(REParseState *s, const uint8_t **pp)
1811
0
{
1812
0
    const uint8_t *p = *pp;
1813
0
    int mask = 0;
1814
0
    int val;
1815
1816
0
    for(;;) {
1817
0
        if (*p == 'i') {
1818
0
            val = LRE_FLAG_IGNORECASE;
1819
0
        } else if (*p == 'm') {
1820
0
            val = LRE_FLAG_MULTILINE;
1821
0
        } else if (*p == 's') {
1822
0
            val = LRE_FLAG_DOTALL;
1823
0
        } else {
1824
0
            break;
1825
0
        }
1826
0
        if (mask & val)
1827
0
            return re_parse_error(s, "duplicate modifier: '%c'", *p);
1828
0
        mask |= val;
1829
0
        p++;
1830
0
    }
1831
0
    *pp = p;
1832
0
    return mask;
1833
0
}
1834
1835
static BOOL update_modifier(BOOL val, int add_mask, int remove_mask,
1836
                            int mask)
1837
0
{
1838
0
    if (add_mask & mask)
1839
0
        val = TRUE;
1840
0
    if (remove_mask & mask)
1841
0
        val = FALSE;
1842
0
    return val;
1843
0
}
1844
1845
static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1846
0
{
1847
0
    const uint8_t *p;
1848
0
    int c, last_atom_start, quant_min, quant_max, last_capture_count;
1849
0
    BOOL greedy, is_neg, is_backward_lookahead;
1850
0
    REStringList cr_s, *cr = &cr_s;
1851
1852
0
    last_atom_start = -1;
1853
0
    last_capture_count = 0;
1854
0
    p = s->buf_ptr;
1855
0
    c = *p;
1856
0
    switch(c) {
1857
0
    case '^':
1858
0
        p++;
1859
0
        re_emit_op(s, s->multi_line ? REOP_line_start_m : REOP_line_start);
1860
0
        break;
1861
0
    case '$':
1862
0
        p++;
1863
0
        re_emit_op(s, s->multi_line ? REOP_line_end_m : REOP_line_end);
1864
0
        break;
1865
0
    case '.':
1866
0
        p++;
1867
0
        last_atom_start = s->byte_code.size;
1868
0
        last_capture_count = s->capture_count;
1869
0
        if (is_backward_dir)
1870
0
            re_emit_op(s, REOP_prev);
1871
0
        re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1872
0
        if (is_backward_dir)
1873
0
            re_emit_op(s, REOP_prev);
1874
0
        break;
1875
0
    case '{':
1876
0
        if (s->is_unicode) {
1877
0
            return re_parse_error(s, "syntax error");
1878
0
        } else if (!is_digit(p[1])) {
1879
            /* Annex B: we accept '{' not followed by digits as a
1880
               normal atom */
1881
0
            goto parse_class_atom;
1882
0
        } else {
1883
0
            const uint8_t *p1 = p + 1;
1884
            /* Annex B: error if it is like a repetition count */
1885
0
            parse_digits(&p1, TRUE);
1886
0
            if (*p1 == ',') {
1887
0
                p1++;
1888
0
                if (is_digit(*p1)) {
1889
0
                    parse_digits(&p1, TRUE);
1890
0
                }
1891
0
            }
1892
0
            if (*p1 != '}') {
1893
0
                goto parse_class_atom;
1894
0
            }
1895
0
        }
1896
        /* fall thru */
1897
0
    case '*':
1898
0
    case '+':
1899
0
    case '?':
1900
0
        return re_parse_error(s, "nothing to repeat");
1901
0
    case '(':
1902
0
        if (p[1] == '?') {
1903
0
            if (p[2] == ':') {
1904
0
                p += 3;
1905
0
                last_atom_start = s->byte_code.size;
1906
0
                last_capture_count = s->capture_count;
1907
0
                s->buf_ptr = p;
1908
0
                if (re_parse_disjunction(s, is_backward_dir))
1909
0
                    return -1;
1910
0
                p = s->buf_ptr;
1911
0
                if (re_parse_expect(s, &p, ')'))
1912
0
                    return -1;
1913
0
            } else if (p[2] == 'i' || p[2] == 'm' || p[2] == 's' || p[2] == '-') {
1914
0
                BOOL saved_ignore_case, saved_multi_line, saved_dotall;
1915
0
                int add_mask, remove_mask;
1916
0
                p += 2;
1917
0
                remove_mask = 0;
1918
0
                add_mask = re_parse_modifiers(s, &p);
1919
0
                if (add_mask < 0)
1920
0
                    return -1;
1921
0
                if (*p == '-') {
1922
0
                    p++;
1923
0
                    remove_mask = re_parse_modifiers(s, &p);
1924
0
                    if (remove_mask < 0)
1925
0
                        return -1;
1926
0
                }
1927
0
                if ((add_mask == 0 && remove_mask == 0) ||
1928
0
                    (add_mask & remove_mask) != 0) {
1929
0
                    return re_parse_error(s, "invalid modifiers");
1930
0
                }
1931
0
                if (re_parse_expect(s, &p, ':'))
1932
0
                    return -1;
1933
0
                saved_ignore_case = s->ignore_case;
1934
0
                saved_multi_line = s->multi_line;
1935
0
                saved_dotall = s->dotall;
1936
0
                s->ignore_case = update_modifier(s->ignore_case, add_mask, remove_mask, LRE_FLAG_IGNORECASE);
1937
0
                s->multi_line = update_modifier(s->multi_line, add_mask, remove_mask, LRE_FLAG_MULTILINE);
1938
0
                s->dotall = update_modifier(s->dotall, add_mask, remove_mask, LRE_FLAG_DOTALL);
1939
1940
0
                last_atom_start = s->byte_code.size;
1941
0
                last_capture_count = s->capture_count;
1942
0
                s->buf_ptr = p;
1943
0
                if (re_parse_disjunction(s, is_backward_dir))
1944
0
                    return -1;
1945
0
                p = s->buf_ptr;
1946
0
                if (re_parse_expect(s, &p, ')'))
1947
0
                    return -1;
1948
0
                s->ignore_case = saved_ignore_case;
1949
0
                s->multi_line = saved_multi_line;
1950
0
                s->dotall = saved_dotall;
1951
0
            } else if ((p[2] == '=' || p[2] == '!')) {
1952
0
                is_neg = (p[2] == '!');
1953
0
                is_backward_lookahead = FALSE;
1954
0
                p += 3;
1955
0
                goto lookahead;
1956
0
            } else if (p[2] == '<' &&
1957
0
                       (p[3] == '=' || p[3] == '!')) {
1958
0
                int pos;
1959
0
                is_neg = (p[3] == '!');
1960
0
                is_backward_lookahead = TRUE;
1961
0
                p += 4;
1962
                /* lookahead */
1963
0
            lookahead:
1964
                /* Annex B allows lookahead to be used as an atom for
1965
                   the quantifiers */
1966
0
                if (!s->is_unicode && !is_backward_lookahead)  {
1967
0
                    last_atom_start = s->byte_code.size;
1968
0
                    last_capture_count = s->capture_count;
1969
0
                }
1970
0
                pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
1971
0
                s->buf_ptr = p;
1972
0
                if (re_parse_disjunction(s, is_backward_lookahead))
1973
0
                    return -1;
1974
0
                p = s->buf_ptr;
1975
0
                if (re_parse_expect(s, &p, ')'))
1976
0
                    return -1;
1977
0
                re_emit_op(s, REOP_lookahead_match + is_neg);
1978
                /* jump after the 'match' after the lookahead is successful */
1979
0
                if (dbuf_error(&s->byte_code))
1980
0
                    return -1;
1981
0
                put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
1982
0
            } else if (p[2] == '<') {
1983
0
                p += 3;
1984
0
                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1985
0
                                        &p)) {
1986
0
                    return re_parse_error(s, "invalid group name");
1987
0
                }
1988
                /* poor's man method to test duplicate group
1989
                   names. */
1990
                /* XXX: this method does not catch all the errors*/
1991
0
                if (is_duplicate_group_name(s, s->u.tmp_buf, s->group_name_scope)) {
1992
0
                    return re_parse_error(s, "duplicate group name");
1993
0
                }
1994
                /* group name with a trailing zero */
1995
0
                dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1996
0
                         strlen(s->u.tmp_buf) + 1);
1997
0
                dbuf_putc(&s->group_names, s->group_name_scope);
1998
0
                s->has_named_captures = 1;
1999
0
                goto parse_capture;
2000
0
            } else {
2001
0
                return re_parse_error(s, "invalid group");
2002
0
            }
2003
0
        } else {
2004
0
            int capture_index;
2005
0
            p++;
2006
            /* capture without group name */
2007
0
            dbuf_putc(&s->group_names, 0);
2008
0
            dbuf_putc(&s->group_names, 0);
2009
0
        parse_capture:
2010
0
            if (s->capture_count >= CAPTURE_COUNT_MAX)
2011
0
                return re_parse_error(s, "too many captures");
2012
0
            last_atom_start = s->byte_code.size;
2013
0
            last_capture_count = s->capture_count;
2014
0
            capture_index = s->capture_count++;
2015
0
            re_emit_op_u8(s, REOP_save_start + is_backward_dir,
2016
0
                          capture_index);
2017
2018
0
            s->buf_ptr = p;
2019
0
            if (re_parse_disjunction(s, is_backward_dir))
2020
0
                return -1;
2021
0
            p = s->buf_ptr;
2022
2023
0
            re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
2024
0
                          capture_index);
2025
2026
0
            if (re_parse_expect(s, &p, ')'))
2027
0
                return -1;
2028
0
        }
2029
0
        break;
2030
0
    case '\\':
2031
0
        switch(p[1]) {
2032
0
        case 'b':
2033
0
        case 'B':
2034
0
            if (p[1] != 'b') {
2035
0
                re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary);
2036
0
            } else {
2037
0
                re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_word_boundary_i : REOP_word_boundary);
2038
0
            }
2039
0
            p += 2;
2040
0
            break;
2041
0
        case 'k':
2042
0
            {
2043
0
                const uint8_t *p1;
2044
0
                int dummy_res, n;
2045
0
                BOOL is_forward;
2046
                
2047
0
                p1 = p;
2048
0
                if (p1[2] != '<') {
2049
                    /* annex B: we tolerate invalid group names in non
2050
                       unicode mode if there is no named capture
2051
                       definition */
2052
0
                    if (s->is_unicode || re_has_named_captures(s))
2053
0
                        return re_parse_error(s, "expecting group name");
2054
0
                    else
2055
0
                        goto parse_class_atom;
2056
0
                }
2057
0
                p1 += 3;
2058
0
                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
2059
0
                                        &p1)) {
2060
0
                    if (s->is_unicode || re_has_named_captures(s))
2061
0
                        return re_parse_error(s, "invalid group name");
2062
0
                    else
2063
0
                        goto parse_class_atom;
2064
0
                }
2065
0
                is_forward = FALSE;
2066
0
                n = find_group_name(s, s->u.tmp_buf, FALSE);
2067
0
                if (n == 0) {
2068
                    /* no capture name parsed before, try to look
2069
                       after (inefficient, but hopefully not common */
2070
0
                    n = re_parse_captures(s, &dummy_res, s->u.tmp_buf, FALSE);
2071
0
                    if (n == 0) {
2072
0
                        if (s->is_unicode || re_has_named_captures(s))
2073
0
                            return re_parse_error(s, "group name not defined");
2074
0
                        else
2075
0
                            goto parse_class_atom;
2076
0
                    }
2077
0
                    is_forward = TRUE;
2078
0
                }
2079
0
                last_atom_start = s->byte_code.size;
2080
0
                last_capture_count = s->capture_count;
2081
                
2082
                /* emit back references to all the captures indexes matching the group name */
2083
0
                re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, n);
2084
0
                if (is_forward) {
2085
0
                    re_parse_captures(s, &dummy_res, s->u.tmp_buf, TRUE);
2086
0
                } else {
2087
0
                    find_group_name(s, s->u.tmp_buf, TRUE);
2088
0
                }
2089
0
                p = p1;
2090
0
            }
2091
0
            break;
2092
0
        case '0':
2093
0
            p += 2;
2094
0
            c = 0;
2095
0
            if (s->is_unicode) {
2096
0
                if (is_digit(*p)) {
2097
0
                    return re_parse_error(s, "invalid decimal escape in regular expression");
2098
0
                }
2099
0
            } else {
2100
                /* Annex B.1.4: accept legacy octal */
2101
0
                if (*p >= '0' && *p <= '7') {
2102
0
                    c = *p++ - '0';
2103
0
                    if (*p >= '0' && *p <= '7') {
2104
0
                        c = (c << 3) + *p++ - '0';
2105
0
                    }
2106
0
                }
2107
0
            }
2108
0
            goto normal_char;
2109
0
        case '1': case '2': case '3': case '4':
2110
0
        case '5': case '6': case '7': case '8':
2111
0
        case '9':
2112
0
            {
2113
0
                const uint8_t *q = ++p;
2114
2115
0
                c = parse_digits(&p, FALSE);
2116
0
                if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
2117
0
                    if (!s->is_unicode) {
2118
                        /* Annex B.1.4: accept legacy octal */
2119
0
                        p = q;
2120
0
                        if (*p <= '7') {
2121
0
                            c = 0;
2122
0
                            if (*p <= '3')
2123
0
                                c = *p++ - '0';
2124
0
                            if (*p >= '0' && *p <= '7') {
2125
0
                                c = (c << 3) + *p++ - '0';
2126
0
                                if (*p >= '0' && *p <= '7') {
2127
0
                                    c = (c << 3) + *p++ - '0';
2128
0
                                }
2129
0
                            }
2130
0
                        } else {
2131
0
                            c = *p++;
2132
0
                        }
2133
0
                        goto normal_char;
2134
0
                    }
2135
0
                    return re_parse_error(s, "back reference out of range in regular expression");
2136
0
                }
2137
0
                last_atom_start = s->byte_code.size;
2138
0
                last_capture_count = s->capture_count;
2139
                
2140
0
                re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, 1);
2141
0
                dbuf_putc(&s->byte_code, c);
2142
0
            }
2143
0
            break;
2144
0
        default:
2145
0
            goto parse_class_atom;
2146
0
        }
2147
0
        break;
2148
0
    case '[':
2149
0
        last_atom_start = s->byte_code.size;
2150
0
        last_capture_count = s->capture_count;
2151
0
        if (is_backward_dir)
2152
0
            re_emit_op(s, REOP_prev);
2153
0
        if (re_parse_char_class(s, &p))
2154
0
            return -1;
2155
0
        if (is_backward_dir)
2156
0
            re_emit_op(s, REOP_prev);
2157
0
        break;
2158
0
    case ']':
2159
0
    case '}':
2160
0
        if (s->is_unicode)
2161
0
            return re_parse_error(s, "syntax error");
2162
0
        goto parse_class_atom;
2163
0
    default:
2164
0
    parse_class_atom:
2165
0
        c = get_class_atom(s, cr, &p, FALSE);
2166
0
        if ((int)c < 0)
2167
0
            return -1;
2168
0
    normal_char:
2169
0
        last_atom_start = s->byte_code.size;
2170
0
        last_capture_count = s->capture_count;
2171
0
        if (is_backward_dir)
2172
0
            re_emit_op(s, REOP_prev);
2173
0
        if (c >= CLASS_RANGE_BASE) {
2174
0
            int ret = 0;
2175
            /* optimize the common 'space' tests */
2176
0
            if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) {
2177
0
                re_emit_op(s, REOP_space);
2178
0
            } else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) {
2179
0
                re_emit_op(s, REOP_not_space);
2180
0
            } else {
2181
0
                ret = re_emit_string_list(s, cr);
2182
0
            }
2183
0
            re_string_list_free(cr);
2184
0
            if (ret)
2185
0
                return -1;
2186
0
        } else {
2187
0
            if (s->ignore_case)
2188
0
                c = lre_canonicalize(c, s->is_unicode);
2189
0
            re_emit_char(s, c);
2190
0
        }
2191
0
        if (is_backward_dir)
2192
0
            re_emit_op(s, REOP_prev);
2193
0
        break;
2194
0
    }
2195
2196
    /* quantifier */
2197
0
    if (last_atom_start >= 0) {
2198
0
        c = *p;
2199
0
        switch(c) {
2200
0
        case '*':
2201
0
            p++;
2202
0
            quant_min = 0;
2203
0
            quant_max = INT32_MAX;
2204
0
            goto quantifier;
2205
0
        case '+':
2206
0
            p++;
2207
0
            quant_min = 1;
2208
0
            quant_max = INT32_MAX;
2209
0
            goto quantifier;
2210
0
        case '?':
2211
0
            p++;
2212
0
            quant_min = 0;
2213
0
            quant_max = 1;
2214
0
            goto quantifier;
2215
0
        case '{':
2216
0
            {
2217
0
                const uint8_t *p1 = p;
2218
                /* As an extension (see ES6 annex B), we accept '{' not
2219
                   followed by digits as a normal atom */
2220
0
                if (!is_digit(p[1])) {
2221
0
                    if (s->is_unicode)
2222
0
                        goto invalid_quant_count;
2223
0
                    break;
2224
0
                }
2225
0
                p++;
2226
0
                quant_min = parse_digits(&p, TRUE);
2227
0
                quant_max = quant_min;
2228
0
                if (*p == ',') {
2229
0
                    p++;
2230
0
                    if (is_digit(*p)) {
2231
0
                        quant_max = parse_digits(&p, TRUE);
2232
0
                        if (quant_max < quant_min) {
2233
0
                        invalid_quant_count:
2234
0
                            return re_parse_error(s, "invalid repetition count");
2235
0
                        }
2236
0
                    } else {
2237
0
                        quant_max = INT32_MAX; /* infinity */
2238
0
                    }
2239
0
                }
2240
0
                if (*p != '}' && !s->is_unicode) {
2241
                    /* Annex B: normal atom if invalid '{' syntax */
2242
0
                    p = p1;
2243
0
                    break;
2244
0
                }
2245
0
                if (re_parse_expect(s, &p, '}'))
2246
0
                    return -1;
2247
0
            }
2248
0
        quantifier:
2249
0
            greedy = TRUE;
2250
0
            if (*p == '?') {
2251
0
                p++;
2252
0
                greedy = FALSE;
2253
0
            }
2254
0
            if (last_atom_start < 0) {
2255
0
                return re_parse_error(s, "nothing to repeat");
2256
0
            }
2257
0
            {
2258
0
                BOOL need_capture_init, add_zero_advance_check;
2259
0
                int len, pos;
2260
                
2261
                /* the spec tells that if there is no advance when
2262
                   running the atom after the first quant_min times,
2263
                   then there is no match. We remove this test when we
2264
                   are sure the atom always advances the position. */
2265
0
                add_zero_advance_check =
2266
0
                    re_need_check_adv_and_capture_init(&need_capture_init,
2267
0
                                                       s->byte_code.buf + last_atom_start,
2268
0
                                                       s->byte_code.size - last_atom_start);
2269
            
2270
                /* general case: need to reset the capture at each
2271
                   iteration. We don't do it if there are no captures
2272
                   in the atom or if we are sure all captures are
2273
                   initialized in the atom. If quant_min = 0, we still
2274
                   need to reset once the captures in case the atom
2275
                   does not match. */
2276
0
                if (need_capture_init && last_capture_count != s->capture_count) {
2277
0
                    if (dbuf_insert(&s->byte_code, last_atom_start, 3))
2278
0
                        goto out_of_memory;
2279
0
                    int pos = last_atom_start;
2280
0
                    s->byte_code.buf[pos++] = REOP_save_reset;
2281
0
                    s->byte_code.buf[pos++] = last_capture_count;
2282
0
                    s->byte_code.buf[pos++] = s->capture_count - 1;
2283
0
                }
2284
2285
0
                len = s->byte_code.size - last_atom_start;
2286
0
                if (quant_min == 0) {
2287
                    /* need to reset the capture in case the atom is
2288
                       not executed */
2289
0
                    if (!need_capture_init && last_capture_count != s->capture_count) {
2290
0
                        if (dbuf_insert(&s->byte_code, last_atom_start, 3))
2291
0
                            goto out_of_memory;
2292
0
                        s->byte_code.buf[last_atom_start++] = REOP_save_reset;
2293
0
                        s->byte_code.buf[last_atom_start++] = last_capture_count;
2294
0
                        s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
2295
0
                    }
2296
0
                    if (quant_max == 0) {
2297
0
                        s->byte_code.size = last_atom_start;
2298
0
                    } else if (quant_max == 1 || quant_max == INT32_MAX) {
2299
0
                        BOOL has_goto = (quant_max == INT32_MAX);
2300
0
                        if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check * 2))
2301
0
                            goto out_of_memory;
2302
0
                        s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
2303
0
                            greedy;
2304
0
                        put_u32(s->byte_code.buf + last_atom_start + 1,
2305
0
                                len + 5 * has_goto + add_zero_advance_check * 2 * 2);
2306
0
                        if (add_zero_advance_check) {
2307
0
                            s->byte_code.buf[last_atom_start + 1 + 4] = REOP_set_char_pos;
2308
0
                            s->byte_code.buf[last_atom_start + 1 + 4 + 1] = 0;
2309
0
                            re_emit_op_u8(s, REOP_check_advance, 0);
2310
0
                        }
2311
0
                        if (has_goto)
2312
0
                            re_emit_goto(s, REOP_goto, last_atom_start);
2313
0
                    } else {
2314
0
                        if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2))
2315
0
                            goto out_of_memory;
2316
0
                        pos = last_atom_start;
2317
0
                        s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
2318
0
                        put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10);
2319
0
                        pos += 4;
2320
2321
0
                        s->byte_code.buf[pos++] = REOP_set_i32;
2322
0
                        s->byte_code.buf[pos++] = 0;
2323
0
                        put_u32(s->byte_code.buf + pos, quant_max);
2324
0
                        pos += 4;
2325
0
                        last_atom_start = pos;
2326
0
                        if (add_zero_advance_check) {
2327
0
                            s->byte_code.buf[pos++] = REOP_set_char_pos;
2328
0
                            s->byte_code.buf[pos++] = 0;
2329
0
                        }
2330
0
                        re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start);
2331
0
                    }
2332
0
                } else if (quant_min == 1 && quant_max == INT32_MAX &&
2333
0
                           !add_zero_advance_check) {
2334
0
                    re_emit_goto(s, REOP_split_next_first - greedy,
2335
0
                                 last_atom_start);
2336
0
                } else {
2337
0
                    if (quant_min == quant_max)
2338
0
                        add_zero_advance_check = FALSE;
2339
0
                    if (dbuf_insert(&s->byte_code, last_atom_start, 6 + add_zero_advance_check * 2))
2340
0
                        goto out_of_memory;
2341
                    /* Note: we assume the string length is < INT32_MAX */
2342
0
                    pos = last_atom_start;
2343
0
                    s->byte_code.buf[pos++] = REOP_set_i32;
2344
0
                    s->byte_code.buf[pos++] = 0;
2345
0
                    put_u32(s->byte_code.buf + pos, quant_max);
2346
0
                    pos += 4;
2347
0
                    last_atom_start = pos;
2348
0
                    if (add_zero_advance_check) {
2349
0
                        s->byte_code.buf[pos++] = REOP_set_char_pos;
2350
0
                        s->byte_code.buf[pos++] = 0;
2351
0
                    }
2352
0
                    if (quant_min == quant_max) {
2353
                        /* a simple loop is enough */
2354
0
                        re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
2355
0
                    } else {
2356
0
                        re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max - quant_min, last_atom_start);
2357
0
                    }
2358
0
                }
2359
0
                last_atom_start = -1;
2360
0
            }
2361
0
            break;
2362
0
        default:
2363
0
            break;
2364
0
        }
2365
0
    }
2366
0
    s->buf_ptr = p;
2367
0
    return 0;
2368
0
 out_of_memory:
2369
0
    return re_parse_out_of_memory(s);
2370
0
}
2371
2372
static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
2373
0
{
2374
0
    const uint8_t *p;
2375
0
    int ret;
2376
0
    size_t start, term_start, end, term_size;
2377
2378
0
    start = s->byte_code.size;
2379
0
    for(;;) {
2380
0
        p = s->buf_ptr;
2381
0
        if (p >= s->buf_end)
2382
0
            break;
2383
0
        if (*p == '|' || *p == ')')
2384
0
            break;
2385
0
        term_start = s->byte_code.size;
2386
0
        ret = re_parse_term(s, is_backward_dir);
2387
0
        if (ret)
2388
0
            return ret;
2389
0
        if (is_backward_dir) {
2390
            /* reverse the order of the terms (XXX: inefficient, but
2391
               speed is not really critical here) */
2392
0
            end = s->byte_code.size;
2393
0
            term_size = end - term_start;
2394
0
            if (dbuf_claim(&s->byte_code, term_size))
2395
0
                return -1;
2396
0
            memmove(s->byte_code.buf + start + term_size,
2397
0
                    s->byte_code.buf + start,
2398
0
                    end - start);
2399
0
            memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
2400
0
                   term_size);
2401
0
        }
2402
0
    }
2403
0
    return 0;
2404
0
}
2405
2406
static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
2407
0
{
2408
0
    int start, len, pos;
2409
2410
0
    if (lre_check_stack_overflow(s->opaque, 0))
2411
0
        return re_parse_error(s, "stack overflow");
2412
2413
0
    start = s->byte_code.size;
2414
0
    if (re_parse_alternative(s, is_backward_dir))
2415
0
        return -1;
2416
0
    while (*s->buf_ptr == '|') {
2417
0
        s->buf_ptr++;
2418
2419
0
        len = s->byte_code.size - start;
2420
2421
        /* insert a split before the first alternative */
2422
0
        if (dbuf_insert(&s->byte_code, start, 5)) {
2423
0
            return re_parse_out_of_memory(s);
2424
0
        }
2425
0
        s->byte_code.buf[start] = REOP_split_next_first;
2426
0
        put_u32(s->byte_code.buf + start + 1, len + 5);
2427
2428
0
        pos = re_emit_op_u32(s, REOP_goto, 0);
2429
2430
0
        s->group_name_scope++;
2431
        
2432
0
        if (re_parse_alternative(s, is_backward_dir))
2433
0
            return -1;
2434
2435
        /* patch the goto */
2436
0
        len = s->byte_code.size - (pos + 4);
2437
0
        put_u32(s->byte_code.buf + pos, len);
2438
0
    }
2439
0
    return 0;
2440
0
}
2441
2442
/* Allocate the registers as a stack. The control flow is recursive so
2443
   the analysis can be linear. */
2444
static int compute_register_count(uint8_t *bc_buf, int bc_buf_len)
2445
0
{
2446
0
    int stack_size, stack_size_max, pos, opcode, len;
2447
0
    uint32_t val;
2448
2449
0
    stack_size = 0;
2450
0
    stack_size_max = 0;
2451
0
    bc_buf += RE_HEADER_LEN;
2452
0
    bc_buf_len -= RE_HEADER_LEN;
2453
0
    pos = 0;
2454
0
    while (pos < bc_buf_len) {
2455
0
        opcode = bc_buf[pos];
2456
0
        len = reopcode_info[opcode].size;
2457
0
        assert(opcode < REOP_COUNT);
2458
0
        assert((pos + len) <= bc_buf_len);
2459
0
        switch(opcode) {
2460
0
        case REOP_set_i32:
2461
0
        case REOP_set_char_pos:
2462
0
            bc_buf[pos + 1] = stack_size;
2463
0
            stack_size++;
2464
0
            if (stack_size > stack_size_max) {
2465
0
                if (stack_size > REGISTER_COUNT_MAX)
2466
0
                    return -1;
2467
0
                stack_size_max = stack_size;
2468
0
            }
2469
0
            break;
2470
0
        case REOP_check_advance:
2471
0
        case REOP_loop:
2472
0
        case REOP_loop_split_goto_first:
2473
0
        case REOP_loop_split_next_first:
2474
0
            assert(stack_size > 0);
2475
0
            stack_size--;
2476
0
            bc_buf[pos + 1] = stack_size;
2477
0
            break;
2478
0
        case REOP_loop_check_adv_split_goto_first:
2479
0
        case REOP_loop_check_adv_split_next_first:
2480
0
            assert(stack_size >= 2);
2481
0
            stack_size -= 2;
2482
0
            bc_buf[pos + 1] = stack_size;
2483
0
            break;
2484
0
        case REOP_range:
2485
0
        case REOP_range_i:
2486
0
            val = get_u16(bc_buf + pos + 1);
2487
0
            len += val * 4;
2488
0
            break;
2489
0
        case REOP_range32:
2490
0
        case REOP_range32_i:
2491
0
            val = get_u16(bc_buf + pos + 1);
2492
0
            len += val * 8;
2493
0
            break;
2494
0
        case REOP_back_reference:
2495
0
        case REOP_back_reference_i:
2496
0
        case REOP_backward_back_reference:
2497
0
        case REOP_backward_back_reference_i:
2498
0
            val = bc_buf[pos + 1];
2499
0
            len += val;
2500
0
            break;
2501
0
        }
2502
0
        pos += len;
2503
0
    }
2504
0
    return stack_size_max;
2505
0
}
2506
2507
static void *lre_bytecode_realloc(void *opaque, void *ptr, size_t size)
2508
0
{
2509
0
    if (size > (INT32_MAX / 2)) {
2510
        /* the bytecode cannot be larger than 2G. Leave some slack to 
2511
           avoid some overflows. */
2512
0
        return NULL;
2513
0
    } else {
2514
0
        return lre_realloc(opaque, ptr, size);
2515
0
    }
2516
0
}
2517
2518
/* 'buf' must be a zero terminated UTF-8 string of length buf_len.
2519
   Return NULL if error and allocate an error message in *perror_msg,
2520
   otherwise the compiled bytecode and its length in plen.
2521
*/
2522
uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
2523
                     const char *buf, size_t buf_len, int re_flags,
2524
                     void *opaque)
2525
0
{
2526
0
    REParseState s_s, *s = &s_s;
2527
0
    int register_count;
2528
0
    BOOL is_sticky;
2529
2530
0
    memset(s, 0, sizeof(*s));
2531
0
    s->opaque = opaque;
2532
0
    s->buf_ptr = (const uint8_t *)buf;
2533
0
    s->buf_end = s->buf_ptr + buf_len;
2534
0
    s->buf_start = s->buf_ptr;
2535
0
    s->re_flags = re_flags;
2536
0
    s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
2537
0
    is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
2538
0
    s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
2539
0
    s->multi_line = ((re_flags & LRE_FLAG_MULTILINE) != 0);
2540
0
    s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
2541
0
    s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0);
2542
0
    s->capture_count = 1;
2543
0
    s->total_capture_count = -1;
2544
0
    s->has_named_captures = -1;
2545
2546
0
    dbuf_init2(&s->byte_code, opaque, lre_bytecode_realloc);
2547
0
    dbuf_init2(&s->group_names, opaque, lre_realloc);
2548
2549
0
    dbuf_put_u16(&s->byte_code, re_flags); /* first element is the flags */
2550
0
    dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
2551
0
    dbuf_putc(&s->byte_code, 0); /* stack size */
2552
0
    dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
2553
2554
0
    if (!is_sticky) {
2555
        /* iterate thru all positions (about the same as .*?( ... ) )
2556
           .  We do it without an explicit loop so that lock step
2557
           thread execution will be possible in an optimized
2558
           implementation */
2559
0
        re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
2560
0
        re_emit_op(s, REOP_any);
2561
0
        re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
2562
0
    }
2563
0
    re_emit_op_u8(s, REOP_save_start, 0);
2564
2565
0
    if (re_parse_disjunction(s, FALSE)) {
2566
0
    error:
2567
0
        dbuf_free(&s->byte_code);
2568
0
        dbuf_free(&s->group_names);
2569
0
        pstrcpy(error_msg, error_msg_size, s->u.error_msg);
2570
0
        *plen = 0;
2571
0
        return NULL;
2572
0
    }
2573
2574
0
    re_emit_op_u8(s, REOP_save_end, 0);
2575
2576
0
    re_emit_op(s, REOP_match);
2577
2578
0
    if (*s->buf_ptr != '\0') {
2579
0
        re_parse_error(s, "extraneous characters at the end");
2580
0
        goto error;
2581
0
    }
2582
2583
0
    if (dbuf_error(&s->byte_code)) {
2584
0
        re_parse_out_of_memory(s);
2585
0
        goto error;
2586
0
    }
2587
2588
0
    register_count = compute_register_count(s->byte_code.buf, s->byte_code.size);
2589
0
    if (register_count < 0) {
2590
0
        re_parse_error(s, "too many imbricated quantifiers");
2591
0
        goto error;
2592
0
    }
2593
2594
0
    s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
2595
0
    s->byte_code.buf[RE_HEADER_REGISTER_COUNT] = register_count;
2596
0
    put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
2597
0
            s->byte_code.size - RE_HEADER_LEN);
2598
2599
    /* add the named groups if needed */
2600
0
    if (s->group_names.size > (s->capture_count - 1) * LRE_GROUP_NAME_TRAILER_LEN) {
2601
0
        dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
2602
0
        put_u16(s->byte_code.buf + RE_HEADER_FLAGS,
2603
0
                lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS);
2604
0
    }
2605
0
    dbuf_free(&s->group_names);
2606
2607
#ifdef DUMP_REOP
2608
    lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
2609
#endif
2610
2611
0
    error_msg[0] = '\0';
2612
0
    *plen = s->byte_code.size;
2613
0
    return s->byte_code.buf;
2614
0
}
2615
2616
static BOOL is_line_terminator(uint32_t c)
2617
0
{
2618
0
    return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
2619
0
}
2620
2621
#define GET_CHAR(c, cptr, cbuf_end, cbuf_type)                          \
2622
0
    do {                                                                \
2623
0
        if (cbuf_type == 0) {                                           \
2624
0
            c = *cptr++;                                                \
2625
0
        } else {                                                        \
2626
0
            const uint16_t *_p = (const uint16_t *)cptr;                \
2627
0
            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
2628
0
            c = *_p++;                                                  \
2629
0
            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
2630
0
                if (_p < _end && is_lo_surrogate(*_p)) {                \
2631
0
                    c = from_surrogate(c, *_p++);                       \
2632
0
                }                                                       \
2633
0
            }                                                           \
2634
0
            cptr = (const void *)_p;                                    \
2635
0
        }                                                               \
2636
0
    } while (0)
2637
2638
#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type)                         \
2639
0
    do {                                                                \
2640
0
        if (cbuf_type == 0) {                                           \
2641
0
            c = cptr[0];                                                \
2642
0
        } else {                                                        \
2643
0
            const uint16_t *_p = (const uint16_t *)cptr;                \
2644
0
            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
2645
0
            c = *_p++;                                                  \
2646
0
            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
2647
0
                if (_p < _end && is_lo_surrogate(*_p)) {                \
2648
0
                    c = from_surrogate(c, *_p);                         \
2649
0
                }                                                       \
2650
0
            }                                                           \
2651
0
        }                                                               \
2652
0
    } while (0)
2653
2654
#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                  \
2655
0
    do {                                                                \
2656
0
        if (cbuf_type == 0) {                                           \
2657
0
            c = cptr[-1];                                               \
2658
0
        } else {                                                        \
2659
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2660
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2661
0
            c = *_p;                                                    \
2662
0
            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
2663
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2664
0
                    c = from_surrogate(*--_p, c);                       \
2665
0
                }                                                       \
2666
0
            }                                                           \
2667
0
        }                                                               \
2668
0
    } while (0)
2669
2670
#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                   \
2671
0
    do {                                                                \
2672
0
        if (cbuf_type == 0) {                                           \
2673
0
            cptr--;                                                     \
2674
0
            c = cptr[0];                                                \
2675
0
        } else {                                                        \
2676
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2677
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2678
0
            c = *_p;                                                    \
2679
0
            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
2680
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2681
0
                    c = from_surrogate(*--_p, c);                       \
2682
0
                }                                                       \
2683
0
            }                                                           \
2684
0
            cptr = (const void *)_p;                                    \
2685
0
        }                                                               \
2686
0
    } while (0)
2687
2688
#define PREV_CHAR(cptr, cbuf_start, cbuf_type)                          \
2689
0
    do {                                                                \
2690
0
        if (cbuf_type == 0) {                                           \
2691
0
            cptr--;                                                     \
2692
0
        } else {                                                        \
2693
0
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
2694
0
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
2695
0
            if (is_lo_surrogate(*_p) && cbuf_type == 2) {               \
2696
0
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
2697
0
                    --_p;                                               \
2698
0
                }                                                       \
2699
0
            }                                                           \
2700
0
            cptr = (const void *)_p;                                    \
2701
0
        }                                                               \
2702
0
    } while (0)
2703
2704
typedef enum {
2705
    RE_EXEC_STATE_SPLIT,
2706
    RE_EXEC_STATE_LOOKAHEAD,
2707
    RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
2708
} REExecStateEnum;
2709
2710
#if INTPTR_MAX >= INT64_MAX
2711
#define BP_TYPE_BITS 3
2712
#else
2713
#define BP_TYPE_BITS 2
2714
#endif
2715
2716
typedef union {
2717
    uint8_t *ptr;
2718
    intptr_t val; /* for bp, the low BP_SHIFT bits store REExecStateEnum */
2719
    struct {
2720
        uintptr_t val : sizeof(uintptr_t) * 8 - BP_TYPE_BITS;
2721
        uintptr_t type : BP_TYPE_BITS;
2722
    } bp;
2723
} StackElem;
2724
2725
typedef struct {
2726
    const uint8_t *cbuf;
2727
    const uint8_t *cbuf_end;
2728
    /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
2729
    int cbuf_type;
2730
    int capture_count;
2731
    BOOL is_unicode;
2732
    int interrupt_counter;
2733
    void *opaque; /* used for stack overflow check */
2734
2735
    StackElem *stack_buf;
2736
    size_t stack_size;
2737
    StackElem static_stack_buf[32]; /* static stack to avoid allocation in most cases */
2738
} REExecContext;
2739
2740
static int lre_poll_timeout(REExecContext *s)
2741
0
{
2742
0
    if (unlikely(--s->interrupt_counter <= 0)) {
2743
0
        s->interrupt_counter = INTERRUPT_COUNTER_INIT;
2744
0
        if (lre_check_timeout(s->opaque))
2745
0
            return LRE_RET_TIMEOUT;
2746
0
    }
2747
0
    return 0;
2748
0
}
2749
2750
static no_inline int stack_realloc(REExecContext *s, size_t n)
2751
0
{
2752
0
    StackElem *new_stack;
2753
0
    size_t new_size;
2754
0
    new_size = s->stack_size * 3 / 2;
2755
0
    if (new_size < n)
2756
0
        new_size = n;
2757
0
    if (s->stack_buf == s->static_stack_buf) {
2758
0
        new_stack = lre_realloc(s->opaque, NULL, new_size * sizeof(StackElem));
2759
0
        if (!new_stack)
2760
0
            return -1;
2761
        /* XXX: could use correct size */
2762
0
        memcpy(new_stack, s->stack_buf, s->stack_size * sizeof(StackElem));
2763
0
    } else {
2764
0
        new_stack = lre_realloc(s->opaque, s->stack_buf, new_size * sizeof(StackElem));
2765
0
        if (!new_stack)
2766
0
            return -1;
2767
0
    }
2768
0
    s->stack_size = new_size;
2769
0
    s->stack_buf = new_stack;
2770
0
    return 0;
2771
0
}
2772
2773
/* return 1 if match, 0 if not match or < 0 if error. */
2774
static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2775
                                   const uint8_t *pc, const uint8_t *cptr)
2776
0
{
2777
0
    int opcode;
2778
0
    int cbuf_type;
2779
0
    uint32_t val, c, idx;
2780
0
    const uint8_t *cbuf_end;
2781
0
    StackElem *sp, *bp, *stack_end;
2782
#ifdef DUMP_EXEC
2783
    const uint8_t *pc_start = pc; /* TEST */
2784
#endif
2785
0
    cbuf_type = s->cbuf_type;
2786
0
    cbuf_end = s->cbuf_end;
2787
2788
0
    sp = s->stack_buf;
2789
0
    bp = s->stack_buf;
2790
0
    stack_end = s->stack_buf + s->stack_size;
2791
    
2792
0
#define CHECK_STACK_SPACE(n)                            \
2793
0
    if (unlikely((stack_end - sp) < (n))) {             \
2794
0
        size_t saved_sp = sp - s->stack_buf;            \
2795
0
        size_t saved_bp = bp - s->stack_buf;            \
2796
0
        if (stack_realloc(s, sp - s->stack_buf + (n)))  \
2797
0
            return LRE_RET_MEMORY_ERROR;                \
2798
0
        stack_end = s->stack_buf + s->stack_size;       \
2799
0
        sp = s->stack_buf + saved_sp;                   \
2800
0
        bp = s->stack_buf + saved_bp;                   \
2801
0
    }
2802
2803
    /* XXX: could test if the value was saved to reduce the stack size
2804
       but slower */
2805
0
#define SAVE_CAPTURE(idx, value)                        \
2806
0
    {                                                   \
2807
0
        CHECK_STACK_SPACE(2);                           \
2808
0
        sp[0].val = idx;                                \
2809
0
        sp[1].ptr = capture[idx];                       \
2810
0
        sp += 2;                                        \
2811
0
        capture[idx] = (value);                         \
2812
0
    }
2813
2814
    /* avoid saving the previous value if already saved */
2815
0
#define SAVE_CAPTURE_CHECK(idx, value)          \
2816
0
    {                                           \
2817
0
        StackElem *sp1;                         \
2818
0
        sp1 = sp;                               \
2819
0
        for(;;) {                               \
2820
0
            if (sp1 > bp) {                             \
2821
0
                if (sp1[-2].val == idx)                 \
2822
0
                    break;                              \
2823
0
                sp1 -= 2;                               \
2824
0
            } else {                                    \
2825
0
                CHECK_STACK_SPACE(2);                   \
2826
0
                sp[0].val = idx;                        \
2827
0
                sp[1].ptr = capture[idx];               \
2828
0
                sp += 2;                                \
2829
0
                break;                                  \
2830
0
            }                                           \
2831
0
        }                                               \
2832
0
        capture[idx] = (value);                         \
2833
0
    }
2834
2835
2836
#ifdef DUMP_EXEC
2837
    printf("%5s %5s %5s %5s %s\n", "PC", "CP", "BP", "SP", "OPCODE");
2838
#endif    
2839
0
    for(;;) {
2840
0
        opcode = *pc++;
2841
#ifdef DUMP_EXEC
2842
        printf("%5ld %5ld %5ld %5ld %s\n",
2843
               pc - 1 - pc_start,
2844
               cbuf_type == 0 ? cptr - s->cbuf : (cptr - s->cbuf) / 2,
2845
               bp - s->stack_buf,
2846
               sp - s->stack_buf,
2847
               reopcode_info[opcode].name);
2848
#endif        
2849
0
        switch(opcode) {
2850
0
        case REOP_match:
2851
0
            return 1;
2852
0
        no_match:
2853
0
            for(;;) {
2854
0
                REExecStateEnum type;
2855
0
                if (bp == s->stack_buf)
2856
0
                    return 0;
2857
                /* undo the modifications to capture[] */
2858
0
                while (sp > bp) {
2859
0
                    capture[sp[-2].val] = sp[-1].ptr;
2860
0
                    sp -= 2;
2861
0
                }
2862
                
2863
0
                pc = sp[-3].ptr;
2864
0
                cptr = sp[-2].ptr;
2865
0
                type = sp[-1].bp.type;
2866
0
                bp = s->stack_buf + sp[-1].bp.val;
2867
0
                sp -= 3;
2868
0
                if (type != RE_EXEC_STATE_LOOKAHEAD)
2869
0
                    break;
2870
0
            }
2871
0
            if (lre_poll_timeout(s))
2872
0
                return LRE_RET_TIMEOUT;
2873
0
            break;
2874
0
        case REOP_lookahead_match:
2875
            /* pop all the saved states until reaching the start of
2876
               the lookahead and keep the updated captures and
2877
               variables and the corresponding undo info. */
2878
0
            {
2879
0
                StackElem *sp1, *sp_top, *next_sp;
2880
0
                REExecStateEnum type;
2881
2882
0
                sp_top = sp;
2883
0
                for(;;) {
2884
0
                    sp1 = sp;
2885
0
                    sp = bp;
2886
0
                    pc = sp[-3].ptr;
2887
0
                    cptr = sp[-2].ptr;
2888
0
                    type = sp[-1].bp.type;
2889
0
                    bp = s->stack_buf + sp[-1].bp.val;
2890
0
                    sp[-1].ptr = (void *)sp1; /* save the next value for the copy step */
2891
0
                    sp -= 3;
2892
0
                    if (type == RE_EXEC_STATE_LOOKAHEAD)
2893
0
                        break;
2894
0
                }
2895
0
                if (sp != s->stack_buf) {
2896
                    /* keep the undo info if there is a saved state */
2897
0
                    sp1 = sp;
2898
0
                    while (sp1 < sp_top) {
2899
0
                        next_sp = (void *)sp1[2].ptr;
2900
0
                        sp1 += 3;
2901
0
                        while (sp1 < next_sp)
2902
0
                            *sp++ = *sp1++;
2903
0
                    }
2904
0
                }
2905
0
            }
2906
0
            break;
2907
0
        case REOP_negative_lookahead_match:
2908
            /* pop all the saved states until reaching start of the negative lookahead */
2909
0
            for(;;) {
2910
0
                REExecStateEnum type;
2911
0
                type = bp[-1].bp.type;
2912
                /* undo the modifications to capture[] */
2913
0
                while (sp > bp) {
2914
0
                    capture[sp[-2].val] = sp[-1].ptr;
2915
0
                    sp -= 2;
2916
0
                }
2917
0
                pc = sp[-3].ptr;
2918
0
                cptr = sp[-2].ptr;
2919
0
                type = sp[-1].bp.type;
2920
0
                bp = s->stack_buf + sp[-1].bp.val;
2921
0
                sp -= 3;
2922
0
                if (type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD)
2923
0
                    break;
2924
0
            }
2925
0
            goto no_match;
2926
0
        case REOP_char32:
2927
0
        case REOP_char32_i:
2928
0
            val = get_u32(pc);
2929
0
            pc += 4;
2930
0
            goto test_char;
2931
0
        case REOP_char:
2932
0
        case REOP_char_i:
2933
0
            val = get_u16(pc);
2934
0
            pc += 2;
2935
0
        test_char:
2936
0
            if (cptr >= cbuf_end)
2937
0
                goto no_match;
2938
0
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2939
0
            if (opcode == REOP_char_i || opcode == REOP_char32_i) {
2940
0
                c = lre_canonicalize(c, s->is_unicode);
2941
0
            }
2942
0
            if (val != c)
2943
0
                goto no_match;
2944
0
            break;
2945
0
        case REOP_split_goto_first:
2946
0
        case REOP_split_next_first:
2947
0
            {
2948
0
                const uint8_t *pc1;
2949
2950
0
                val = get_u32(pc);
2951
0
                pc += 4;
2952
0
                if (opcode == REOP_split_next_first) {
2953
0
                    pc1 = pc + (int)val;
2954
0
                } else {
2955
0
                    pc1 = pc;
2956
0
                    pc = pc + (int)val;
2957
0
                }
2958
0
                CHECK_STACK_SPACE(3);
2959
0
                sp[0].ptr = (uint8_t *)pc1;
2960
0
                sp[1].ptr = (uint8_t *)cptr;
2961
0
                sp[2].bp.val = bp - s->stack_buf;
2962
0
                sp[2].bp.type = RE_EXEC_STATE_SPLIT;
2963
0
                sp += 3;
2964
0
                bp = sp;
2965
0
            }
2966
0
            break;
2967
0
        case REOP_lookahead:
2968
0
        case REOP_negative_lookahead:
2969
0
            val = get_u32(pc);
2970
0
            pc += 4;
2971
0
            CHECK_STACK_SPACE(3);
2972
0
            sp[0].ptr = (uint8_t *)(pc + (int)val);
2973
0
            sp[1].ptr = (uint8_t *)cptr;
2974
0
            sp[2].bp.val = bp - s->stack_buf;
2975
0
            sp[2].bp.type = RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead;
2976
0
            sp += 3;
2977
0
            bp = sp;
2978
0
            break;
2979
0
        case REOP_goto:
2980
0
            val = get_u32(pc);
2981
0
            pc += 4 + (int)val;
2982
0
            if (lre_poll_timeout(s))
2983
0
                return LRE_RET_TIMEOUT;
2984
0
            break;
2985
0
        case REOP_line_start:
2986
0
        case REOP_line_start_m:
2987
0
            if (cptr == s->cbuf)
2988
0
                break;
2989
0
            if (opcode == REOP_line_start)
2990
0
                goto no_match;
2991
0
            PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2992
0
            if (!is_line_terminator(c))
2993
0
                goto no_match;
2994
0
            break;
2995
0
        case REOP_line_end:
2996
0
        case REOP_line_end_m:
2997
0
            if (cptr == cbuf_end)
2998
0
                break;
2999
0
            if (opcode == REOP_line_end)
3000
0
                goto no_match;
3001
0
            PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
3002
0
            if (!is_line_terminator(c))
3003
0
                goto no_match;
3004
0
            break;
3005
0
        case REOP_dot:
3006
0
            if (cptr == cbuf_end)
3007
0
                goto no_match;
3008
0
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3009
0
            if (is_line_terminator(c))
3010
0
                goto no_match;
3011
0
            break;
3012
0
        case REOP_any:
3013
0
            if (cptr == cbuf_end)
3014
0
                goto no_match;
3015
0
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3016
0
            break;
3017
0
        case REOP_space:
3018
0
            if (cptr == cbuf_end)
3019
0
                goto no_match;
3020
0
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3021
0
            if (!lre_is_space(c))
3022
0
                goto no_match;
3023
0
            break;
3024
0
        case REOP_not_space:
3025
0
            if (cptr == cbuf_end)
3026
0
                goto no_match;
3027
0
            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3028
0
            if (lre_is_space(c))
3029
0
                goto no_match;
3030
0
            break;
3031
0
        case REOP_save_start:
3032
0
        case REOP_save_end:
3033
0
            val = *pc++;
3034
0
            assert(val < s->capture_count);
3035
0
            idx = 2 * val + opcode - REOP_save_start;
3036
0
            SAVE_CAPTURE(idx, (uint8_t *)cptr);
3037
0
            break;
3038
0
        case REOP_save_reset:
3039
0
            {
3040
0
                uint32_t val2;
3041
0
                val = pc[0];
3042
0
                val2 = pc[1];
3043
0
                pc += 2;
3044
0
                assert(val2 < s->capture_count);
3045
0
                CHECK_STACK_SPACE(2 * (val2 - val + 1));
3046
0
                while (val <= val2) {
3047
0
                    idx = 2 * val;
3048
0
                    SAVE_CAPTURE(idx, NULL);
3049
0
                    idx = 2 * val + 1;
3050
0
                    SAVE_CAPTURE(idx, NULL);
3051
0
                    val++;
3052
0
                }
3053
0
            }
3054
0
            break;
3055
0
        case REOP_set_i32:
3056
0
            idx = 2 * s->capture_count + pc[0];
3057
0
            val = get_u32(pc + 1);
3058
0
            pc += 5;
3059
0
            SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val);
3060
0
            break;
3061
0
        case REOP_loop:
3062
0
            {
3063
0
                uint32_t val2;
3064
0
                idx = 2 * s->capture_count + pc[0];
3065
0
                val = get_u32(pc + 1);
3066
0
                pc += 5;
3067
3068
0
                val2 = (uintptr_t)capture[idx] - 1;
3069
0
                SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
3070
0
                if (val2 != 0) {
3071
0
                    pc += (int)val;
3072
0
                    if (lre_poll_timeout(s))
3073
0
                        return LRE_RET_TIMEOUT;
3074
0
                }
3075
0
            }
3076
0
            break;
3077
0
        case REOP_loop_split_goto_first:
3078
0
        case REOP_loop_split_next_first:
3079
0
        case REOP_loop_check_adv_split_goto_first:
3080
0
        case REOP_loop_check_adv_split_next_first:
3081
0
            {
3082
0
                const uint8_t *pc1;
3083
0
                uint32_t val2, limit;
3084
0
                idx = 2 * s->capture_count + pc[0];
3085
0
                limit = get_u32(pc + 1);
3086
0
                val = get_u32(pc + 5);
3087
0
                pc += 9;
3088
3089
                /* decrement the counter */
3090
0
                val2 = (uintptr_t)capture[idx] - 1;
3091
0
                SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
3092
3093
0
                if (val2 > limit) {
3094
                    /* normal loop if counter > limit */
3095
0
                    pc += (int)val;
3096
0
                    if (lre_poll_timeout(s))
3097
0
                        return LRE_RET_TIMEOUT;
3098
0
                } else {
3099
                    /* check advance */
3100
0
                    if ((opcode == REOP_loop_check_adv_split_goto_first ||
3101
0
                         opcode == REOP_loop_check_adv_split_next_first) &&
3102
0
                        capture[idx + 1] == cptr &&
3103
0
                        val2 != limit) {
3104
0
                        goto no_match;
3105
0
                    }
3106
                    
3107
                    /* otherwise conditional split */
3108
0
                    if (val2 != 0) {
3109
0
                        if (opcode == REOP_loop_split_next_first ||
3110
0
                            opcode == REOP_loop_check_adv_split_next_first) {
3111
0
                            pc1 = pc + (int)val;
3112
0
                        } else {
3113
0
                            pc1 = pc;
3114
0
                            pc = pc + (int)val;
3115
0
                        }
3116
0
                        CHECK_STACK_SPACE(3);
3117
0
                        sp[0].ptr = (uint8_t *)pc1;
3118
0
                        sp[1].ptr = (uint8_t *)cptr;
3119
0
                        sp[2].bp.val = bp - s->stack_buf;
3120
0
                        sp[2].bp.type = RE_EXEC_STATE_SPLIT;
3121
0
                        sp += 3;
3122
0
                        bp = sp;
3123
0
                    }
3124
0
                }
3125
0
            }
3126
0
            break;
3127
0
        case REOP_set_char_pos:
3128
0
            idx = 2 * s->capture_count + pc[0];
3129
0
            pc++;
3130
0
            SAVE_CAPTURE_CHECK(idx, (uint8_t *)cptr);
3131
0
            break;
3132
0
        case REOP_check_advance:
3133
0
            idx = 2 * s->capture_count + pc[0];
3134
0
            pc++;
3135
0
            if (capture[idx] == cptr)
3136
0
                goto no_match;
3137
0
            break;
3138
0
        case REOP_word_boundary:
3139
0
        case REOP_word_boundary_i:
3140
0
        case REOP_not_word_boundary:
3141
0
        case REOP_not_word_boundary_i:
3142
0
            {
3143
0
                BOOL v1, v2;
3144
0
                int ignore_case = (opcode == REOP_word_boundary_i || opcode == REOP_not_word_boundary_i);
3145
0
                BOOL is_boundary = (opcode == REOP_word_boundary || opcode == REOP_word_boundary_i);
3146
                /* char before */
3147
0
                if (cptr == s->cbuf) {
3148
0
                    v1 = FALSE;
3149
0
                } else {
3150
0
                    PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
3151
0
                    if (c < 256) {
3152
0
                        v1 = (lre_is_word_byte(c) != 0);
3153
0
                    } else {
3154
0
                        v1 = ignore_case && (c == 0x017f || c == 0x212a);
3155
0
                    }
3156
0
                }
3157
                /* current char */
3158
0
                if (cptr >= cbuf_end) {
3159
0
                    v2 = FALSE;
3160
0
                } else {
3161
0
                    PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
3162
0
                    if (c < 256) {
3163
0
                        v2 = (lre_is_word_byte(c) != 0);
3164
0
                    } else {
3165
0
                        v2 = ignore_case && (c == 0x017f || c == 0x212a);
3166
0
                    }
3167
0
                }
3168
0
                if (v1 ^ v2 ^ is_boundary)
3169
0
                    goto no_match;
3170
0
            }
3171
0
            break;
3172
0
        case REOP_back_reference:
3173
0
        case REOP_back_reference_i:
3174
0
        case REOP_backward_back_reference:
3175
0
        case REOP_backward_back_reference_i:
3176
0
            {
3177
0
                const uint8_t *cptr1, *cptr1_end, *cptr1_start;
3178
0
                const uint8_t *pc1;
3179
0
                uint32_t c1, c2;
3180
0
                int i, n;
3181
3182
0
                n = *pc++;
3183
0
                pc1 = pc;
3184
0
                pc += n;
3185
3186
0
                for(i = 0; i < n; i++) {
3187
0
                    val = pc1[i];
3188
0
                    if (val >= s->capture_count)
3189
0
                        goto no_match;
3190
0
                    cptr1_start = capture[2 * val];
3191
0
                    cptr1_end = capture[2 * val + 1];
3192
                    /* test the first not empty capture */
3193
0
                    if (cptr1_start && cptr1_end) {
3194
0
                        if (opcode == REOP_back_reference ||
3195
0
                            opcode == REOP_back_reference_i) {
3196
0
                            cptr1 = cptr1_start;
3197
0
                            while (cptr1 < cptr1_end) {
3198
0
                                if (cptr >= cbuf_end)
3199
0
                                    goto no_match;
3200
0
                                GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
3201
0
                                GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
3202
0
                                if (opcode == REOP_back_reference_i) {
3203
0
                                    c1 = lre_canonicalize(c1, s->is_unicode);
3204
0
                                    c2 = lre_canonicalize(c2, s->is_unicode);
3205
0
                                }
3206
0
                                if (c1 != c2)
3207
0
                                    goto no_match;
3208
0
                            }
3209
0
                        } else {
3210
0
                            cptr1 = cptr1_end;
3211
0
                            while (cptr1 > cptr1_start) {
3212
0
                                if (cptr == s->cbuf)
3213
0
                                    goto no_match;
3214
0
                                GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
3215
0
                                GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
3216
0
                                if (opcode == REOP_backward_back_reference_i) {
3217
0
                                    c1 = lre_canonicalize(c1, s->is_unicode);
3218
0
                                    c2 = lre_canonicalize(c2, s->is_unicode);
3219
0
                                }
3220
0
                                if (c1 != c2)
3221
0
                                    goto no_match;
3222
0
                            }
3223
0
                        }
3224
0
                        break;
3225
0
                    }
3226
0
                }
3227
0
            }
3228
0
            break;
3229
0
        case REOP_range:
3230
0
        case REOP_range_i:
3231
0
            {
3232
0
                int n;
3233
0
                uint32_t low, high, idx_min, idx_max, idx;
3234
3235
0
                n = get_u16(pc); /* n must be >= 1 */
3236
0
                pc += 2;
3237
0
                if (cptr >= cbuf_end)
3238
0
                    goto no_match;
3239
0
                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3240
0
                if (opcode == REOP_range_i) {
3241
0
                    c = lre_canonicalize(c, s->is_unicode);
3242
0
                }
3243
0
                idx_min = 0;
3244
0
                low = get_u16(pc + 0 * 4);
3245
0
                if (c < low)
3246
0
                    goto no_match;
3247
0
                idx_max = n - 1;
3248
0
                high = get_u16(pc + idx_max * 4 + 2);
3249
                /* 0xffff in for last value means +infinity */
3250
0
                if (unlikely(c >= 0xffff) && high == 0xffff)
3251
0
                    goto range_match;
3252
0
                if (c > high)
3253
0
                    goto no_match;
3254
0
                while (idx_min <= idx_max) {
3255
0
                    idx = (idx_min + idx_max) / 2;
3256
0
                    low = get_u16(pc + idx * 4);
3257
0
                    high = get_u16(pc + idx * 4 + 2);
3258
0
                    if (c < low)
3259
0
                        idx_max = idx - 1;
3260
0
                    else if (c > high)
3261
0
                        idx_min = idx + 1;
3262
0
                    else
3263
0
                        goto range_match;
3264
0
                }
3265
0
                goto no_match;
3266
0
            range_match:
3267
0
                pc += 4 * n;
3268
0
            }
3269
0
            break;
3270
0
        case REOP_range32:
3271
0
        case REOP_range32_i:
3272
0
            {
3273
0
                int n;
3274
0
                uint32_t low, high, idx_min, idx_max, idx;
3275
3276
0
                n = get_u16(pc); /* n must be >= 1 */
3277
0
                pc += 2;
3278
0
                if (cptr >= cbuf_end)
3279
0
                    goto no_match;
3280
0
                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3281
0
                if (opcode == REOP_range32_i) {
3282
0
                    c = lre_canonicalize(c, s->is_unicode);
3283
0
                }
3284
0
                idx_min = 0;
3285
0
                low = get_u32(pc + 0 * 8);
3286
0
                if (c < low)
3287
0
                    goto no_match;
3288
0
                idx_max = n - 1;
3289
0
                high = get_u32(pc + idx_max * 8 + 4);
3290
0
                if (c > high)
3291
0
                    goto no_match;
3292
0
                while (idx_min <= idx_max) {
3293
0
                    idx = (idx_min + idx_max) / 2;
3294
0
                    low = get_u32(pc + idx * 8);
3295
0
                    high = get_u32(pc + idx * 8 + 4);
3296
0
                    if (c < low)
3297
0
                        idx_max = idx - 1;
3298
0
                    else if (c > high)
3299
0
                        idx_min = idx + 1;
3300
0
                    else
3301
0
                        goto range32_match;
3302
0
                }
3303
0
                goto no_match;
3304
0
            range32_match:
3305
0
                pc += 8 * n;
3306
0
            }
3307
0
            break;
3308
0
        case REOP_prev:
3309
            /* go to the previous char */
3310
0
            if (cptr == s->cbuf)
3311
0
                goto no_match;
3312
0
            PREV_CHAR(cptr, s->cbuf, cbuf_type);
3313
0
            break;
3314
0
        default:
3315
#ifdef DUMP_EXEC
3316
            printf("unknown opcode pc=%ld\n", pc - 1 - pc_start);
3317
#endif            
3318
0
            abort();
3319
0
        }
3320
0
    }
3321
0
}
3322
3323
/* Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the
3324
   starting position of the match and must be such as 0 <= cindex <=
3325
   clen. */
3326
int lre_exec(uint8_t **capture,
3327
             const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
3328
             int cbuf_type, void *opaque)
3329
0
{
3330
0
    REExecContext s_s, *s = &s_s;
3331
0
    int re_flags, i, ret;
3332
0
    const uint8_t *cptr;
3333
3334
0
    re_flags = lre_get_flags(bc_buf);
3335
0
    s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
3336
0
    s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
3337
0
    s->cbuf = cbuf;
3338
0
    s->cbuf_end = cbuf + (clen << cbuf_type);
3339
0
    s->cbuf_type = cbuf_type;
3340
0
    if (s->cbuf_type == 1 && s->is_unicode)
3341
0
        s->cbuf_type = 2;
3342
0
    s->interrupt_counter = INTERRUPT_COUNTER_INIT;
3343
0
    s->opaque = opaque;
3344
3345
0
    s->stack_buf = s->static_stack_buf;
3346
0
    s->stack_size = countof(s->static_stack_buf);
3347
3348
0
    for(i = 0; i < s->capture_count * 2; i++)
3349
0
        capture[i] = NULL;
3350
3351
0
    cptr = cbuf + (cindex << cbuf_type);
3352
0
    if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
3353
0
        const uint16_t *p = (const uint16_t *)cptr;
3354
0
        if (is_lo_surrogate(*p) && is_hi_surrogate(p[-1])) {
3355
0
            cptr = (const uint8_t *)(p - 1);
3356
0
        }
3357
0
    }
3358
3359
0
    ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr);
3360
3361
0
    if (s->stack_buf != s->static_stack_buf)
3362
0
        lre_realloc(s->opaque, s->stack_buf, 0);
3363
0
    return ret;
3364
0
}
3365
3366
int lre_get_alloc_count(const uint8_t *bc_buf)
3367
0
{
3368
0
    return bc_buf[RE_HEADER_CAPTURE_COUNT] * 2 +
3369
0
        bc_buf[RE_HEADER_REGISTER_COUNT];
3370
0
}
3371
3372
int lre_get_capture_count(const uint8_t *bc_buf)
3373
0
{
3374
0
    return bc_buf[RE_HEADER_CAPTURE_COUNT];
3375
0
}
3376
3377
int lre_get_flags(const uint8_t *bc_buf)
3378
0
{
3379
0
    return get_u16(bc_buf + RE_HEADER_FLAGS);
3380
0
}
3381
3382
/* Return NULL if no group names. Otherwise, return a pointer to
3383
   'capture_count - 1' zero terminated UTF-8 strings. */
3384
const char *lre_get_groupnames(const uint8_t *bc_buf)
3385
0
{
3386
0
    uint32_t re_bytecode_len;
3387
0
    if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
3388
0
        return NULL;
3389
0
    re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
3390
0
    return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
3391
0
}
3392
3393
#ifdef TEST
3394
3395
BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
3396
{
3397
    return FALSE;
3398
}
3399
3400
void *lre_realloc(void *opaque, void *ptr, size_t size)
3401
{
3402
    return realloc(ptr, size);
3403
}
3404
3405
int main(int argc, char **argv)
3406
{
3407
    int len, flags, ret, i;
3408
    uint8_t *bc;
3409
    char error_msg[64];
3410
    uint8_t *capture;
3411
    const char *input;
3412
    int input_len, capture_count;
3413
3414
    if (argc < 4) {
3415
        printf("usage: %s regexp flags input\n", argv[0]);
3416
        return 1;
3417
    }
3418
    flags = atoi(argv[2]);
3419
    bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
3420
                     strlen(argv[1]), flags, NULL);
3421
    if (!bc) {
3422
        fprintf(stderr, "error: %s\n", error_msg);
3423
        exit(1);
3424
    }
3425
3426
    input = argv[3];
3427
    input_len = strlen(input);
3428
3429
    capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc));
3430
    ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
3431
    printf("ret=%d\n", ret);
3432
    if (ret == 1) {
3433
        capture_count = lre_get_capture_count(bc);
3434
        for(i = 0; i < 2 * capture_count; i++) {
3435
            uint8_t *ptr;
3436
            ptr = capture[i];
3437
            printf("%d: ", i);
3438
            if (!ptr)
3439
                printf("<nil>");
3440
            else
3441
                printf("%u", (int)(ptr - (uint8_t *)input));
3442
            printf("\n");
3443
        }
3444
    }
3445
    free(capture);
3446
    return 0;
3447
}
3448
#endif