Coverage Report

Created: 2026-01-09 07:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/librdkafka/src/regexp.c
Line
Count
Source
1
/**
2
 * Copyright: public domain
3
 *
4
 * From https://github.com/ccxvii/minilibs sha
5
 * 875c33568b5a4aa4fb3dd0c52ea98f7f0e5ca684:
6
 *
7
 * These libraries are in the public domain (or the equivalent where that is not
8
 * possible). You can do anything you want with them. You have no legal
9
 * obligation to do anything else, although I appreciate attribution.
10
 */
11
12
#include "rd.h"
13
14
#include <stdlib.h>
15
#include <string.h>
16
#include <setjmp.h>
17
#include <stdio.h>
18
19
#include "regexp.h"
20
21
10.2k
#define nelem(a) (sizeof(a) / sizeof(a)[0])
22
23
typedef unsigned int Rune;
24
25
528
static int isalpharune(Rune c) {
26
        /* TODO: Add unicode support */
27
528
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
28
528
}
29
30
0
static Rune toupperrune(Rune c) {
31
        /* TODO: Add unicode support */
32
0
        if (c >= 'a' && c <= 'z')
33
0
                return c - 'a' + 'A';
34
0
        return c;
35
0
}
36
37
256k
static int chartorune(Rune *r, const char *s) {
38
        /* TODO: Add UTF-8 decoding */
39
256k
        *r = *s;
40
256k
        return 1;
41
256k
}
42
43
28.3k
#define REPINF    255
44
0
#define MAXTHREAD 1000
45
36.3k
#define MAXSUB    REG_MAXSUB
46
47
typedef struct Reclass Reclass;
48
typedef struct Renode Renode;
49
typedef struct Reinst Reinst;
50
typedef struct Rethread Rethread;
51
typedef struct Restate Restate;
52
53
struct Reclass {
54
        Rune *end;
55
        Rune spans[64];
56
};
57
58
struct Restate {
59
        Reprog *prog;
60
        Renode *pstart, *pend;
61
62
        const char *source;
63
        unsigned int ncclass;
64
        unsigned int nsub;
65
        Renode *sub[MAXSUB];
66
67
        int lookahead;
68
        Rune yychar;
69
        Reclass *yycc;
70
        int yymin, yymax;
71
72
        const char *error;
73
        jmp_buf kaboom;
74
};
75
76
struct Reprog {
77
        Reinst *start, *end;
78
        int flags;
79
        unsigned int nsub;
80
        Reclass cclass[16];
81
        Restate g; /**< Upstream has this as a global variable */
82
};
83
84
1.14k
static void die(Restate *g, const char *message) {
85
1.14k
        g->error = message;
86
1.14k
        longjmp(g->kaboom, 1);
87
1.14k
}
88
89
0
static Rune canon(Rune c) {
90
0
        Rune u = toupperrune(c);
91
0
        if (c >= 128 && u < 128)
92
0
                return c;
93
0
        return u;
94
0
}
95
96
/* Scan */
97
98
enum {
99
        L_CHAR = 256,
100
        L_CCLASS,  /* character class */
101
        L_NCCLASS, /* negative character class */
102
        L_NC,      /* "(?:" no capture */
103
        L_PLA,     /* "(?=" positive lookahead */
104
        L_NLA,     /* "(?!" negative lookahead */
105
        L_WORD,    /* "\b" word boundary */
106
        L_NWORD,   /* "\B" non-word boundary */
107
        L_REF,     /* "\1" back-reference */
108
        L_COUNT    /* {M,N} */
109
};
110
111
9.37k
static int hex(Restate *g, int c) {
112
9.37k
        if (c >= '0' && c <= '9')
113
3.96k
                return c - '0';
114
5.41k
        if (c >= 'a' && c <= 'f')
115
1.97k
                return c - 'a' + 0xA;
116
3.44k
        if (c >= 'A' && c <= 'F')
117
3.16k
                return c - 'A' + 0xA;
118
276
        die(g, "invalid escape sequence");
119
276
        return 0;
120
3.44k
}
121
122
5.62k
static int dec(Restate *g, int c) {
123
5.62k
        if (c >= '0' && c <= '9')
124
5.50k
                return c - '0';
125
117
        die(g, "invalid quantifier");
126
117
        return 0;
127
5.62k
}
128
129
5.25k
#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
130
131
245k
static int nextrune(Restate *g) {
132
245k
        g->source += chartorune(&g->yychar, g->source);
133
245k
        if (g->yychar == '\\') {
134
11.1k
                g->source += chartorune(&g->yychar, g->source);
135
11.1k
                switch (g->yychar) {
136
7
                case 0:
137
7
                        die(g, "unterminated escape sequence");
138
1.35k
                case 'f':
139
1.35k
                        g->yychar = '\f';
140
1.35k
                        return 0;
141
327
                case 'n':
142
327
                        g->yychar = '\n';
143
327
                        return 0;
144
203
                case 'r':
145
203
                        g->yychar = '\r';
146
203
                        return 0;
147
201
                case 't':
148
201
                        g->yychar = '\t';
149
201
                        return 0;
150
194
                case 'v':
151
194
                        g->yychar = '\v';
152
194
                        return 0;
153
378
                case 'c':
154
378
                        g->yychar = (*g->source++) & 31;
155
378
                        return 0;
156
1.58k
                case 'x':
157
1.58k
                        g->yychar = hex(g, *g->source++) << 4;
158
1.58k
                        g->yychar += hex(g, *g->source++);
159
1.58k
                        if (g->yychar == 0) {
160
496
                                g->yychar = '0';
161
496
                                return 1;
162
496
                        }
163
1.09k
                        return 0;
164
1.63k
                case 'u':
165
1.63k
                        g->yychar = hex(g, *g->source++) << 12;
166
1.63k
                        g->yychar += hex(g, *g->source++) << 8;
167
1.63k
                        g->yychar += hex(g, *g->source++) << 4;
168
1.63k
                        g->yychar += hex(g, *g->source++);
169
1.63k
                        if (g->yychar == 0) {
170
533
                                g->yychar = '0';
171
533
                                return 1;
172
533
                        }
173
1.09k
                        return 0;
174
11.1k
                }
175
5.25k
                if (strchr(ESCAPES, g->yychar))
176
4.72k
                        return 1;
177
528
                if (isalpharune(g->yychar) ||
178
517
                    g->yychar == '_') /* check identity escape */
179
12
                        die(g, "invalid escape character");
180
528
                return 0;
181
5.25k
        }
182
234k
        return 0;
183
245k
}
184
185
2.03k
static int lexcount(Restate *g) {
186
2.03k
        g->yychar = *g->source++;
187
188
2.03k
        g->yymin  = dec(g, g->yychar);
189
2.03k
        g->yychar = *g->source++;
190
4.04k
        while (g->yychar != ',' && g->yychar != '}') {
191
2.01k
                g->yymin  = g->yymin * 10 + dec(g, g->yychar);
192
2.01k
                g->yychar = *g->source++;
193
2.01k
        }
194
2.03k
        if (g->yymin >= REPINF)
195
18
                die(g, "numeric overflow");
196
197
2.03k
        if (g->yychar == ',') {
198
831
                g->yychar = *g->source++;
199
831
                if (g->yychar == '}') {
200
251
                        g->yymax = REPINF;
201
580
                } else {
202
580
                        g->yymax  = dec(g, g->yychar);
203
580
                        g->yychar = *g->source++;
204
1.57k
                        while (g->yychar != '}') {
205
997
                                g->yymax  = g->yymax * 10 + dec(g, g->yychar);
206
997
                                g->yychar = *g->source++;
207
997
                        }
208
580
                        if (g->yymax >= REPINF)
209
19
                                die(g, "numeric overflow");
210
580
                }
211
1.20k
        } else {
212
1.20k
                g->yymax = g->yymin;
213
1.20k
        }
214
215
2.03k
        return L_COUNT;
216
2.03k
}
217
218
1.37k
static void newcclass(Restate *g) {
219
1.37k
        if (g->ncclass >= nelem(g->prog->cclass))
220
7
                die(g, "too many character classes");
221
1.37k
        g->yycc      = g->prog->cclass + g->ncclass++;
222
1.37k
        g->yycc->end = g->yycc->spans;
223
1.37k
}
224
225
8.90k
static void addrange(Restate *g, Rune a, Rune b) {
226
8.90k
        if (a > b)
227
46
                die(g, "invalid character class range");
228
8.90k
        if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
229
44
                die(g, "too many character class ranges");
230
8.90k
        *g->yycc->end++ = a;
231
8.90k
        *g->yycc->end++ = b;
232
8.90k
}
233
234
708
static void addranges_d(Restate *g) {
235
708
        addrange(g, '0', '9');
236
708
}
237
238
278
static void addranges_D(Restate *g) {
239
278
        addrange(g, 0, '0' - 1);
240
278
        addrange(g, '9' + 1, 0xFFFF);
241
278
}
242
243
468
static void addranges_s(Restate *g) {
244
468
        addrange(g, 0x9, 0x9);
245
468
        addrange(g, 0xA, 0xD);
246
468
        addrange(g, 0x20, 0x20);
247
468
        addrange(g, 0xA0, 0xA0);
248
468
        addrange(g, 0x2028, 0x2029);
249
468
        addrange(g, 0xFEFF, 0xFEFF);
250
468
}
251
252
4
static void addranges_S(Restate *g) {
253
4
        addrange(g, 0, 0x9 - 1);
254
4
        addrange(g, 0x9 + 1, 0xA - 1);
255
4
        addrange(g, 0xD + 1, 0x20 - 1);
256
4
        addrange(g, 0x20 + 1, 0xA0 - 1);
257
4
        addrange(g, 0xA0 + 1, 0x2028 - 1);
258
4
        addrange(g, 0x2029 + 1, 0xFEFF - 1);
259
4
        addrange(g, 0xFEFF + 1, 0xFFFF);
260
4
}
261
262
362
static void addranges_w(Restate *g) {
263
362
        addrange(g, '0', '9');
264
362
        addrange(g, 'A', 'Z');
265
362
        addrange(g, '_', '_');
266
362
        addrange(g, 'a', 'z');
267
362
}
268
269
90
static void addranges_W(Restate *g) {
270
90
        addrange(g, 0, '0' - 1);
271
90
        addrange(g, '9' + 1, 'A' - 1);
272
90
        addrange(g, 'Z' + 1, '_' - 1);
273
90
        addrange(g, '_' + 1, 'a' - 1);
274
90
        addrange(g, 'z' + 1, 0xFFFF);
275
90
}
276
277
763
static int lexclass(Restate *g) {
278
763
        int type = L_CCLASS;
279
763
        int quoted, havesave, havedash;
280
763
        Rune save = 0;
281
282
763
        newcclass(g);
283
284
763
        quoted = nextrune(g);
285
763
        if (!quoted && g->yychar == '^') {
286
36
                type   = L_NCCLASS;
287
36
                quoted = nextrune(g);
288
36
        }
289
290
763
        havesave = havedash = 0;
291
6.81k
        for (;;) {
292
6.81k
                if (g->yychar == 0)
293
235
                        die(g, "unterminated character class");
294
6.81k
                if (!quoted && g->yychar == ']')
295
439
                        break;
296
297
6.37k
                if (!quoted && g->yychar == '-') {
298
2.05k
                        if (havesave) {
299
1.53k
                                if (havedash) {
300
446
                                        addrange(g, save, '-');
301
446
                                        havesave = havedash = 0;
302
1.09k
                                } else {
303
1.09k
                                        havedash = 1;
304
1.09k
                                }
305
1.53k
                        } else {
306
514
                                save     = '-';
307
514
                                havesave = 1;
308
514
                        }
309
4.32k
                } else if (quoted && strchr("DSWdsw", g->yychar)) {
310
1.31k
                        if (havesave) {
311
441
                                addrange(g, save, save);
312
441
                                if (havedash)
313
195
                                        addrange(g, '-', '-');
314
441
                        }
315
1.31k
                        switch (g->yychar) {
316
529
                        case 'd':
317
529
                                addranges_d(g);
318
529
                                break;
319
217
                        case 's':
320
217
                                addranges_s(g);
321
217
                                break;
322
190
                        case 'w':
323
190
                                addranges_w(g);
324
190
                                break;
325
278
                        case 'D':
326
278
                                addranges_D(g);
327
278
                                break;
328
4
                        case 'S':
329
4
                                addranges_S(g);
330
4
                                break;
331
90
                        case 'W':
332
90
                                addranges_W(g);
333
90
                                break;
334
1.31k
                        }
335
1.27k
                        havesave = havedash = 0;
336
3.01k
                } else {
337
3.01k
                        if (quoted) {
338
694
                                if (g->yychar == 'b')
339
194
                                        g->yychar = '\b';
340
500
                                else if (g->yychar == '0')
341
217
                                        g->yychar = 0;
342
                                /* else identity escape */
343
694
                        }
344
3.01k
                        if (havesave) {
345
1.68k
                                if (havedash) {
346
404
                                        addrange(g, save, g->yychar);
347
404
                                        havesave = havedash = 0;
348
1.28k
                                } else {
349
1.28k
                                        addrange(g, save, save);
350
1.28k
                                        save = g->yychar;
351
1.28k
                                }
352
1.68k
                        } else {
353
1.32k
                                save     = g->yychar;
354
1.32k
                                havesave = 1;
355
1.32k
                        }
356
3.01k
                }
357
358
6.34k
                quoted = nextrune(g);
359
6.34k
        }
360
361
726
        if (havesave) {
362
174
                addrange(g, save, save);
363
174
                if (havedash)
364
37
                        addrange(g, '-', '-');
365
174
        }
366
367
726
        return type;
368
763
}
369
370
238k
static int lex(Restate *g) {
371
238k
        int quoted = nextrune(g);
372
238k
        if (quoted) {
373
3.74k
                switch (g->yychar) {
374
201
                case 'b':
375
201
                        return L_WORD;
376
396
                case 'B':
377
396
                        return L_NWORD;
378
92
                case 'd':
379
92
                        newcclass(g);
380
92
                        addranges_d(g);
381
92
                        return L_CCLASS;
382
123
                case 's':
383
123
                        newcclass(g);
384
123
                        addranges_s(g);
385
123
                        return L_CCLASS;
386
105
                case 'w':
387
105
                        newcclass(g);
388
105
                        addranges_w(g);
389
105
                        return L_CCLASS;
390
89
                case 'D':
391
89
                        newcclass(g);
392
89
                        addranges_d(g);
393
89
                        return L_NCCLASS;
394
130
                case 'S':
395
130
                        newcclass(g);
396
130
                        addranges_s(g);
397
130
                        return L_NCCLASS;
398
69
                case 'W':
399
69
                        newcclass(g);
400
69
                        addranges_w(g);
401
69
                        return L_NCCLASS;
402
1.25k
                case '0':
403
1.25k
                        g->yychar = 0;
404
1.25k
                        return L_CHAR;
405
3.74k
                }
406
1.28k
                if (g->yychar >= '0' && g->yychar <= '9') {
407
865
                        g->yychar -= '0';
408
865
                        if (*g->source >= '0' && *g->source <= '9')
409
200
                                g->yychar = g->yychar * 10 + *g->source++ - '0';
410
865
                        return L_REF;
411
865
                }
412
418
                return L_CHAR;
413
1.28k
        }
414
415
234k
        switch (g->yychar) {
416
1.16k
        case 0:
417
71.2k
        case '$':
418
73.3k
        case ')':
419
75.9k
        case '*':
420
76.9k
        case '+':
421
77.9k
        case '.':
422
80.8k
        case '?':
423
105k
        case '^':
424
110k
        case '|':
425
110k
                return g->yychar;
426
234k
        }
427
428
124k
        if (g->yychar == '{')
429
2.03k
                return lexcount(g);
430
122k
        if (g->yychar == '[')
431
763
                return lexclass(g);
432
121k
        if (g->yychar == '(') {
433
3.53k
                if (g->source[0] == '?') {
434
2.34k
                        if (g->source[1] == ':') {
435
668
                                g->source += 2;
436
668
                                return L_NC;
437
668
                        }
438
1.67k
                        if (g->source[1] == '=') {
439
671
                                g->source += 2;
440
671
                                return L_PLA;
441
671
                        }
442
1.00k
                        if (g->source[1] == '!') {
443
984
                                g->source += 2;
444
984
                                return L_NLA;
445
984
                        }
446
1.00k
                }
447
1.21k
                return '(';
448
3.53k
        }
449
450
118k
        return L_CHAR;
451
121k
}
452
453
/* Parse */
454
455
enum {
456
        P_CAT,
457
        P_ALT,
458
        P_REP,
459
        P_BOL,
460
        P_EOL,
461
        P_WORD,
462
        P_NWORD,
463
        P_PAR,
464
        P_PLA,
465
        P_NLA,
466
        P_ANY,
467
        P_CHAR,
468
        P_CCLASS,
469
        P_NCCLASS,
470
        P_REF
471
};
472
473
struct Renode {
474
        unsigned char type;
475
        unsigned char ng, m, n;
476
        Rune c;
477
        Reclass *cc;
478
        Renode *x;
479
        Renode *y;
480
};
481
482
449k
static Renode *newnode(Restate *g, int type) {
483
449k
        Renode *node = g->pend++;
484
449k
        node->type   = type;
485
449k
        node->cc     = NULL;
486
449k
        node->c      = 0;
487
449k
        node->ng     = 0;
488
449k
        node->m      = 0;
489
449k
        node->n      = 0;
490
449k
        node->x = node->y = NULL;
491
449k
        return node;
492
449k
}
493
494
44.5k
static int empty(Renode *node) {
495
44.5k
        if (!node)
496
1.63k
                return 1;
497
42.9k
        switch (node->type) {
498
216
        default:
499
216
                return 1;
500
30.1k
        case P_CAT:
501
30.1k
                return empty(node->x) && empty(node->y);
502
911
        case P_ALT:
503
911
                return empty(node->x) || empty(node->y);
504
1.43k
        case P_REP:
505
1.43k
                return empty(node->x) || node->m == 0;
506
3.27k
        case P_PAR:
507
3.27k
                return empty(node->x);
508
1.75k
        case P_REF:
509
1.75k
                return empty(node->x);
510
684
        case P_ANY:
511
4.77k
        case P_CHAR:
512
5.04k
        case P_CCLASS:
513
5.24k
        case P_NCCLASS:
514
5.24k
                return 0;
515
42.9k
        }
516
42.9k
}
517
518
6.20k
static Renode *newrep(Restate *g, Renode *atom, int ng, int min, int max) {
519
6.20k
        Renode *rep = newnode(g, P_REP);
520
6.20k
        if (max == REPINF && empty(atom))
521
37
                die(g, "infinite loop matching the empty string");
522
6.20k
        rep->ng = ng;
523
6.20k
        rep->m  = min;
524
6.20k
        rep->n  = max;
525
6.20k
        rep->x  = atom;
526
6.20k
        return rep;
527
6.20k
}
528
529
238k
static void next(Restate *g) {
530
238k
        g->lookahead = lex(g);
531
238k
}
532
533
1.06M
static int re_accept(Restate *g, int t) {
534
1.06M
        if (g->lookahead == t) {
535
112k
                next(g);
536
112k
                return 1;
537
112k
        }
538
952k
        return 0;
539
1.06M
}
540
541
static Renode *parsealt(Restate *g);
542
543
126k
static Renode *parseatom(Restate *g) {
544
126k
        Renode *atom;
545
126k
        if (g->lookahead == L_CHAR) {
546
119k
                atom    = newnode(g, P_CHAR);
547
119k
                atom->c = g->yychar;
548
119k
                next(g);
549
119k
                return atom;
550
119k
        }
551
6.58k
        if (g->lookahead == L_CCLASS) {
552
722
                atom     = newnode(g, P_CCLASS);
553
722
                atom->cc = g->yycc;
554
722
                next(g);
555
722
                return atom;
556
722
        }
557
5.86k
        if (g->lookahead == L_NCCLASS) {
558
315
                atom     = newnode(g, P_NCCLASS);
559
315
                atom->cc = g->yycc;
560
315
                next(g);
561
315
                return atom;
562
315
        }
563
5.54k
        if (g->lookahead == L_REF) {
564
865
                atom = newnode(g, P_REF);
565
865
                if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
566
29
                        die(g, "invalid back-reference");
567
865
                atom->n = g->yychar;
568
865
                atom->x = g->sub[g->yychar];
569
865
                next(g);
570
865
                return atom;
571
865
        }
572
4.68k
        if (re_accept(g, '.'))
573
1.01k
                return newnode(g, P_ANY);
574
3.67k
        if (re_accept(g, '(')) {
575
1.20k
                atom = newnode(g, P_PAR);
576
1.20k
                if (g->nsub == MAXSUB)
577
4
                        die(g, "too many captures");
578
1.20k
                atom->n         = g->nsub++;
579
1.20k
                atom->x         = parsealt(g);
580
1.20k
                g->sub[atom->n] = atom;
581
1.20k
                if (!re_accept(g, ')'))
582
62
                        die(g, "unmatched '('");
583
1.20k
                return atom;
584
1.20k
        }
585
2.46k
        if (re_accept(g, L_NC)) {
586
668
                atom = parsealt(g);
587
668
                if (!re_accept(g, ')'))
588
12
                        die(g, "unmatched '('");
589
668
                return atom;
590
668
        }
591
1.79k
        if (re_accept(g, L_PLA)) {
592
671
                atom    = newnode(g, P_PLA);
593
671
                atom->x = parsealt(g);
594
671
                if (!re_accept(g, ')'))
595
9
                        die(g, "unmatched '('");
596
671
                return atom;
597
671
        }
598
1.12k
        if (re_accept(g, L_NLA)) {
599
984
                atom    = newnode(g, P_NLA);
600
984
                atom->x = parsealt(g);
601
984
                if (!re_accept(g, ')'))
602
9
                        die(g, "unmatched '('");
603
984
                return atom;
604
984
        }
605
138
        die(g, "syntax error");
606
138
        return NULL;
607
1.12k
}
608
609
221k
static Renode *parserep(Restate *g) {
610
221k
        Renode *atom;
611
612
221k
        if (re_accept(g, '^'))
613
24.7k
                return newnode(g, P_BOL);
614
197k
        if (re_accept(g, '$'))
615
70.1k
                return newnode(g, P_EOL);
616
127k
        if (re_accept(g, L_WORD))
617
201
                return newnode(g, P_WORD);
618
126k
        if (re_accept(g, L_NWORD))
619
396
                return newnode(g, P_NWORD);
620
621
126k
        atom = parseatom(g);
622
126k
        if (g->lookahead == L_COUNT) {
623
1.79k
                int min = g->yymin, max = g->yymax;
624
1.79k
                next(g);
625
1.79k
                if (max < min)
626
5
                        die(g, "invalid quantifier");
627
1.79k
                return newrep(g, atom, re_accept(g, '?'), min, max);
628
1.79k
        }
629
124k
        if (re_accept(g, '*'))
630
2.59k
                return newrep(g, atom, re_accept(g, '?'), 0, REPINF);
631
122k
        if (re_accept(g, '+'))
632
1.01k
                return newrep(g, atom, re_accept(g, '?'), 1, REPINF);
633
121k
        if (re_accept(g, '?'))
634
800
                return newrep(g, atom, re_accept(g, '?'), 0, 1);
635
120k
        return atom;
636
121k
}
637
638
9.46k
static Renode *parsecat(Restate *g) {
639
9.46k
        Renode *cat, *x;
640
9.46k
        if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
641
4.58k
                cat = parserep(g);
642
221k
                while (g->lookahead && g->lookahead != '|' &&
643
218k
                       g->lookahead != ')') {
644
217k
                        x      = cat;
645
217k
                        cat    = newnode(g, P_CAT);
646
217k
                        cat->x = x;
647
217k
                        cat->y = parserep(g);
648
217k
                }
649
4.58k
                return cat;
650
4.58k
        }
651
4.88k
        return NULL;
652
9.46k
}
653
654
5.04k
static Renode *parsealt(Restate *g) {
655
5.04k
        Renode *alt, *x;
656
5.04k
        alt = parsecat(g);
657
9.46k
        while (re_accept(g, '|')) {
658
4.41k
                x      = alt;
659
4.41k
                alt    = newnode(g, P_ALT);
660
4.41k
                alt->x = x;
661
4.41k
                alt->y = parsecat(g);
662
4.41k
        }
663
5.04k
        return alt;
664
5.04k
}
665
666
/* Compile */
667
668
enum {
669
        I_END,
670
        I_JUMP,
671
        I_SPLIT,
672
        I_PLA,
673
        I_NLA,
674
        I_ANYNL,
675
        I_ANY,
676
        I_CHAR,
677
        I_CCLASS,
678
        I_NCCLASS,
679
        I_REF,
680
        I_BOL,
681
        I_EOL,
682
        I_WORD,
683
        I_NWORD,
684
        I_LPAR,
685
        I_RPAR
686
};
687
688
struct Reinst {
689
        unsigned char opcode;
690
        unsigned char n;
691
        Rune c;
692
        Reclass *cc;
693
        Reinst *x;
694
        Reinst *y;
695
};
696
697
425k
static unsigned int count(Renode *node) {
698
425k
        unsigned int min, max;
699
425k
        if (!node)
700
4.26k
                return 0;
701
421k
        switch (node->type) {
702
206k
        default:
703
206k
                return 1;
704
205k
        case P_CAT:
705
205k
                return count(node->x) + count(node->y);
706
3.92k
        case P_ALT:
707
3.92k
                return count(node->x) + count(node->y) + 2;
708
4.41k
        case P_REP:
709
4.41k
                min = node->m;
710
4.41k
                max = node->n;
711
4.41k
                if (min == max)
712
824
                        return count(node->x) * min;
713
3.59k
                if (max < REPINF)
714
926
                        return count(node->x) * max + (max - min);
715
2.66k
                return count(node->x) * (min + 1) + 2;
716
578
        case P_PAR:
717
578
                return count(node->x) + 2;
718
454
        case P_PLA:
719
454
                return count(node->x) + 2;
720
442
        case P_NLA:
721
442
                return count(node->x) + 2;
722
421k
        }
723
421k
}
724
725
403k
static Reinst *emit(Reprog *prog, int opcode) {
726
403k
        Reinst *inst = prog->end++;
727
403k
        inst->opcode = opcode;
728
403k
        inst->n      = 0;
729
403k
        inst->c      = 0;
730
403k
        inst->cc     = NULL;
731
403k
        inst->x = inst->y = NULL;
732
403k
        return inst;
733
403k
}
734
735
596k
static void compile(Reprog *prog, Renode *node) {
736
596k
        Reinst *inst, *split, *jump;
737
596k
        unsigned int i;
738
739
596k
        if (!node)
740
10.5k
                return;
741
742
585k
        switch (node->type) {
743
211k
        case P_CAT:
744
211k
                compile(prog, node->x);
745
211k
                compile(prog, node->y);
746
211k
                break;
747
748
6.90k
        case P_ALT:
749
6.90k
                split = emit(prog, I_SPLIT);
750
6.90k
                compile(prog, node->x);
751
6.90k
                jump = emit(prog, I_JUMP);
752
6.90k
                compile(prog, node->y);
753
6.90k
                split->x = split + 1;
754
6.90k
                split->y = jump + 1;
755
6.90k
                jump->x  = prog->end;
756
6.90k
                break;
757
758
8.18k
        case P_REP:
759
145k
                for (i = 0; i < node->m; ++i) {
760
137k
                        inst = prog->end;
761
137k
                        compile(prog, node->x);
762
137k
                }
763
8.18k
                if (node->m == node->n)
764
2.27k
                        break;
765
5.91k
                if (node->n < REPINF) {
766
9.91k
                        for (i = node->m; i < node->n; ++i) {
767
8.46k
                                split = emit(prog, I_SPLIT);
768
8.46k
                                compile(prog, node->x);
769
8.46k
                                if (node->ng) {
770
433
                                        split->y = split + 1;
771
433
                                        split->x = prog->end;
772
8.02k
                                } else {
773
8.02k
                                        split->x = split + 1;
774
8.02k
                                        split->y = prog->end;
775
8.02k
                                }
776
8.46k
                        }
777
4.45k
                } else if (node->m == 0) {
778
2.26k
                        split = emit(prog, I_SPLIT);
779
2.26k
                        compile(prog, node->x);
780
2.26k
                        jump = emit(prog, I_JUMP);
781
2.26k
                        if (node->ng) {
782
943
                                split->y = split + 1;
783
943
                                split->x = prog->end;
784
1.31k
                        } else {
785
1.31k
                                split->x = split + 1;
786
1.31k
                                split->y = prog->end;
787
1.31k
                        }
788
2.26k
                        jump->x = split;
789
2.26k
                } else {
790
2.19k
                        split = emit(prog, I_SPLIT);
791
2.19k
                        if (node->ng) {
792
552
                                split->y = inst;
793
552
                                split->x = prog->end;
794
1.64k
                        } else {
795
1.64k
                                split->x = inst;
796
1.64k
                                split->y = prog->end;
797
1.64k
                        }
798
2.19k
                }
799
5.91k
                break;
800
801
25.5k
        case P_BOL:
802
25.5k
                emit(prog, I_BOL);
803
25.5k
                break;
804
70.3k
        case P_EOL:
805
70.3k
                emit(prog, I_EOL);
806
70.3k
                break;
807
564
        case P_WORD:
808
564
                emit(prog, I_WORD);
809
564
                break;
810
727
        case P_NWORD:
811
727
                emit(prog, I_NWORD);
812
727
                break;
813
814
8.34k
        case P_PAR:
815
8.34k
                inst    = emit(prog, I_LPAR);
816
8.34k
                inst->n = node->n;
817
8.34k
                compile(prog, node->x);
818
8.34k
                inst    = emit(prog, I_RPAR);
819
8.34k
                inst->n = node->n;
820
8.34k
                break;
821
760
        case P_PLA:
822
760
                split = emit(prog, I_PLA);
823
760
                compile(prog, node->x);
824
760
                emit(prog, I_END);
825
760
                split->x = split + 1;
826
760
                split->y = prog->end;
827
760
                break;
828
663
        case P_NLA:
829
663
                split = emit(prog, I_NLA);
830
663
                compile(prog, node->x);
831
663
                emit(prog, I_END);
832
663
                split->x = split + 1;
833
663
                split->y = prog->end;
834
663
                break;
835
836
4.45k
        case P_ANY:
837
4.45k
                emit(prog, I_ANY);
838
4.45k
                break;
839
245k
        case P_CHAR:
840
245k
                inst    = emit(prog, I_CHAR);
841
245k
                inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
842
245k
                break;
843
716
        case P_CCLASS:
844
716
                inst     = emit(prog, I_CCLASS);
845
716
                inst->cc = node->cc;
846
716
                break;
847
567
        case P_NCCLASS:
848
567
                inst     = emit(prog, I_NCCLASS);
849
567
                inst->cc = node->cc;
850
567
                break;
851
675
        case P_REF:
852
675
                inst    = emit(prog, I_REF);
853
675
                inst->n = node->n;
854
675
                break;
855
585k
        }
856
585k
}
857
858
#ifdef TEST
859
static void dumpnode(Renode *node) {
860
        Rune *p;
861
        if (!node) {
862
                printf("Empty");
863
                return;
864
        }
865
        switch (node->type) {
866
        case P_CAT:
867
                printf("Cat(");
868
                dumpnode(node->x);
869
                printf(", ");
870
                dumpnode(node->y);
871
                printf(")");
872
                break;
873
        case P_ALT:
874
                printf("Alt(");
875
                dumpnode(node->x);
876
                printf(", ");
877
                dumpnode(node->y);
878
                printf(")");
879
                break;
880
        case P_REP:
881
                printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m,
882
                       node->n);
883
                dumpnode(node->x);
884
                printf(")");
885
                break;
886
        case P_BOL:
887
                printf("Bol");
888
                break;
889
        case P_EOL:
890
                printf("Eol");
891
                break;
892
        case P_WORD:
893
                printf("Word");
894
                break;
895
        case P_NWORD:
896
                printf("NotWord");
897
                break;
898
        case P_PAR:
899
                printf("Par(%d,", node->n);
900
                dumpnode(node->x);
901
                printf(")");
902
                break;
903
        case P_PLA:
904
                printf("PLA(");
905
                dumpnode(node->x);
906
                printf(")");
907
                break;
908
        case P_NLA:
909
                printf("NLA(");
910
                dumpnode(node->x);
911
                printf(")");
912
                break;
913
        case P_ANY:
914
                printf("Any");
915
                break;
916
        case P_CHAR:
917
                printf("Char(%c)", node->c);
918
                break;
919
        case P_CCLASS:
920
                printf("Class(");
921
                for (p = node->cc->spans; p < node->cc->end; p += 2)
922
                        printf("%02X-%02X,", p[0], p[1]);
923
                printf(")");
924
                break;
925
        case P_NCCLASS:
926
                printf("NotClass(");
927
                for (p = node->cc->spans; p < node->cc->end; p += 2)
928
                        printf("%02X-%02X,", p[0], p[1]);
929
                printf(")");
930
                break;
931
        case P_REF:
932
                printf("Ref(%d)", node->n);
933
                break;
934
        }
935
}
936
937
static void dumpprog(Reprog *prog) {
938
        Reinst *inst;
939
        int i;
940
        for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
941
                printf("% 5d: ", i);
942
                switch (inst->opcode) {
943
                case I_END:
944
                        puts("end");
945
                        break;
946
                case I_JUMP:
947
                        printf("jump %d\n", (int)(inst->x - prog->start));
948
                        break;
949
                case I_SPLIT:
950
                        printf("split %d %d\n", (int)(inst->x - prog->start),
951
                               (int)(inst->y - prog->start));
952
                        break;
953
                case I_PLA:
954
                        printf("pla %d %d\n", (int)(inst->x - prog->start),
955
                               (int)(inst->y - prog->start));
956
                        break;
957
                case I_NLA:
958
                        printf("nla %d %d\n", (int)(inst->x - prog->start),
959
                               (int)(inst->y - prog->start));
960
                        break;
961
                case I_ANY:
962
                        puts("any");
963
                        break;
964
                case I_ANYNL:
965
                        puts("anynl");
966
                        break;
967
                case I_CHAR:
968
                        printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n"
969
                                                              : "char U+%04X\n",
970
                               inst->c);
971
                        break;
972
                case I_CCLASS:
973
                        puts("cclass");
974
                        break;
975
                case I_NCCLASS:
976
                        puts("ncclass");
977
                        break;
978
                case I_REF:
979
                        printf("ref %d\n", inst->n);
980
                        break;
981
                case I_BOL:
982
                        puts("bol");
983
                        break;
984
                case I_EOL:
985
                        puts("eol");
986
                        break;
987
                case I_WORD:
988
                        puts("word");
989
                        break;
990
                case I_NWORD:
991
                        puts("nword");
992
                        break;
993
                case I_LPAR:
994
                        printf("lpar %d\n", inst->n);
995
                        break;
996
                case I_RPAR:
997
                        printf("rpar %d\n", inst->n);
998
                        break;
999
                }
1000
        }
1001
}
1002
#endif
1003
1004
2.14k
Reprog *re_regcomp(const char *pattern, int cflags, const char **errorp) {
1005
2.14k
        Reprog *prog;
1006
2.14k
        Restate *g;
1007
2.14k
        Renode *node;
1008
2.14k
        Reinst *split, *jump;
1009
2.14k
        int i;
1010
2.14k
        unsigned int ncount;
1011
2.14k
        size_t pattern_len = strlen(pattern);
1012
1013
2.14k
        if (pattern_len > 10000) {
1014
                /* Avoid stack exhaustion in recursive parseatom() et.al. */
1015
11
                if (errorp)
1016
11
                        *errorp = "regexp pattern too long (max 10000)";
1017
11
                return NULL;
1018
11
        }
1019
1020
2.13k
        prog      = rd_calloc(1, sizeof(Reprog));
1021
2.13k
        g         = &prog->g;
1022
2.13k
        g->prog   = prog;
1023
2.13k
        g->pstart = g->pend = rd_malloc(sizeof(Renode) * pattern_len * 2);
1024
1025
2.13k
        if (setjmp(g->kaboom)) {
1026
1.14k
                if (errorp)
1027
1.14k
                        *errorp = g->error;
1028
1.14k
                rd_free(g->pstart);
1029
1.14k
                rd_free(prog);
1030
1.14k
                return NULL;
1031
1.14k
        }
1032
1033
988
        g->source  = pattern;
1034
988
        g->ncclass = 0;
1035
988
        g->nsub    = 1;
1036
35.1k
        for (i = 0; i < MAXSUB; ++i)
1037
34.1k
                g->sub[i] = 0;
1038
1039
988
        g->prog->flags = cflags;
1040
1041
988
        next(g);
1042
988
        node = parsealt(g);
1043
988
        if (g->lookahead == ')')
1044
19
                die(g, "unmatched ')'");
1045
988
        if (g->lookahead != 0)
1046
0
                die(g, "syntax error");
1047
1048
988
        g->prog->nsub = g->nsub;
1049
988
        ncount        = count(node);
1050
988
        if (ncount > 10000)
1051
44
                die(g, "regexp graph too large");
1052
988
        g->prog->start = g->prog->end =
1053
988
            rd_malloc((ncount + 6) * sizeof(Reinst));
1054
1055
988
        split    = emit(g->prog, I_SPLIT);
1056
988
        split->x = split + 3;
1057
988
        split->y = split + 1;
1058
988
        emit(g->prog, I_ANYNL);
1059
988
        jump    = emit(g->prog, I_JUMP);
1060
988
        jump->x = split;
1061
988
        emit(g->prog, I_LPAR);
1062
988
        compile(g->prog, node);
1063
988
        emit(g->prog, I_RPAR);
1064
988
        emit(g->prog, I_END);
1065
1066
#ifdef TEST
1067
        dumpnode(node);
1068
        putchar('\n');
1069
        dumpprog(g->prog);
1070
#endif
1071
1072
988
        rd_free(g->pstart);
1073
1074
988
        if (errorp)
1075
988
                *errorp = NULL;
1076
988
        return g->prog;
1077
2.13k
}
1078
1079
988
void re_regfree(Reprog *prog) {
1080
988
        if (prog) {
1081
988
                rd_free(prog->start);
1082
988
                rd_free(prog);
1083
988
        }
1084
988
}
1085
1086
/* Match */
1087
1088
0
static int isnewline(int c) {
1089
0
        return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
1090
0
}
1091
1092
0
static int iswordchar(int c) {
1093
0
        return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
1094
0
               (c >= '0' && c <= '9');
1095
0
}
1096
1097
0
static int incclass(Reclass *cc, Rune c) {
1098
0
        Rune *p;
1099
0
        for (p = cc->spans; p < cc->end; p += 2)
1100
0
                if (p[0] <= c && c <= p[1])
1101
0
                        return 1;
1102
0
        return 0;
1103
0
}
1104
1105
0
static int incclasscanon(Reclass *cc, Rune c) {
1106
0
        Rune *p, r;
1107
0
        for (p = cc->spans; p < cc->end; p += 2)
1108
0
                for (r = p[0]; r <= p[1]; ++r)
1109
0
                        if (c == canon(r))
1110
0
                                return 1;
1111
0
        return 0;
1112
0
}
1113
1114
0
static int strncmpcanon(const char *a, const char *b, unsigned int n) {
1115
0
        Rune ra, rb;
1116
0
        int c;
1117
0
        while (n--) {
1118
0
                if (!*a)
1119
0
                        return -1;
1120
0
                if (!*b)
1121
0
                        return 1;
1122
0
                a += chartorune(&ra, a);
1123
0
                b += chartorune(&rb, b);
1124
0
                c = canon(ra) - canon(rb);
1125
0
                if (c)
1126
0
                        return c;
1127
0
        }
1128
0
        return 0;
1129
0
}
1130
1131
struct Rethread {
1132
        Reinst *pc;
1133
        const char *sp;
1134
        Resub sub;
1135
};
1136
1137
0
static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub) {
1138
0
        t->pc = pc;
1139
0
        t->sp = sp;
1140
0
        memcpy(&t->sub, sub, sizeof t->sub);
1141
0
}
1142
1143
static int
1144
0
match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out) {
1145
0
        Rethread ready[MAXTHREAD];
1146
0
        Resub scratch;
1147
0
        Resub sub;
1148
0
        Rune c;
1149
0
        unsigned int nready;
1150
0
        int i;
1151
1152
        /* queue initial thread */
1153
0
        spawn(ready + 0, pc, sp, out);
1154
0
        nready = 1;
1155
1156
        /* run threads in stack order */
1157
0
        while (nready > 0) {
1158
0
                --nready;
1159
0
                pc = ready[nready].pc;
1160
0
                sp = ready[nready].sp;
1161
0
                memcpy(&sub, &ready[nready].sub, sizeof sub);
1162
0
                for (;;) {
1163
0
                        switch (pc->opcode) {
1164
0
                        case I_END:
1165
0
                                for (i = 0; i < MAXSUB; ++i) {
1166
0
                                        out->sub[i].sp = sub.sub[i].sp;
1167
0
                                        out->sub[i].ep = sub.sub[i].ep;
1168
0
                                }
1169
0
                                return 1;
1170
0
                        case I_JUMP:
1171
0
                                pc = pc->x;
1172
0
                                continue;
1173
0
                        case I_SPLIT:
1174
0
                                if (nready >= MAXTHREAD) {
1175
0
                                        fprintf(
1176
0
                                            stderr,
1177
0
                                            "regexec: backtrack overflow!\n");
1178
0
                                        return 0;
1179
0
                                }
1180
0
                                spawn(&ready[nready++], pc->y, sp, &sub);
1181
0
                                pc = pc->x;
1182
0
                                continue;
1183
1184
0
                        case I_PLA:
1185
0
                                if (!match(pc->x, sp, bol, flags, &sub))
1186
0
                                        goto dead;
1187
0
                                pc = pc->y;
1188
0
                                continue;
1189
0
                        case I_NLA:
1190
0
                                memcpy(&scratch, &sub, sizeof scratch);
1191
0
                                if (match(pc->x, sp, bol, flags, &scratch))
1192
0
                                        goto dead;
1193
0
                                pc = pc->y;
1194
0
                                continue;
1195
1196
0
                        case I_ANYNL:
1197
0
                                sp += chartorune(&c, sp);
1198
0
                                if (c == 0)
1199
0
                                        goto dead;
1200
0
                                break;
1201
0
                        case I_ANY:
1202
0
                                sp += chartorune(&c, sp);
1203
0
                                if (c == 0)
1204
0
                                        goto dead;
1205
0
                                if (isnewline(c))
1206
0
                                        goto dead;
1207
0
                                break;
1208
0
                        case I_CHAR:
1209
0
                                sp += chartorune(&c, sp);
1210
0
                                if (c == 0)
1211
0
                                        goto dead;
1212
0
                                if (flags & REG_ICASE)
1213
0
                                        c = canon(c);
1214
0
                                if (c != pc->c)
1215
0
                                        goto dead;
1216
0
                                break;
1217
0
                        case I_CCLASS:
1218
0
                                sp += chartorune(&c, sp);
1219
0
                                if (c == 0)
1220
0
                                        goto dead;
1221
0
                                if (flags & REG_ICASE) {
1222
0
                                        if (!incclasscanon(pc->cc, canon(c)))
1223
0
                                                goto dead;
1224
0
                                } else {
1225
0
                                        if (!incclass(pc->cc, c))
1226
0
                                                goto dead;
1227
0
                                }
1228
0
                                break;
1229
0
                        case I_NCCLASS:
1230
0
                                sp += chartorune(&c, sp);
1231
0
                                if (c == 0)
1232
0
                                        goto dead;
1233
0
                                if (flags & REG_ICASE) {
1234
0
                                        if (incclasscanon(pc->cc, canon(c)))
1235
0
                                                goto dead;
1236
0
                                } else {
1237
0
                                        if (incclass(pc->cc, c))
1238
0
                                                goto dead;
1239
0
                                }
1240
0
                                break;
1241
0
                        case I_REF:
1242
0
                                i = (int)(sub.sub[pc->n].ep -
1243
0
                                          sub.sub[pc->n].sp);
1244
0
                                if (flags & REG_ICASE) {
1245
0
                                        if (strncmpcanon(sp, sub.sub[pc->n].sp,
1246
0
                                                         i))
1247
0
                                                goto dead;
1248
0
                                } else {
1249
0
                                        if (strncmp(sp, sub.sub[pc->n].sp, i))
1250
0
                                                goto dead;
1251
0
                                }
1252
0
                                if (i > 0)
1253
0
                                        sp += i;
1254
0
                                break;
1255
1256
0
                        case I_BOL:
1257
0
                                if (sp == bol && !(flags & REG_NOTBOL))
1258
0
                                        break;
1259
0
                                if (flags & REG_NEWLINE)
1260
0
                                        if (sp > bol && isnewline(sp[-1]))
1261
0
                                                break;
1262
0
                                goto dead;
1263
0
                        case I_EOL:
1264
0
                                if (*sp == 0)
1265
0
                                        break;
1266
0
                                if (flags & REG_NEWLINE)
1267
0
                                        if (isnewline(*sp))
1268
0
                                                break;
1269
0
                                goto dead;
1270
0
                        case I_WORD:
1271
0
                                i = sp > bol && iswordchar(sp[-1]);
1272
0
                                i ^= iswordchar(sp[0]);
1273
0
                                if (i)
1274
0
                                        break;
1275
0
                                goto dead;
1276
0
                        case I_NWORD:
1277
0
                                i = sp > bol && iswordchar(sp[-1]);
1278
0
                                i ^= iswordchar(sp[0]);
1279
0
                                if (!i)
1280
0
                                        break;
1281
0
                                goto dead;
1282
1283
0
                        case I_LPAR:
1284
0
                                sub.sub[pc->n].sp = sp;
1285
0
                                break;
1286
0
                        case I_RPAR:
1287
0
                                sub.sub[pc->n].ep = sp;
1288
0
                                break;
1289
0
                        default:
1290
0
                                goto dead;
1291
0
                        }
1292
0
                        pc = pc + 1;
1293
0
                }
1294
0
        dead:;
1295
0
        }
1296
0
        return 0;
1297
0
}
1298
1299
0
int re_regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) {
1300
0
        Resub scratch;
1301
0
        int i;
1302
1303
0
        if (!sub)
1304
0
                sub = &scratch;
1305
1306
0
        sub->nsub = prog->nsub;
1307
0
        for (i = 0; i < MAXSUB; ++i)
1308
0
                sub->sub[i].sp = sub->sub[i].ep = NULL;
1309
1310
0
        return !match(prog->start, sp, sp, prog->flags | eflags, sub);
1311
0
}
1312
1313
#ifdef TEST
1314
int main(int argc, char **argv) {
1315
        const char *error;
1316
        const char *s;
1317
        Reprog *p;
1318
        Resub m;
1319
        unsigned int i;
1320
1321
        if (argc > 1) {
1322
                p = regcomp(argv[1], 0, &error);
1323
                if (!p) {
1324
                        fprintf(stderr, "regcomp: %s\n", error);
1325
                        return 1;
1326
                }
1327
1328
                if (argc > 2) {
1329
                        s = argv[2];
1330
                        printf("nsub = %d\n", p->nsub);
1331
                        if (!regexec(p, s, &m, 0)) {
1332
                                for (i = 0; i < m.nsub; ++i) {
1333
                                        int n = m.sub[i].ep - m.sub[i].sp;
1334
                                        if (n > 0)
1335
                                                printf(
1336
                                                    "match %d: s=%d e=%d n=%d "
1337
                                                    "'%.*s'\n",
1338
                                                    i, (int)(m.sub[i].sp - s),
1339
                                                    (int)(m.sub[i].ep - s), n,
1340
                                                    n, m.sub[i].sp);
1341
                                        else
1342
                                                printf("match %d: n=0 ''\n", i);
1343
                                }
1344
                        } else {
1345
                                printf("no match\n");
1346
                        }
1347
                }
1348
        }
1349
1350
        return 0;
1351
}
1352
#endif