Coverage Report

Created: 2025-01-28 06:17

/src/mupdf/thirdparty/mujs/regexp.c
Line
Count
Source (jump to first uncovered line)
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include <string.h>
4
#include <setjmp.h>
5
#include <limits.h>
6
7
#include "regexp.h"
8
#include "utf.h"
9
10
0
#define emit regemit
11
0
#define next regnext
12
0
#define accept regaccept
13
14
0
#define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15
16
0
#define REPINF 255
17
#ifndef REG_MAXPROG
18
0
#define REG_MAXPROG (32 << 10)
19
#endif
20
#ifndef REG_MAXREC
21
0
#define REG_MAXREC 1024
22
#endif
23
#ifndef REG_MAXSPAN
24
#define REG_MAXSPAN 64
25
#endif
26
#ifndef REG_MAXCLASS
27
0
#define REG_MAXCLASS 128
28
#endif
29
30
typedef struct Reclass Reclass;
31
typedef struct Renode Renode;
32
typedef struct Reinst Reinst;
33
typedef struct Rethread Rethread;
34
35
struct Reclass {
36
  Rune *end;
37
  Rune spans[REG_MAXSPAN];
38
};
39
40
struct Reprog {
41
  Reinst *start, *end;
42
  Reclass *cclass;
43
  int flags;
44
  int nsub;
45
};
46
47
struct cstate {
48
  Reprog *prog;
49
  Renode *pstart, *pend;
50
51
  const char *source;
52
  int ncclass;
53
  int nsub;
54
  Renode *sub[REG_MAXSUB];
55
56
  int lookahead;
57
  Rune yychar;
58
  Reclass *yycc;
59
  int yymin, yymax;
60
61
  const char *error;
62
  jmp_buf kaboom;
63
64
  Reclass cclass[REG_MAXCLASS];
65
};
66
67
static void die(struct cstate *g, const char *message)
68
0
{
69
0
  g->error = message;
70
0
  longjmp(g->kaboom, 1);
71
0
}
72
73
static int canon(Rune c)
74
0
{
75
0
  Rune u = toupperrune(c);
76
0
  if (c >= 128 && u < 128)
77
0
    return c;
78
0
  return u;
79
0
}
80
81
/* Scan */
82
83
enum {
84
  L_CHAR = 256,
85
  L_CCLASS, /* character class */
86
  L_NCCLASS,  /* negative character class */
87
  L_NC,   /* "(?:" no capture */
88
  L_PLA,    /* "(?=" positive lookahead */
89
  L_NLA,    /* "(?!" negative lookahead */
90
  L_WORD,   /* "\b" word boundary */
91
  L_NWORD,  /* "\B" non-word boundary */
92
  L_REF,    /* "\1" back-reference */
93
  L_COUNT,  /* {M,N} */
94
};
95
96
static int hex(struct cstate *g, int c)
97
0
{
98
0
  if (c >= '0' && c <= '9') return c - '0';
99
0
  if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
100
0
  if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
101
0
  die(g, "invalid escape sequence");
102
0
  return 0;
103
0
}
104
105
static int dec(struct cstate *g, int c)
106
0
{
107
0
  if (c >= '0' && c <= '9') return c - '0';
108
0
  die(g, "invalid quantifier");
109
0
  return 0;
110
0
}
111
112
0
#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789"
113
114
static int isunicodeletter(int c)
115
0
{
116
0
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
117
0
}
118
119
static int nextrune(struct cstate *g)
120
0
{
121
0
  if (!*g->source) {
122
0
    g->yychar = EOF;
123
0
    return 0;
124
0
  }
125
0
  g->source += chartorune(&g->yychar, g->source);
126
0
  if (g->yychar == '\\') {
127
0
    if (!*g->source)
128
0
      die(g, "unterminated escape sequence");
129
0
    g->source += chartorune(&g->yychar, g->source);
130
0
    switch (g->yychar) {
131
0
    case 'f': g->yychar = '\f'; return 0;
132
0
    case 'n': g->yychar = '\n'; return 0;
133
0
    case 'r': g->yychar = '\r'; return 0;
134
0
    case 't': g->yychar = '\t'; return 0;
135
0
    case 'v': g->yychar = '\v'; return 0;
136
0
    case 'c':
137
0
      if (!g->source[0])
138
0
        die(g, "unterminated escape sequence");
139
0
      g->yychar = (*g->source++) & 31;
140
0
      return 0;
141
0
    case 'x':
142
0
      if (!g->source[0] || !g->source[1])
143
0
        die(g, "unterminated escape sequence");
144
0
      g->yychar = hex(g, *g->source++) << 4;
145
0
      g->yychar += hex(g, *g->source++);
146
0
      if (g->yychar == 0) {
147
0
        g->yychar = '0';
148
0
        return 1;
149
0
      }
150
0
      return 0;
151
0
    case 'u':
152
0
      if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3])
153
0
        die(g, "unterminated escape sequence");
154
0
      g->yychar = hex(g, *g->source++) << 12;
155
0
      g->yychar += hex(g, *g->source++) << 8;
156
0
      g->yychar += hex(g, *g->source++) << 4;
157
0
      g->yychar += hex(g, *g->source++);
158
0
      if (g->yychar == 0) {
159
0
        g->yychar = '0';
160
0
        return 1;
161
0
      }
162
0
      return 0;
163
0
    case 0:
164
0
      g->yychar = '0';
165
0
      return 1;
166
0
    }
167
0
    if (strchr(ESCAPES, g->yychar))
168
0
      return 1;
169
0
    if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
170
0
      die(g, "invalid escape character");
171
0
    return 0;
172
0
  }
173
0
  return 0;
174
0
}
175
176
static int lexcount(struct cstate *g)
177
0
{
178
0
  g->yychar = *g->source++;
179
180
0
  g->yymin = dec(g, g->yychar);
181
0
  g->yychar = *g->source++;
182
0
  while (g->yychar != ',' && g->yychar != '}') {
183
0
    g->yymin = g->yymin * 10 + dec(g, g->yychar);
184
0
    g->yychar = *g->source++;
185
0
    if (g->yymin >= REPINF)
186
0
      die(g, "numeric overflow");
187
0
  }
188
189
0
  if (g->yychar == ',') {
190
0
    g->yychar = *g->source++;
191
0
    if (g->yychar == '}') {
192
0
      g->yymax = REPINF;
193
0
    } else {
194
0
      g->yymax = dec(g, g->yychar);
195
0
      g->yychar = *g->source++;
196
0
      while (g->yychar != '}') {
197
0
        g->yymax = g->yymax * 10 + dec(g, g->yychar);
198
0
        g->yychar = *g->source++;
199
0
        if (g->yymax >= REPINF)
200
0
          die(g, "numeric overflow");
201
0
      }
202
0
    }
203
0
  } else {
204
0
    g->yymax = g->yymin;
205
0
  }
206
207
0
  return L_COUNT;
208
0
}
209
210
static void newcclass(struct cstate *g)
211
0
{
212
0
  if (g->ncclass >= REG_MAXCLASS)
213
0
    die(g, "too many character classes");
214
0
  g->yycc = g->cclass + g->ncclass++;
215
0
  g->yycc->end = g->yycc->spans;
216
0
}
217
218
static void addrange(struct cstate *g, Rune a, Rune b)
219
0
{
220
0
  Reclass *cc = g->yycc;
221
0
  Rune *p;
222
223
0
  if (a > b)
224
0
    die(g, "invalid character class range");
225
226
  /* extend existing ranges if they overlap */
227
0
  for (p = cc->spans; p < cc->end; p += 2) {
228
    /* completely inside old range */
229
0
    if (a >= p[0] && b <= p[1])
230
0
      return;
231
232
    /* completely swallows old range */
233
0
    if (a < p[0] && b >= p[1]) {
234
0
      p[0] = a;
235
0
      p[1] = b;
236
0
      return;
237
0
    }
238
239
    /* extend at start */
240
0
    if (b >= p[0] - 1 && b <= p[1] && a < p[0]) {
241
0
      p[0] = a;
242
0
      return;
243
0
    }
244
245
    /* extend at end */
246
0
    if (a >= p[0] && a <= p[1] + 1 && b > p[1]) {
247
0
      p[1] = b;
248
0
      return;
249
0
    }
250
0
  }
251
252
0
  if (cc->end + 2 >= cc->spans + nelem(cc->spans))
253
0
    die(g, "too many character class ranges");
254
0
  *cc->end++ = a;
255
0
  *cc->end++ = b;
256
0
}
257
258
static void addranges_d(struct cstate *g)
259
0
{
260
0
  addrange(g, '0', '9');
261
0
}
262
263
static void addranges_D(struct cstate *g)
264
0
{
265
0
  addrange(g, 0, '0'-1);
266
0
  addrange(g, '9'+1, 0xFFFF);
267
0
}
268
269
static void addranges_s(struct cstate *g)
270
0
{
271
0
  addrange(g, 0x9, 0xD);
272
0
  addrange(g, 0x20, 0x20);
273
0
  addrange(g, 0xA0, 0xA0);
274
0
  addrange(g, 0x2028, 0x2029);
275
0
  addrange(g, 0xFEFF, 0xFEFF);
276
0
}
277
278
static void addranges_S(struct cstate *g)
279
0
{
280
0
  addrange(g, 0, 0x9-1);
281
0
  addrange(g, 0xD+1, 0x20-1);
282
0
  addrange(g, 0x20+1, 0xA0-1);
283
0
  addrange(g, 0xA0+1, 0x2028-1);
284
0
  addrange(g, 0x2029+1, 0xFEFF-1);
285
0
  addrange(g, 0xFEFF+1, 0xFFFF);
286
0
}
287
288
static void addranges_w(struct cstate *g)
289
0
{
290
0
  addrange(g, '0', '9');
291
0
  addrange(g, 'A', 'Z');
292
0
  addrange(g, '_', '_');
293
0
  addrange(g, 'a', 'z');
294
0
}
295
296
static void addranges_W(struct cstate *g)
297
0
{
298
0
  addrange(g, 0, '0'-1);
299
0
  addrange(g, '9'+1, 'A'-1);
300
0
  addrange(g, 'Z'+1, '_'-1);
301
0
  addrange(g, '_'+1, 'a'-1);
302
0
  addrange(g, 'z'+1, 0xFFFF);
303
0
}
304
305
static int lexclass(struct cstate *g)
306
0
{
307
0
  int type = L_CCLASS;
308
0
  int quoted, havesave, havedash;
309
0
  Rune save = 0;
310
311
0
  newcclass(g);
312
313
0
  quoted = nextrune(g);
314
0
  if (!quoted && g->yychar == '^') {
315
0
    type = L_NCCLASS;
316
0
    quoted = nextrune(g);
317
0
  }
318
319
0
  havesave = havedash = 0;
320
0
  for (;;) {
321
0
    if (g->yychar == EOF)
322
0
      die(g, "unterminated character class");
323
0
    if (!quoted && g->yychar == ']')
324
0
      break;
325
326
0
    if (!quoted && g->yychar == '-') {
327
0
      if (havesave) {
328
0
        if (havedash) {
329
0
          addrange(g, save, '-');
330
0
          havesave = havedash = 0;
331
0
        } else {
332
0
          havedash = 1;
333
0
        }
334
0
      } else {
335
0
        save = '-';
336
0
        havesave = 1;
337
0
      }
338
0
    } else if (quoted && strchr("DSWdsw", g->yychar)) {
339
0
      if (havesave) {
340
0
        addrange(g, save, save);
341
0
        if (havedash)
342
0
          addrange(g, '-', '-');
343
0
      }
344
0
      switch (g->yychar) {
345
0
      case 'd': addranges_d(g); break;
346
0
      case 's': addranges_s(g); break;
347
0
      case 'w': addranges_w(g); break;
348
0
      case 'D': addranges_D(g); break;
349
0
      case 'S': addranges_S(g); break;
350
0
      case 'W': addranges_W(g); break;
351
0
      }
352
0
      havesave = havedash = 0;
353
0
    } else {
354
0
      if (quoted) {
355
0
        if (g->yychar == 'b')
356
0
          g->yychar = '\b';
357
0
        else if (g->yychar == '0')
358
0
          g->yychar = 0;
359
        /* else identity escape */
360
0
      }
361
0
      if (havesave) {
362
0
        if (havedash) {
363
0
          addrange(g, save, g->yychar);
364
0
          havesave = havedash = 0;
365
0
        } else {
366
0
          addrange(g, save, save);
367
0
          save = g->yychar;
368
0
        }
369
0
      } else {
370
0
        save = g->yychar;
371
0
        havesave = 1;
372
0
      }
373
0
    }
374
375
0
    quoted = nextrune(g);
376
0
  }
377
378
0
  if (havesave) {
379
0
    addrange(g, save, save);
380
0
    if (havedash)
381
0
      addrange(g, '-', '-');
382
0
  }
383
384
0
  return type;
385
0
}
386
387
static int lex(struct cstate *g)
388
0
{
389
0
  int quoted = nextrune(g);
390
0
  if (quoted) {
391
0
    switch (g->yychar) {
392
0
    case 'b': return L_WORD;
393
0
    case 'B': return L_NWORD;
394
0
    case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
395
0
    case 's': newcclass(g); addranges_s(g); return L_CCLASS;
396
0
    case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
397
0
    case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
398
0
    case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
399
0
    case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
400
0
    case '0': g->yychar = 0; return L_CHAR;
401
0
    }
402
0
    if (g->yychar >= '0' && g->yychar <= '9') {
403
0
      g->yychar -= '0';
404
0
      if (*g->source >= '0' && *g->source <= '9')
405
0
        g->yychar = g->yychar * 10 + *g->source++ - '0';
406
0
      return L_REF;
407
0
    }
408
0
    return L_CHAR;
409
0
  }
410
411
0
  switch (g->yychar) {
412
0
  case EOF:
413
0
  case '$': case ')': case '*': case '+':
414
0
  case '.': case '?': case '^': case '|':
415
0
    return g->yychar;
416
0
  }
417
418
0
  if (g->yychar == '{')
419
0
    return lexcount(g);
420
0
  if (g->yychar == '[')
421
0
    return lexclass(g);
422
0
  if (g->yychar == '(') {
423
0
    if (g->source[0] == '?') {
424
0
      if (g->source[1] == ':') {
425
0
        g->source += 2;
426
0
        return L_NC;
427
0
      }
428
0
      if (g->source[1] == '=') {
429
0
        g->source += 2;
430
0
        return L_PLA;
431
0
      }
432
0
      if (g->source[1] == '!') {
433
0
        g->source += 2;
434
0
        return L_NLA;
435
0
      }
436
0
    }
437
0
    return '(';
438
0
  }
439
440
0
  return L_CHAR;
441
0
}
442
443
/* Parse */
444
445
enum {
446
  P_CAT, P_ALT, P_REP,
447
  P_BOL, P_EOL, P_WORD, P_NWORD,
448
  P_PAR, P_PLA, P_NLA,
449
  P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
450
  P_REF,
451
};
452
453
struct Renode {
454
  unsigned char type;
455
  unsigned char ng, m, n;
456
  Rune c;
457
  int cc;
458
  Renode *x;
459
  Renode *y;
460
};
461
462
static Renode *newnode(struct cstate *g, int type)
463
0
{
464
0
  Renode *node = g->pend++;
465
0
  node->type = type;
466
0
  node->cc = -1;
467
0
  node->c = 0;
468
0
  node->ng = 0;
469
0
  node->m = 0;
470
0
  node->n = 0;
471
0
  node->x = node->y = NULL;
472
0
  return node;
473
0
}
474
475
static int empty(Renode *node)
476
0
{
477
0
  if (!node) return 1;
478
0
  switch (node->type) {
479
0
  default: return 1;
480
0
  case P_CAT: return empty(node->x) && empty(node->y);
481
0
  case P_ALT: return empty(node->x) || empty(node->y);
482
0
  case P_REP: return empty(node->x) || node->m == 0;
483
0
  case P_PAR: return empty(node->x);
484
0
  case P_REF: return empty(node->x);
485
0
  case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
486
0
  }
487
0
}
488
489
static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
490
0
{
491
0
  Renode *rep = newnode(g, P_REP);
492
0
  if (max == REPINF && empty(atom))
493
0
    die(g, "infinite loop matching the empty string");
494
0
  rep->ng = ng;
495
0
  rep->m = min;
496
0
  rep->n = max;
497
0
  rep->x = atom;
498
0
  return rep;
499
0
}
500
501
static void next(struct cstate *g)
502
0
{
503
0
  g->lookahead = lex(g);
504
0
}
505
506
static int accept(struct cstate *g, int t)
507
0
{
508
0
  if (g->lookahead == t) {
509
0
    next(g);
510
0
    return 1;
511
0
  }
512
0
  return 0;
513
0
}
514
515
static Renode *parsealt(struct cstate *g);
516
517
static Renode *parseatom(struct cstate *g)
518
0
{
519
0
  Renode *atom;
520
0
  if (g->lookahead == L_CHAR) {
521
0
    atom = newnode(g, P_CHAR);
522
0
    atom->c = g->yychar;
523
0
    next(g);
524
0
    return atom;
525
0
  }
526
0
  if (g->lookahead == L_CCLASS) {
527
0
    atom = newnode(g, P_CCLASS);
528
0
    atom->cc = g->yycc - g->cclass;
529
0
    next(g);
530
0
    return atom;
531
0
  }
532
0
  if (g->lookahead == L_NCCLASS) {
533
0
    atom = newnode(g, P_NCCLASS);
534
0
    atom->cc = g->yycc - g->cclass;
535
0
    next(g);
536
0
    return atom;
537
0
  }
538
0
  if (g->lookahead == L_REF) {
539
0
    atom = newnode(g, P_REF);
540
0
    if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
541
0
      die(g, "invalid back-reference");
542
0
    atom->n = g->yychar;
543
0
    atom->x = g->sub[g->yychar];
544
0
    next(g);
545
0
    return atom;
546
0
  }
547
0
  if (accept(g, '.'))
548
0
    return newnode(g, P_ANY);
549
0
  if (accept(g, '(')) {
550
0
    atom = newnode(g, P_PAR);
551
0
    if (g->nsub == REG_MAXSUB)
552
0
      die(g, "too many captures");
553
0
    atom->n = g->nsub++;
554
0
    atom->x = parsealt(g);
555
0
    g->sub[atom->n] = atom;
556
0
    if (!accept(g, ')'))
557
0
      die(g, "unmatched '('");
558
0
    return atom;
559
0
  }
560
0
  if (accept(g, L_NC)) {
561
0
    atom = parsealt(g);
562
0
    if (!accept(g, ')'))
563
0
      die(g, "unmatched '('");
564
0
    return atom;
565
0
  }
566
0
  if (accept(g, L_PLA)) {
567
0
    atom = newnode(g, P_PLA);
568
0
    atom->x = parsealt(g);
569
0
    if (!accept(g, ')'))
570
0
      die(g, "unmatched '('");
571
0
    return atom;
572
0
  }
573
0
  if (accept(g, L_NLA)) {
574
0
    atom = newnode(g, P_NLA);
575
0
    atom->x = parsealt(g);
576
0
    if (!accept(g, ')'))
577
0
      die(g, "unmatched '('");
578
0
    return atom;
579
0
  }
580
0
  die(g, "syntax error");
581
0
  return NULL;
582
0
}
583
584
static Renode *parserep(struct cstate *g)
585
0
{
586
0
  Renode *atom;
587
588
0
  if (accept(g, '^')) return newnode(g, P_BOL);
589
0
  if (accept(g, '$')) return newnode(g, P_EOL);
590
0
  if (accept(g, L_WORD)) return newnode(g, P_WORD);
591
0
  if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
592
593
0
  atom = parseatom(g);
594
0
  if (g->lookahead == L_COUNT) {
595
0
    int min = g->yymin, max = g->yymax;
596
0
    next(g);
597
0
    if (max < min)
598
0
      die(g, "invalid quantifier");
599
0
    return newrep(g, atom, accept(g, '?'), min, max);
600
0
  }
601
0
  if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
602
0
  if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
603
0
  if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
604
0
  return atom;
605
0
}
606
607
static Renode *parsecat(struct cstate *g)
608
0
{
609
0
  Renode *cat, *head, **tail;
610
0
  if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
611
    /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
612
0
    head = parserep(g);
613
0
    tail = &head;
614
0
    while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
615
0
      cat = newnode(g, P_CAT);
616
0
      cat->x = *tail;
617
0
      cat->y = parserep(g);
618
0
      *tail = cat;
619
0
      tail = &cat->y;
620
0
    }
621
0
    return head;
622
0
  }
623
0
  return NULL;
624
0
}
625
626
static Renode *parsealt(struct cstate *g)
627
0
{
628
0
  Renode *alt, *x;
629
0
  alt = parsecat(g);
630
0
  while (accept(g, '|')) {
631
0
    x = alt;
632
0
    alt = newnode(g, P_ALT);
633
0
    alt->x = x;
634
0
    alt->y = parsecat(g);
635
0
  }
636
0
  return alt;
637
0
}
638
639
/* Compile */
640
641
enum {
642
  I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
643
  I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
644
  I_BOL, I_EOL, I_WORD, I_NWORD,
645
  I_LPAR, I_RPAR
646
};
647
648
struct Reinst {
649
  unsigned char opcode;
650
  unsigned char n;
651
  Rune c;
652
  Reclass *cc;
653
  Reinst *x;
654
  Reinst *y;
655
};
656
657
static int count(struct cstate *g, Renode *node, int depth)
658
0
{
659
0
  int min, max, n;
660
0
  if (!node) return 0;
661
0
  if (++depth > REG_MAXREC) die(g, "stack overflow");
662
0
  switch (node->type) {
663
0
  default: return 1;
664
0
  case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth);
665
0
  case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2;
666
0
  case P_REP:
667
0
    min = node->m;
668
0
    max = node->n;
669
0
    if (min == max) n = count(g, node->x, depth) * min;
670
0
    else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min);
671
0
    else n = count(g, node->x, depth) * (min + 1) + 2;
672
0
    if (n < 0 || n > REG_MAXPROG) die(g, "program too large");
673
0
    return n;
674
0
  case P_PAR: return count(g, node->x, depth) + 2;
675
0
  case P_PLA: return count(g, node->x, depth) + 2;
676
0
  case P_NLA: return count(g, node->x, depth) + 2;
677
0
  }
678
0
}
679
680
static Reinst *emit(Reprog *prog, int opcode)
681
0
{
682
0
  Reinst *inst = prog->end++;
683
0
  inst->opcode = opcode;
684
0
  inst->n = 0;
685
0
  inst->c = 0;
686
0
  inst->cc = NULL;
687
0
  inst->x = inst->y = NULL;
688
0
  return inst;
689
0
}
690
691
static void compile(Reprog *prog, Renode *node)
692
0
{
693
0
  Reinst *inst, *split, *jump;
694
0
  int i;
695
696
0
loop:
697
0
  if (!node)
698
0
    return;
699
700
0
  switch (node->type) {
701
0
  case P_CAT:
702
0
    compile(prog, node->x);
703
0
    node = node->y;
704
0
    goto loop;
705
706
0
  case P_ALT:
707
0
    split = emit(prog, I_SPLIT);
708
0
    compile(prog, node->x);
709
0
    jump = emit(prog, I_JUMP);
710
0
    compile(prog, node->y);
711
0
    split->x = split + 1;
712
0
    split->y = jump + 1;
713
0
    jump->x = prog->end;
714
0
    break;
715
716
0
  case P_REP:
717
0
    inst = NULL; /* silence compiler warning. assert(node->m > 0). */
718
0
    for (i = 0; i < node->m; ++i) {
719
0
      inst = prog->end;
720
0
      compile(prog, node->x);
721
0
    }
722
0
    if (node->m == node->n)
723
0
      break;
724
0
    if (node->n < REPINF) {
725
0
      for (i = node->m; i < node->n; ++i) {
726
0
        split = emit(prog, I_SPLIT);
727
0
        compile(prog, node->x);
728
0
        if (node->ng) {
729
0
          split->y = split + 1;
730
0
          split->x = prog->end;
731
0
        } else {
732
0
          split->x = split + 1;
733
0
          split->y = prog->end;
734
0
        }
735
0
      }
736
0
    } else if (node->m == 0) {
737
0
      split = emit(prog, I_SPLIT);
738
0
      compile(prog, node->x);
739
0
      jump = emit(prog, I_JUMP);
740
0
      if (node->ng) {
741
0
        split->y = split + 1;
742
0
        split->x = prog->end;
743
0
      } else {
744
0
        split->x = split + 1;
745
0
        split->y = prog->end;
746
0
      }
747
0
      jump->x = split;
748
0
    } else {
749
0
      split = emit(prog, I_SPLIT);
750
0
      if (node->ng) {
751
0
        split->y = inst;
752
0
        split->x = prog->end;
753
0
      } else {
754
0
        split->x = inst;
755
0
        split->y = prog->end;
756
0
      }
757
0
    }
758
0
    break;
759
760
0
  case P_BOL: emit(prog, I_BOL); break;
761
0
  case P_EOL: emit(prog, I_EOL); break;
762
0
  case P_WORD: emit(prog, I_WORD); break;
763
0
  case P_NWORD: emit(prog, I_NWORD); break;
764
765
0
  case P_PAR:
766
0
    inst = emit(prog, I_LPAR);
767
0
    inst->n = node->n;
768
0
    compile(prog, node->x);
769
0
    inst = emit(prog, I_RPAR);
770
0
    inst->n = node->n;
771
0
    break;
772
0
  case P_PLA:
773
0
    split = emit(prog, I_PLA);
774
0
    compile(prog, node->x);
775
0
    emit(prog, I_END);
776
0
    split->x = split + 1;
777
0
    split->y = prog->end;
778
0
    break;
779
0
  case P_NLA:
780
0
    split = emit(prog, I_NLA);
781
0
    compile(prog, node->x);
782
0
    emit(prog, I_END);
783
0
    split->x = split + 1;
784
0
    split->y = prog->end;
785
0
    break;
786
787
0
  case P_ANY:
788
0
    emit(prog, I_ANY);
789
0
    break;
790
0
  case P_CHAR:
791
0
    inst = emit(prog, I_CHAR);
792
0
    inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
793
0
    break;
794
0
  case P_CCLASS:
795
0
    inst = emit(prog, I_CCLASS);
796
0
    inst->cc = prog->cclass + node->cc;
797
0
    break;
798
0
  case P_NCCLASS:
799
0
    inst = emit(prog, I_NCCLASS);
800
0
    inst->cc = prog->cclass + node->cc;
801
0
    break;
802
0
  case P_REF:
803
0
    inst = emit(prog, I_REF);
804
0
    inst->n = node->n;
805
0
    break;
806
0
  }
807
0
}
808
809
#ifdef TEST
810
static void dumpnode(struct cstate *g, Renode *node)
811
{
812
  Rune *p;
813
  Reclass *cc;
814
  if (!node) { printf("Empty"); return; }
815
  switch (node->type) {
816
  case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
817
  case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
818
  case P_REP:
819
    printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
820
    dumpnode(g, node->x);
821
    printf(")");
822
    break;
823
  case P_BOL: printf("Bol"); break;
824
  case P_EOL: printf("Eol"); break;
825
  case P_WORD: printf("Word"); break;
826
  case P_NWORD: printf("NotWord"); break;
827
  case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break;
828
  case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break;
829
  case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break;
830
  case P_ANY: printf("Any"); break;
831
  case P_CHAR: printf("Char(%c)", node->c); break;
832
  case P_CCLASS:
833
    printf("Class(");
834
    cc = g->cclass + node->cc;
835
    for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
836
    printf(")");
837
    break;
838
  case P_NCCLASS:
839
    printf("NotClass(");
840
    cc = g->cclass + node->cc;
841
    for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
842
    printf(")");
843
    break;
844
  case P_REF: printf("Ref(%d)", node->n); break;
845
  }
846
}
847
848
static void dumpcclass(Reclass *cc) {
849
  Rune *p;
850
  for (p = cc->spans; p < cc->end; p += 2) {
851
    if (p[0] > 32 && p[0] < 127)
852
      printf(" %c", p[0]);
853
    else
854
      printf(" \\x%02x", p[0]);
855
    if (p[1] > 32 && p[1] < 127)
856
      printf("-%c", p[1]);
857
    else
858
      printf("-\\x%02x", p[1]);
859
  }
860
  putchar('\n');
861
}
862
863
static void dumpprog(Reprog *prog)
864
{
865
  Reinst *inst;
866
  int i;
867
  for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
868
    printf("% 5d: ", i);
869
    switch (inst->opcode) {
870
    case I_END: puts("end"); break;
871
    case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
872
    case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
873
    case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
874
    case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
875
    case I_ANY: puts("any"); break;
876
    case I_ANYNL: puts("anynl"); break;
877
    case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
878
    case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break;
879
    case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break;
880
    case I_REF: printf("ref %d\n", inst->n); break;
881
    case I_BOL: puts("bol"); break;
882
    case I_EOL: puts("eol"); break;
883
    case I_WORD: puts("word"); break;
884
    case I_NWORD: puts("nword"); break;
885
    case I_LPAR: printf("lpar %d\n", inst->n); break;
886
    case I_RPAR: printf("rpar %d\n", inst->n); break;
887
    }
888
  }
889
}
890
#endif
891
892
Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
893
  const char *pattern, int cflags, const char **errorp)
894
0
{
895
0
  struct cstate g;
896
0
  Renode *node;
897
0
  Reinst *split, *jump;
898
0
  int i, n;
899
900
0
  g.pstart = NULL;
901
0
  g.prog = NULL;
902
903
0
  if (setjmp(g.kaboom)) {
904
0
    if (errorp) *errorp = g.error;
905
0
    alloc(ctx, g.pstart, 0);
906
0
    if (g.prog) {
907
0
      alloc(ctx, g.prog->cclass, 0);
908
0
      alloc(ctx, g.prog->start, 0);
909
0
      alloc(ctx, g.prog, 0);
910
0
    }
911
0
    return NULL;
912
0
  }
913
914
0
  g.prog = alloc(ctx, NULL, sizeof (Reprog));
915
0
  if (!g.prog)
916
0
    die(&g, "cannot allocate regular expression");
917
0
  g.prog->start = NULL;
918
0
  g.prog->cclass = NULL;
919
920
0
  n = strlen(pattern) * 2;
921
0
  if (n > REG_MAXPROG)
922
0
    die(&g, "program too large");
923
0
  if (n > 0) {
924
0
    g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
925
0
    if (!g.pstart)
926
0
      die(&g, "cannot allocate regular expression parse list");
927
0
  }
928
929
0
  g.source = pattern;
930
0
  g.ncclass = 0;
931
0
  g.nsub = 1;
932
0
  for (i = 0; i < REG_MAXSUB; ++i)
933
0
    g.sub[i] = 0;
934
935
0
  g.prog->flags = cflags;
936
937
0
  next(&g);
938
0
  node = parsealt(&g);
939
0
  if (g.lookahead == ')')
940
0
    die(&g, "unmatched ')'");
941
0
  if (g.lookahead != EOF)
942
0
    die(&g, "syntax error");
943
944
#ifdef TEST
945
  dumpnode(&g, node);
946
  putchar('\n');
947
#endif
948
949
0
  n = 6 + count(&g, node, 0);
950
0
  if (n < 0 || n > REG_MAXPROG)
951
0
    die(&g, "program too large");
952
953
0
  g.prog->nsub = g.nsub;
954
0
  g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
955
0
  if (!g.prog->start)
956
0
    die(&g, "cannot allocate regular expression instruction list");
957
958
0
  if (g.ncclass > 0) {
959
0
    g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass));
960
0
    if (!g.prog->cclass)
961
0
      die(&g, "cannot allocate regular expression character class list");
962
0
    memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass));
963
0
    for (i = 0; i < g.ncclass; ++i)
964
0
      g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans);
965
0
  }
966
967
0
  split = emit(g.prog, I_SPLIT);
968
0
  split->x = split + 3;
969
0
  split->y = split + 1;
970
0
  emit(g.prog, I_ANYNL);
971
0
  jump = emit(g.prog, I_JUMP);
972
0
  jump->x = split;
973
0
  emit(g.prog, I_LPAR);
974
0
  compile(g.prog, node);
975
0
  emit(g.prog, I_RPAR);
976
0
  emit(g.prog, I_END);
977
978
#ifdef TEST
979
  dumpprog(g.prog);
980
#endif
981
982
0
  alloc(ctx, g.pstart, 0);
983
984
0
  if (errorp) *errorp = NULL;
985
0
  return g.prog;
986
0
}
987
988
void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
989
0
{
990
0
  if (prog) {
991
0
    if (prog->cclass)
992
0
      alloc(ctx, prog->cclass, 0);
993
0
    alloc(ctx, prog->start, 0);
994
0
    alloc(ctx, prog, 0);
995
0
  }
996
0
}
997
998
static void *default_alloc(void *ctx, void *p, int n)
999
0
{
1000
0
  if (n == 0) {
1001
0
    free(p);
1002
0
    return NULL;
1003
0
  }
1004
0
  return realloc(p, (size_t)n);
1005
0
}
1006
1007
Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
1008
0
{
1009
0
  return regcompx(default_alloc, NULL, pattern, cflags, errorp);
1010
0
}
1011
1012
void regfree(Reprog *prog)
1013
0
{
1014
0
  regfreex(default_alloc, NULL, prog);
1015
0
}
1016
1017
/* Match */
1018
1019
static int isnewline(int c)
1020
0
{
1021
0
  return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
1022
0
}
1023
1024
static int iswordchar(int c)
1025
0
{
1026
0
  return c == '_' ||
1027
0
    (c >= 'a' && c <= 'z') ||
1028
0
    (c >= 'A' && c <= 'Z') ||
1029
0
    (c >= '0' && c <= '9');
1030
0
}
1031
1032
static int incclass(Reclass *cc, Rune c)
1033
0
{
1034
0
  Rune *p;
1035
0
  for (p = cc->spans; p < cc->end; p += 2)
1036
0
    if (p[0] <= c && c <= p[1])
1037
0
      return 1;
1038
0
  return 0;
1039
0
}
1040
1041
static int incclasscanon(Reclass *cc, Rune c)
1042
0
{
1043
0
  Rune *p, r;
1044
0
  for (p = cc->spans; p < cc->end; p += 2)
1045
0
    for (r = p[0]; r <= p[1]; ++r)
1046
0
      if (c == canon(r))
1047
0
        return 1;
1048
0
  return 0;
1049
0
}
1050
1051
static int strncmpcanon(const char *a, const char *b, int n)
1052
0
{
1053
0
  Rune ra, rb;
1054
0
  int c;
1055
0
  while (n--) {
1056
0
    if (!*a) return -1;
1057
0
    if (!*b) return 1;
1058
0
    a += chartorune(&ra, a);
1059
0
    b += chartorune(&rb, b);
1060
0
    c = canon(ra) - canon(rb);
1061
0
    if (c)
1062
0
      return c;
1063
0
  }
1064
0
  return 0;
1065
0
}
1066
1067
static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
1068
0
{
1069
0
  Resub scratch;
1070
0
  int result;
1071
0
  int i;
1072
0
  Rune c;
1073
1074
  /* stack overflow */
1075
0
  if (depth > REG_MAXREC)
1076
0
    return -1;
1077
1078
0
  for (;;) {
1079
0
    switch (pc->opcode) {
1080
0
    case I_END:
1081
0
      return 0;
1082
0
    case I_JUMP:
1083
0
      pc = pc->x;
1084
0
      break;
1085
0
    case I_SPLIT:
1086
0
      scratch = *out;
1087
0
      result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1088
0
      if (result == -1)
1089
0
        return -1;
1090
0
      if (result == 0) {
1091
0
        *out = scratch;
1092
0
        return 0;
1093
0
      }
1094
0
      pc = pc->y;
1095
0
      break;
1096
1097
0
    case I_PLA:
1098
0
      result = match(pc->x, sp, bol, flags, out, depth+1);
1099
0
      if (result == -1)
1100
0
        return -1;
1101
0
      if (result == 1)
1102
0
        return 1;
1103
0
      pc = pc->y;
1104
0
      break;
1105
0
    case I_NLA:
1106
0
      scratch = *out;
1107
0
      result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1108
0
      if (result == -1)
1109
0
        return -1;
1110
0
      if (result == 0)
1111
0
        return 1;
1112
0
      pc = pc->y;
1113
0
      break;
1114
1115
0
    case I_ANYNL:
1116
0
      if (!*sp) return 1;
1117
0
      sp += chartorune(&c, sp);
1118
0
      pc = pc + 1;
1119
0
      break;
1120
0
    case I_ANY:
1121
0
      if (!*sp) return 1;
1122
0
      sp += chartorune(&c, sp);
1123
0
      if (isnewline(c))
1124
0
        return 1;
1125
0
      pc = pc + 1;
1126
0
      break;
1127
0
    case I_CHAR:
1128
0
      if (!*sp) return 1;
1129
0
      sp += chartorune(&c, sp);
1130
0
      if (flags & REG_ICASE)
1131
0
        c = canon(c);
1132
0
      if (c != pc->c)
1133
0
        return 1;
1134
0
      pc = pc + 1;
1135
0
      break;
1136
0
    case I_CCLASS:
1137
0
      if (!*sp) return 1;
1138
0
      sp += chartorune(&c, sp);
1139
0
      if (flags & REG_ICASE) {
1140
0
        if (!incclasscanon(pc->cc, canon(c)))
1141
0
          return 1;
1142
0
      } else {
1143
0
        if (!incclass(pc->cc, c))
1144
0
          return 1;
1145
0
      }
1146
0
      pc = pc + 1;
1147
0
      break;
1148
0
    case I_NCCLASS:
1149
0
      if (!*sp) return 1;
1150
0
      sp += chartorune(&c, sp);
1151
0
      if (flags & REG_ICASE) {
1152
0
        if (incclasscanon(pc->cc, canon(c)))
1153
0
          return 1;
1154
0
      } else {
1155
0
        if (incclass(pc->cc, c))
1156
0
          return 1;
1157
0
      }
1158
0
      pc = pc + 1;
1159
0
      break;
1160
0
    case I_REF:
1161
0
      i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1162
0
      if (flags & REG_ICASE) {
1163
0
        if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1164
0
          return 1;
1165
0
      } else {
1166
0
        if (strncmp(sp, out->sub[pc->n].sp, i))
1167
0
          return 1;
1168
0
      }
1169
0
      if (i > 0)
1170
0
        sp += i;
1171
0
      pc = pc + 1;
1172
0
      break;
1173
1174
0
    case I_BOL:
1175
0
      if (sp == bol && !(flags & REG_NOTBOL)) {
1176
0
        pc = pc + 1;
1177
0
        break;
1178
0
      }
1179
0
      if (flags & REG_NEWLINE) {
1180
0
        if (sp > bol && isnewline(sp[-1])) {
1181
0
          pc = pc + 1;
1182
0
          break;
1183
0
        }
1184
0
      }
1185
0
      return 1;
1186
0
    case I_EOL:
1187
0
      if (*sp == 0) {
1188
0
        pc = pc + 1;
1189
0
        break;
1190
0
      }
1191
0
      if (flags & REG_NEWLINE) {
1192
0
        if (isnewline(*sp)) {
1193
0
          pc = pc + 1;
1194
0
          break;
1195
0
        }
1196
0
      }
1197
0
      return 1;
1198
0
    case I_WORD:
1199
0
      i = sp > bol && iswordchar(sp[-1]);
1200
0
      i ^= iswordchar(sp[0]);
1201
0
      if (!i)
1202
0
        return 1;
1203
0
      pc = pc + 1;
1204
0
      break;
1205
0
    case I_NWORD:
1206
0
      i = sp > bol && iswordchar(sp[-1]);
1207
0
      i ^= iswordchar(sp[0]);
1208
0
      if (i)
1209
0
        return 1;
1210
0
      pc = pc + 1;
1211
0
      break;
1212
1213
0
    case I_LPAR:
1214
0
      out->sub[pc->n].sp = sp;
1215
0
      pc = pc + 1;
1216
0
      break;
1217
0
    case I_RPAR:
1218
0
      out->sub[pc->n].ep = sp;
1219
0
      pc = pc + 1;
1220
0
      break;
1221
0
    default:
1222
0
      return 1;
1223
0
    }
1224
0
  }
1225
0
}
1226
1227
int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1228
0
{
1229
0
  Resub scratch;
1230
0
  int i;
1231
1232
0
  if (!sub)
1233
0
    sub = &scratch;
1234
1235
0
  sub->nsub = prog->nsub;
1236
0
  for (i = 0; i < REG_MAXSUB; ++i)
1237
0
    sub->sub[i].sp = sub->sub[i].ep = NULL;
1238
1239
0
  return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
1240
0
}
1241
1242
#ifdef TEST
1243
int main(int argc, char **argv)
1244
{
1245
  const char *error;
1246
  const char *s;
1247
  Reprog *p;
1248
  Resub m;
1249
  int i;
1250
1251
  if (argc > 1) {
1252
    p = regcomp(argv[1], 0, &error);
1253
    if (!p) {
1254
      fprintf(stderr, "regcomp: %s\n", error);
1255
      return 1;
1256
    }
1257
1258
    if (argc > 2) {
1259
      s = argv[2];
1260
      printf("nsub = %d\n", p->nsub);
1261
      if (!regexec(p, s, &m, 0)) {
1262
        for (i = 0; i < m.nsub; ++i) {
1263
          int n = m.sub[i].ep - m.sub[i].sp;
1264
          if (n > 0)
1265
            printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1266
          else
1267
            printf("match %d: n=0 ''\n", i);
1268
        }
1269
      } else {
1270
        printf("no match\n");
1271
      }
1272
    }
1273
  }
1274
1275
  return 0;
1276
}
1277
#endif