Coverage Report

Created: 2024-05-20 06:23

/src/mupdf/thirdparty/mujs/regexp.c
Line
Count
Source (jump to first uncovered line)
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include <string.h>
4
#include <setjmp.h>
5
#include <limits.h>
6
7
#include "regexp.h"
8
#include "utf.h"
9
10
0
#define emit regemit
11
0
#define next regnext
12
0
#define accept regaccept
13
14
0
#define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15
16
0
#define REPINF 255
17
#ifndef REG_MAXPROG
18
0
#define REG_MAXPROG (32 << 10)
19
#endif
20
#ifndef REG_MAXREC
21
0
#define REG_MAXREC 1024
22
#endif
23
#ifndef REG_MAXSPAN
24
#define REG_MAXSPAN 64
25
#endif
26
#ifndef REG_MAXCLASS
27
#define REG_MAXCLASS 16
28
#endif
29
30
typedef struct Reclass Reclass;
31
typedef struct Renode Renode;
32
typedef struct Reinst Reinst;
33
typedef struct Rethread Rethread;
34
35
struct Reclass {
36
  Rune *end;
37
  Rune spans[REG_MAXSPAN];
38
};
39
40
struct Reprog {
41
  Reinst *start, *end;
42
  int flags;
43
  int nsub;
44
  Reclass cclass[REG_MAXCLASS];
45
};
46
47
struct cstate {
48
  Reprog *prog;
49
  Renode *pstart, *pend;
50
51
  const char *source;
52
  int ncclass;
53
  int nsub;
54
  Renode *sub[REG_MAXSUB];
55
56
  int lookahead;
57
  Rune yychar;
58
  Reclass *yycc;
59
  int yymin, yymax;
60
61
  const char *error;
62
  jmp_buf kaboom;
63
};
64
65
static void die(struct cstate *g, const char *message)
66
0
{
67
0
  g->error = message;
68
0
  longjmp(g->kaboom, 1);
69
0
}
70
71
static int canon(Rune c)
72
0
{
73
0
  Rune u = toupperrune(c);
74
0
  if (c >= 128 && u < 128)
75
0
    return c;
76
0
  return u;
77
0
}
78
79
/* Scan */
80
81
enum {
82
  L_CHAR = 256,
83
  L_CCLASS, /* character class */
84
  L_NCCLASS,  /* negative character class */
85
  L_NC,   /* "(?:" no capture */
86
  L_PLA,    /* "(?=" positive lookahead */
87
  L_NLA,    /* "(?!" negative lookahead */
88
  L_WORD,   /* "\b" word boundary */
89
  L_NWORD,  /* "\B" non-word boundary */
90
  L_REF,    /* "\1" back-reference */
91
  L_COUNT,  /* {M,N} */
92
};
93
94
static int hex(struct cstate *g, int c)
95
0
{
96
0
  if (c >= '0' && c <= '9') return c - '0';
97
0
  if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
98
0
  if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
99
0
  die(g, "invalid escape sequence");
100
0
  return 0;
101
0
}
102
103
static int dec(struct cstate *g, int c)
104
0
{
105
0
  if (c >= '0' && c <= '9') return c - '0';
106
0
  die(g, "invalid quantifier");
107
0
  return 0;
108
0
}
109
110
0
#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789"
111
112
static int isunicodeletter(int c)
113
0
{
114
0
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
115
0
}
116
117
static int nextrune(struct cstate *g)
118
0
{
119
0
  if (!*g->source) {
120
0
    g->yychar = EOF;
121
0
    return 0;
122
0
  }
123
0
  g->source += chartorune(&g->yychar, g->source);
124
0
  if (g->yychar == '\\') {
125
0
    if (!*g->source)
126
0
      die(g, "unterminated escape sequence");
127
0
    g->source += chartorune(&g->yychar, g->source);
128
0
    switch (g->yychar) {
129
0
    case 'f': g->yychar = '\f'; return 0;
130
0
    case 'n': g->yychar = '\n'; return 0;
131
0
    case 'r': g->yychar = '\r'; return 0;
132
0
    case 't': g->yychar = '\t'; return 0;
133
0
    case 'v': g->yychar = '\v'; return 0;
134
0
    case 'c':
135
0
      if (!g->source[0])
136
0
        die(g, "unterminated escape sequence");
137
0
      g->yychar = (*g->source++) & 31;
138
0
      return 0;
139
0
    case 'x':
140
0
      if (!g->source[0] || !g->source[1])
141
0
        die(g, "unterminated escape sequence");
142
0
      g->yychar = hex(g, *g->source++) << 4;
143
0
      g->yychar += hex(g, *g->source++);
144
0
      if (g->yychar == 0) {
145
0
        g->yychar = '0';
146
0
        return 1;
147
0
      }
148
0
      return 0;
149
0
    case 'u':
150
0
      if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3])
151
0
        die(g, "unterminated escape sequence");
152
0
      g->yychar = hex(g, *g->source++) << 12;
153
0
      g->yychar += hex(g, *g->source++) << 8;
154
0
      g->yychar += hex(g, *g->source++) << 4;
155
0
      g->yychar += hex(g, *g->source++);
156
0
      if (g->yychar == 0) {
157
0
        g->yychar = '0';
158
0
        return 1;
159
0
      }
160
0
      return 0;
161
0
    case 0:
162
0
      g->yychar = '0';
163
0
      return 1;
164
0
    }
165
0
    if (strchr(ESCAPES, g->yychar))
166
0
      return 1;
167
0
    if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
168
0
      die(g, "invalid escape character");
169
0
    return 0;
170
0
  }
171
0
  return 0;
172
0
}
173
174
static int lexcount(struct cstate *g)
175
0
{
176
0
  g->yychar = *g->source++;
177
178
0
  g->yymin = dec(g, g->yychar);
179
0
  g->yychar = *g->source++;
180
0
  while (g->yychar != ',' && g->yychar != '}') {
181
0
    g->yymin = g->yymin * 10 + dec(g, g->yychar);
182
0
    g->yychar = *g->source++;
183
0
    if (g->yymin >= REPINF)
184
0
      die(g, "numeric overflow");
185
0
  }
186
187
0
  if (g->yychar == ',') {
188
0
    g->yychar = *g->source++;
189
0
    if (g->yychar == '}') {
190
0
      g->yymax = REPINF;
191
0
    } else {
192
0
      g->yymax = dec(g, g->yychar);
193
0
      g->yychar = *g->source++;
194
0
      while (g->yychar != '}') {
195
0
        g->yymax = g->yymax * 10 + dec(g, g->yychar);
196
0
        g->yychar = *g->source++;
197
0
        if (g->yymax >= REPINF)
198
0
          die(g, "numeric overflow");
199
0
      }
200
0
    }
201
0
  } else {
202
0
    g->yymax = g->yymin;
203
0
  }
204
205
0
  return L_COUNT;
206
0
}
207
208
static void newcclass(struct cstate *g)
209
0
{
210
0
  if (g->ncclass >= nelem(g->prog->cclass))
211
0
    die(g, "too many character classes");
212
0
  g->yycc = g->prog->cclass + g->ncclass++;
213
0
  g->yycc->end = g->yycc->spans;
214
0
}
215
216
static void addrange(struct cstate *g, Rune a, Rune b)
217
0
{
218
0
  if (a > b)
219
0
    die(g, "invalid character class range");
220
0
  if (g->yycc->end + 2 >= g->yycc->spans + nelem(g->yycc->spans))
221
0
    die(g, "too many character class ranges");
222
0
  *g->yycc->end++ = a;
223
0
  *g->yycc->end++ = b;
224
0
}
225
226
static void addranges_d(struct cstate *g)
227
0
{
228
0
  addrange(g, '0', '9');
229
0
}
230
231
static void addranges_D(struct cstate *g)
232
0
{
233
0
  addrange(g, 0, '0'-1);
234
0
  addrange(g, '9'+1, 0xFFFF);
235
0
}
236
237
static void addranges_s(struct cstate *g)
238
0
{
239
0
  addrange(g, 0x9, 0xD);
240
0
  addrange(g, 0x20, 0x20);
241
0
  addrange(g, 0xA0, 0xA0);
242
0
  addrange(g, 0x2028, 0x2029);
243
0
  addrange(g, 0xFEFF, 0xFEFF);
244
0
}
245
246
static void addranges_S(struct cstate *g)
247
0
{
248
0
  addrange(g, 0, 0x9-1);
249
0
  addrange(g, 0xD+1, 0x20-1);
250
0
  addrange(g, 0x20+1, 0xA0-1);
251
0
  addrange(g, 0xA0+1, 0x2028-1);
252
0
  addrange(g, 0x2029+1, 0xFEFF-1);
253
0
  addrange(g, 0xFEFF+1, 0xFFFF);
254
0
}
255
256
static void addranges_w(struct cstate *g)
257
0
{
258
0
  addrange(g, '0', '9');
259
0
  addrange(g, 'A', 'Z');
260
0
  addrange(g, '_', '_');
261
0
  addrange(g, 'a', 'z');
262
0
}
263
264
static void addranges_W(struct cstate *g)
265
0
{
266
0
  addrange(g, 0, '0'-1);
267
0
  addrange(g, '9'+1, 'A'-1);
268
0
  addrange(g, 'Z'+1, '_'-1);
269
0
  addrange(g, '_'+1, 'a'-1);
270
0
  addrange(g, 'z'+1, 0xFFFF);
271
0
}
272
273
static int lexclass(struct cstate *g)
274
0
{
275
0
  int type = L_CCLASS;
276
0
  int quoted, havesave, havedash;
277
0
  Rune save = 0;
278
279
0
  newcclass(g);
280
281
0
  quoted = nextrune(g);
282
0
  if (!quoted && g->yychar == '^') {
283
0
    type = L_NCCLASS;
284
0
    quoted = nextrune(g);
285
0
  }
286
287
0
  havesave = havedash = 0;
288
0
  for (;;) {
289
0
    if (g->yychar == EOF)
290
0
      die(g, "unterminated character class");
291
0
    if (!quoted && g->yychar == ']')
292
0
      break;
293
294
0
    if (!quoted && g->yychar == '-') {
295
0
      if (havesave) {
296
0
        if (havedash) {
297
0
          addrange(g, save, '-');
298
0
          havesave = havedash = 0;
299
0
        } else {
300
0
          havedash = 1;
301
0
        }
302
0
      } else {
303
0
        save = '-';
304
0
        havesave = 1;
305
0
      }
306
0
    } else if (quoted && strchr("DSWdsw", g->yychar)) {
307
0
      if (havesave) {
308
0
        addrange(g, save, save);
309
0
        if (havedash)
310
0
          addrange(g, '-', '-');
311
0
      }
312
0
      switch (g->yychar) {
313
0
      case 'd': addranges_d(g); break;
314
0
      case 's': addranges_s(g); break;
315
0
      case 'w': addranges_w(g); break;
316
0
      case 'D': addranges_D(g); break;
317
0
      case 'S': addranges_S(g); break;
318
0
      case 'W': addranges_W(g); break;
319
0
      }
320
0
      havesave = havedash = 0;
321
0
    } else {
322
0
      if (quoted) {
323
0
        if (g->yychar == 'b')
324
0
          g->yychar = '\b';
325
0
        else if (g->yychar == '0')
326
0
          g->yychar = 0;
327
        /* else identity escape */
328
0
      }
329
0
      if (havesave) {
330
0
        if (havedash) {
331
0
          addrange(g, save, g->yychar);
332
0
          havesave = havedash = 0;
333
0
        } else {
334
0
          addrange(g, save, save);
335
0
          save = g->yychar;
336
0
        }
337
0
      } else {
338
0
        save = g->yychar;
339
0
        havesave = 1;
340
0
      }
341
0
    }
342
343
0
    quoted = nextrune(g);
344
0
  }
345
346
0
  if (havesave) {
347
0
    addrange(g, save, save);
348
0
    if (havedash)
349
0
      addrange(g, '-', '-');
350
0
  }
351
352
0
  return type;
353
0
}
354
355
static int lex(struct cstate *g)
356
0
{
357
0
  int quoted = nextrune(g);
358
0
  if (quoted) {
359
0
    switch (g->yychar) {
360
0
    case 'b': return L_WORD;
361
0
    case 'B': return L_NWORD;
362
0
    case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
363
0
    case 's': newcclass(g); addranges_s(g); return L_CCLASS;
364
0
    case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
365
0
    case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
366
0
    case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
367
0
    case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
368
0
    case '0': g->yychar = 0; return L_CHAR;
369
0
    }
370
0
    if (g->yychar >= '0' && g->yychar <= '9') {
371
0
      g->yychar -= '0';
372
0
      if (*g->source >= '0' && *g->source <= '9')
373
0
        g->yychar = g->yychar * 10 + *g->source++ - '0';
374
0
      return L_REF;
375
0
    }
376
0
    return L_CHAR;
377
0
  }
378
379
0
  switch (g->yychar) {
380
0
  case EOF:
381
0
  case '$': case ')': case '*': case '+':
382
0
  case '.': case '?': case '^': case '|':
383
0
    return g->yychar;
384
0
  }
385
386
0
  if (g->yychar == '{')
387
0
    return lexcount(g);
388
0
  if (g->yychar == '[')
389
0
    return lexclass(g);
390
0
  if (g->yychar == '(') {
391
0
    if (g->source[0] == '?') {
392
0
      if (g->source[1] == ':') {
393
0
        g->source += 2;
394
0
        return L_NC;
395
0
      }
396
0
      if (g->source[1] == '=') {
397
0
        g->source += 2;
398
0
        return L_PLA;
399
0
      }
400
0
      if (g->source[1] == '!') {
401
0
        g->source += 2;
402
0
        return L_NLA;
403
0
      }
404
0
    }
405
0
    return '(';
406
0
  }
407
408
0
  return L_CHAR;
409
0
}
410
411
/* Parse */
412
413
enum {
414
  P_CAT, P_ALT, P_REP,
415
  P_BOL, P_EOL, P_WORD, P_NWORD,
416
  P_PAR, P_PLA, P_NLA,
417
  P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
418
  P_REF,
419
};
420
421
struct Renode {
422
  unsigned char type;
423
  unsigned char ng, m, n;
424
  Rune c;
425
  Reclass *cc;
426
  Renode *x;
427
  Renode *y;
428
};
429
430
static Renode *newnode(struct cstate *g, int type)
431
0
{
432
0
  Renode *node = g->pend++;
433
0
  node->type = type;
434
0
  node->cc = NULL;
435
0
  node->c = 0;
436
0
  node->ng = 0;
437
0
  node->m = 0;
438
0
  node->n = 0;
439
0
  node->x = node->y = NULL;
440
0
  return node;
441
0
}
442
443
static int empty(Renode *node)
444
0
{
445
0
  if (!node) return 1;
446
0
  switch (node->type) {
447
0
  default: return 1;
448
0
  case P_CAT: return empty(node->x) && empty(node->y);
449
0
  case P_ALT: return empty(node->x) || empty(node->y);
450
0
  case P_REP: return empty(node->x) || node->m == 0;
451
0
  case P_PAR: return empty(node->x);
452
0
  case P_REF: return empty(node->x);
453
0
  case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
454
0
  }
455
0
}
456
457
static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
458
0
{
459
0
  Renode *rep = newnode(g, P_REP);
460
0
  if (max == REPINF && empty(atom))
461
0
    die(g, "infinite loop matching the empty string");
462
0
  rep->ng = ng;
463
0
  rep->m = min;
464
0
  rep->n = max;
465
0
  rep->x = atom;
466
0
  return rep;
467
0
}
468
469
static void next(struct cstate *g)
470
0
{
471
0
  g->lookahead = lex(g);
472
0
}
473
474
static int accept(struct cstate *g, int t)
475
0
{
476
0
  if (g->lookahead == t) {
477
0
    next(g);
478
0
    return 1;
479
0
  }
480
0
  return 0;
481
0
}
482
483
static Renode *parsealt(struct cstate *g);
484
485
static Renode *parseatom(struct cstate *g)
486
0
{
487
0
  Renode *atom;
488
0
  if (g->lookahead == L_CHAR) {
489
0
    atom = newnode(g, P_CHAR);
490
0
    atom->c = g->yychar;
491
0
    next(g);
492
0
    return atom;
493
0
  }
494
0
  if (g->lookahead == L_CCLASS) {
495
0
    atom = newnode(g, P_CCLASS);
496
0
    atom->cc = g->yycc;
497
0
    next(g);
498
0
    return atom;
499
0
  }
500
0
  if (g->lookahead == L_NCCLASS) {
501
0
    atom = newnode(g, P_NCCLASS);
502
0
    atom->cc = g->yycc;
503
0
    next(g);
504
0
    return atom;
505
0
  }
506
0
  if (g->lookahead == L_REF) {
507
0
    atom = newnode(g, P_REF);
508
0
    if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
509
0
      die(g, "invalid back-reference");
510
0
    atom->n = g->yychar;
511
0
    atom->x = g->sub[g->yychar];
512
0
    next(g);
513
0
    return atom;
514
0
  }
515
0
  if (accept(g, '.'))
516
0
    return newnode(g, P_ANY);
517
0
  if (accept(g, '(')) {
518
0
    atom = newnode(g, P_PAR);
519
0
    if (g->nsub == REG_MAXSUB)
520
0
      die(g, "too many captures");
521
0
    atom->n = g->nsub++;
522
0
    atom->x = parsealt(g);
523
0
    g->sub[atom->n] = atom;
524
0
    if (!accept(g, ')'))
525
0
      die(g, "unmatched '('");
526
0
    return atom;
527
0
  }
528
0
  if (accept(g, L_NC)) {
529
0
    atom = parsealt(g);
530
0
    if (!accept(g, ')'))
531
0
      die(g, "unmatched '('");
532
0
    return atom;
533
0
  }
534
0
  if (accept(g, L_PLA)) {
535
0
    atom = newnode(g, P_PLA);
536
0
    atom->x = parsealt(g);
537
0
    if (!accept(g, ')'))
538
0
      die(g, "unmatched '('");
539
0
    return atom;
540
0
  }
541
0
  if (accept(g, L_NLA)) {
542
0
    atom = newnode(g, P_NLA);
543
0
    atom->x = parsealt(g);
544
0
    if (!accept(g, ')'))
545
0
      die(g, "unmatched '('");
546
0
    return atom;
547
0
  }
548
0
  die(g, "syntax error");
549
0
  return NULL;
550
0
}
551
552
static Renode *parserep(struct cstate *g)
553
0
{
554
0
  Renode *atom;
555
556
0
  if (accept(g, '^')) return newnode(g, P_BOL);
557
0
  if (accept(g, '$')) return newnode(g, P_EOL);
558
0
  if (accept(g, L_WORD)) return newnode(g, P_WORD);
559
0
  if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
560
561
0
  atom = parseatom(g);
562
0
  if (g->lookahead == L_COUNT) {
563
0
    int min = g->yymin, max = g->yymax;
564
0
    next(g);
565
0
    if (max < min)
566
0
      die(g, "invalid quantifier");
567
0
    return newrep(g, atom, accept(g, '?'), min, max);
568
0
  }
569
0
  if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
570
0
  if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
571
0
  if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
572
0
  return atom;
573
0
}
574
575
static Renode *parsecat(struct cstate *g)
576
0
{
577
0
  Renode *cat, *head, **tail;
578
0
  if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
579
    /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
580
0
    head = parserep(g);
581
0
    tail = &head;
582
0
    while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
583
0
      cat = newnode(g, P_CAT);
584
0
      cat->x = *tail;
585
0
      cat->y = parserep(g);
586
0
      *tail = cat;
587
0
      tail = &cat->y;
588
0
    }
589
0
    return head;
590
0
  }
591
0
  return NULL;
592
0
}
593
594
static Renode *parsealt(struct cstate *g)
595
0
{
596
0
  Renode *alt, *x;
597
0
  alt = parsecat(g);
598
0
  while (accept(g, '|')) {
599
0
    x = alt;
600
0
    alt = newnode(g, P_ALT);
601
0
    alt->x = x;
602
0
    alt->y = parsecat(g);
603
0
  }
604
0
  return alt;
605
0
}
606
607
/* Compile */
608
609
enum {
610
  I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
611
  I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
612
  I_BOL, I_EOL, I_WORD, I_NWORD,
613
  I_LPAR, I_RPAR
614
};
615
616
struct Reinst {
617
  unsigned char opcode;
618
  unsigned char n;
619
  Rune c;
620
  Reclass *cc;
621
  Reinst *x;
622
  Reinst *y;
623
};
624
625
static int count(struct cstate *g, Renode *node, int depth)
626
0
{
627
0
  int min, max, n;
628
0
  if (!node) return 0;
629
0
  if (++depth > REG_MAXREC) die(g, "stack overflow");
630
0
  switch (node->type) {
631
0
  default: return 1;
632
0
  case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth);
633
0
  case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2;
634
0
  case P_REP:
635
0
    min = node->m;
636
0
    max = node->n;
637
0
    if (min == max) n = count(g, node->x, depth) * min;
638
0
    else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min);
639
0
    else n = count(g, node->x, depth) * (min + 1) + 2;
640
0
    if (n < 0 || n > REG_MAXPROG) die(g, "program too large");
641
0
    return n;
642
0
  case P_PAR: return count(g, node->x, depth) + 2;
643
0
  case P_PLA: return count(g, node->x, depth) + 2;
644
0
  case P_NLA: return count(g, node->x, depth) + 2;
645
0
  }
646
0
}
647
648
static Reinst *emit(Reprog *prog, int opcode)
649
0
{
650
0
  Reinst *inst = prog->end++;
651
0
  inst->opcode = opcode;
652
0
  inst->n = 0;
653
0
  inst->c = 0;
654
0
  inst->cc = NULL;
655
0
  inst->x = inst->y = NULL;
656
0
  return inst;
657
0
}
658
659
static void compile(Reprog *prog, Renode *node)
660
0
{
661
0
  Reinst *inst, *split, *jump;
662
0
  int i;
663
664
0
loop:
665
0
  if (!node)
666
0
    return;
667
668
0
  switch (node->type) {
669
0
  case P_CAT:
670
0
    compile(prog, node->x);
671
0
    node = node->y;
672
0
    goto loop;
673
674
0
  case P_ALT:
675
0
    split = emit(prog, I_SPLIT);
676
0
    compile(prog, node->x);
677
0
    jump = emit(prog, I_JUMP);
678
0
    compile(prog, node->y);
679
0
    split->x = split + 1;
680
0
    split->y = jump + 1;
681
0
    jump->x = prog->end;
682
0
    break;
683
684
0
  case P_REP:
685
0
    inst = NULL; /* silence compiler warning. assert(node->m > 0). */
686
0
    for (i = 0; i < node->m; ++i) {
687
0
      inst = prog->end;
688
0
      compile(prog, node->x);
689
0
    }
690
0
    if (node->m == node->n)
691
0
      break;
692
0
    if (node->n < REPINF) {
693
0
      for (i = node->m; i < node->n; ++i) {
694
0
        split = emit(prog, I_SPLIT);
695
0
        compile(prog, node->x);
696
0
        if (node->ng) {
697
0
          split->y = split + 1;
698
0
          split->x = prog->end;
699
0
        } else {
700
0
          split->x = split + 1;
701
0
          split->y = prog->end;
702
0
        }
703
0
      }
704
0
    } else if (node->m == 0) {
705
0
      split = emit(prog, I_SPLIT);
706
0
      compile(prog, node->x);
707
0
      jump = emit(prog, I_JUMP);
708
0
      if (node->ng) {
709
0
        split->y = split + 1;
710
0
        split->x = prog->end;
711
0
      } else {
712
0
        split->x = split + 1;
713
0
        split->y = prog->end;
714
0
      }
715
0
      jump->x = split;
716
0
    } else {
717
0
      split = emit(prog, I_SPLIT);
718
0
      if (node->ng) {
719
0
        split->y = inst;
720
0
        split->x = prog->end;
721
0
      } else {
722
0
        split->x = inst;
723
0
        split->y = prog->end;
724
0
      }
725
0
    }
726
0
    break;
727
728
0
  case P_BOL: emit(prog, I_BOL); break;
729
0
  case P_EOL: emit(prog, I_EOL); break;
730
0
  case P_WORD: emit(prog, I_WORD); break;
731
0
  case P_NWORD: emit(prog, I_NWORD); break;
732
733
0
  case P_PAR:
734
0
    inst = emit(prog, I_LPAR);
735
0
    inst->n = node->n;
736
0
    compile(prog, node->x);
737
0
    inst = emit(prog, I_RPAR);
738
0
    inst->n = node->n;
739
0
    break;
740
0
  case P_PLA:
741
0
    split = emit(prog, I_PLA);
742
0
    compile(prog, node->x);
743
0
    emit(prog, I_END);
744
0
    split->x = split + 1;
745
0
    split->y = prog->end;
746
0
    break;
747
0
  case P_NLA:
748
0
    split = emit(prog, I_NLA);
749
0
    compile(prog, node->x);
750
0
    emit(prog, I_END);
751
0
    split->x = split + 1;
752
0
    split->y = prog->end;
753
0
    break;
754
755
0
  case P_ANY:
756
0
    emit(prog, I_ANY);
757
0
    break;
758
0
  case P_CHAR:
759
0
    inst = emit(prog, I_CHAR);
760
0
    inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
761
0
    break;
762
0
  case P_CCLASS:
763
0
    inst = emit(prog, I_CCLASS);
764
0
    inst->cc = node->cc;
765
0
    break;
766
0
  case P_NCCLASS:
767
0
    inst = emit(prog, I_NCCLASS);
768
0
    inst->cc = node->cc;
769
0
    break;
770
0
  case P_REF:
771
0
    inst = emit(prog, I_REF);
772
0
    inst->n = node->n;
773
0
    break;
774
0
  }
775
0
}
776
777
#ifdef TEST
778
static void dumpnode(Renode *node)
779
{
780
  Rune *p;
781
  if (!node) { printf("Empty"); return; }
782
  switch (node->type) {
783
  case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
784
  case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
785
  case P_REP:
786
    printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
787
    dumpnode(node->x);
788
    printf(")");
789
    break;
790
  case P_BOL: printf("Bol"); break;
791
  case P_EOL: printf("Eol"); break;
792
  case P_WORD: printf("Word"); break;
793
  case P_NWORD: printf("NotWord"); break;
794
  case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
795
  case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
796
  case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
797
  case P_ANY: printf("Any"); break;
798
  case P_CHAR: printf("Char(%c)", node->c); break;
799
  case P_CCLASS:
800
    printf("Class(");
801
    for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
802
    printf(")");
803
    break;
804
  case P_NCCLASS:
805
    printf("NotClass(");
806
    for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
807
    printf(")");
808
    break;
809
  case P_REF: printf("Ref(%d)", node->n); break;
810
  }
811
}
812
813
static void dumpcclass(Reclass *cc) {
814
  Rune *p;
815
  for (p = cc->spans; p < cc->end; p += 2) {
816
    if (p[0] > 32 && p[0] < 127)
817
      printf(" %c", p[0]);
818
    else
819
      printf(" \\x%02x", p[0]);
820
    if (p[1] > 32 && p[1] < 127)
821
      printf("-%c", p[1]);
822
    else
823
      printf("-\\x%02x", p[1]);
824
  }
825
  putchar('\n');
826
}
827
828
static void dumpprog(Reprog *prog)
829
{
830
  Reinst *inst;
831
  int i;
832
  for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
833
    printf("% 5d: ", i);
834
    switch (inst->opcode) {
835
    case I_END: puts("end"); break;
836
    case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
837
    case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
838
    case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
839
    case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
840
    case I_ANY: puts("any"); break;
841
    case I_ANYNL: puts("anynl"); break;
842
    case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
843
    case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break;
844
    case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break;
845
    case I_REF: printf("ref %d\n", inst->n); break;
846
    case I_BOL: puts("bol"); break;
847
    case I_EOL: puts("eol"); break;
848
    case I_WORD: puts("word"); break;
849
    case I_NWORD: puts("nword"); break;
850
    case I_LPAR: printf("lpar %d\n", inst->n); break;
851
    case I_RPAR: printf("rpar %d\n", inst->n); break;
852
    }
853
  }
854
}
855
#endif
856
857
Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
858
  const char *pattern, int cflags, const char **errorp)
859
0
{
860
0
  struct cstate g;
861
0
  Renode *node;
862
0
  Reinst *split, *jump;
863
0
  int i, n;
864
865
0
  g.pstart = NULL;
866
0
  g.prog = NULL;
867
868
0
  if (setjmp(g.kaboom)) {
869
0
    if (errorp) *errorp = g.error;
870
0
    alloc(ctx, g.pstart, 0);
871
0
    alloc(ctx, g.prog, 0);
872
0
    return NULL;
873
0
  }
874
875
0
  g.prog = alloc(ctx, NULL, sizeof (Reprog));
876
0
  if (!g.prog)
877
0
    die(&g, "cannot allocate regular expression");
878
0
  n = strlen(pattern) * 2;
879
0
  if (n > REG_MAXPROG)
880
0
    die(&g, "program too large");
881
0
  if (n > 0) {
882
0
    g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
883
0
    if (!g.pstart)
884
0
      die(&g, "cannot allocate regular expression parse list");
885
0
  }
886
887
0
  g.source = pattern;
888
0
  g.ncclass = 0;
889
0
  g.nsub = 1;
890
0
  for (i = 0; i < REG_MAXSUB; ++i)
891
0
    g.sub[i] = 0;
892
893
0
  g.prog->flags = cflags;
894
895
0
  next(&g);
896
0
  node = parsealt(&g);
897
0
  if (g.lookahead == ')')
898
0
    die(&g, "unmatched ')'");
899
0
  if (g.lookahead != EOF)
900
0
    die(&g, "syntax error");
901
902
#ifdef TEST
903
  dumpnode(node);
904
  putchar('\n');
905
#endif
906
907
0
  n = 6 + count(&g, node, 0);
908
0
  if (n < 0 || n > REG_MAXPROG)
909
0
    die(&g, "program too large");
910
911
0
  g.prog->nsub = g.nsub;
912
0
  g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
913
0
  if (!g.prog->start)
914
0
    die(&g, "cannot allocate regular expression instruction list");
915
916
0
  split = emit(g.prog, I_SPLIT);
917
0
  split->x = split + 3;
918
0
  split->y = split + 1;
919
0
  emit(g.prog, I_ANYNL);
920
0
  jump = emit(g.prog, I_JUMP);
921
0
  jump->x = split;
922
0
  emit(g.prog, I_LPAR);
923
0
  compile(g.prog, node);
924
0
  emit(g.prog, I_RPAR);
925
0
  emit(g.prog, I_END);
926
927
#ifdef TEST
928
  dumpprog(g.prog);
929
#endif
930
931
0
  alloc(ctx, g.pstart, 0);
932
933
0
  if (errorp) *errorp = NULL;
934
0
  return g.prog;
935
0
}
936
937
void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
938
0
{
939
0
  if (prog) {
940
0
    alloc(ctx, prog->start, 0);
941
0
    alloc(ctx, prog, 0);
942
0
  }
943
0
}
944
945
static void *default_alloc(void *ctx, void *p, int n)
946
0
{
947
0
  if (n == 0) {
948
0
    free(p);
949
0
    return NULL;
950
0
  }
951
0
  return realloc(p, (size_t)n);
952
0
}
953
954
Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
955
0
{
956
0
  return regcompx(default_alloc, NULL, pattern, cflags, errorp);
957
0
}
958
959
void regfree(Reprog *prog)
960
0
{
961
0
  regfreex(default_alloc, NULL, prog);
962
0
}
963
964
/* Match */
965
966
static int isnewline(int c)
967
0
{
968
0
  return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
969
0
}
970
971
static int iswordchar(int c)
972
0
{
973
0
  return c == '_' ||
974
0
    (c >= 'a' && c <= 'z') ||
975
0
    (c >= 'A' && c <= 'Z') ||
976
0
    (c >= '0' && c <= '9');
977
0
}
978
979
static int incclass(Reclass *cc, Rune c)
980
0
{
981
0
  Rune *p;
982
0
  for (p = cc->spans; p < cc->end; p += 2)
983
0
    if (p[0] <= c && c <= p[1])
984
0
      return 1;
985
0
  return 0;
986
0
}
987
988
static int incclasscanon(Reclass *cc, Rune c)
989
0
{
990
0
  Rune *p, r;
991
0
  for (p = cc->spans; p < cc->end; p += 2)
992
0
    for (r = p[0]; r <= p[1]; ++r)
993
0
      if (c == canon(r))
994
0
        return 1;
995
0
  return 0;
996
0
}
997
998
static int strncmpcanon(const char *a, const char *b, int n)
999
0
{
1000
0
  Rune ra, rb;
1001
0
  int c;
1002
0
  while (n--) {
1003
0
    if (!*a) return -1;
1004
0
    if (!*b) return 1;
1005
0
    a += chartorune(&ra, a);
1006
0
    b += chartorune(&rb, b);
1007
0
    c = canon(ra) - canon(rb);
1008
0
    if (c)
1009
0
      return c;
1010
0
  }
1011
0
  return 0;
1012
0
}
1013
1014
static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
1015
0
{
1016
0
  Resub scratch;
1017
0
  int result;
1018
0
  int i;
1019
0
  Rune c;
1020
1021
  /* stack overflow */
1022
0
  if (depth > REG_MAXREC)
1023
0
    return -1;
1024
1025
0
  for (;;) {
1026
0
    switch (pc->opcode) {
1027
0
    case I_END:
1028
0
      return 0;
1029
0
    case I_JUMP:
1030
0
      pc = pc->x;
1031
0
      break;
1032
0
    case I_SPLIT:
1033
0
      scratch = *out;
1034
0
      result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1035
0
      if (result == -1)
1036
0
        return -1;
1037
0
      if (result == 0) {
1038
0
        *out = scratch;
1039
0
        return 0;
1040
0
      }
1041
0
      pc = pc->y;
1042
0
      break;
1043
1044
0
    case I_PLA:
1045
0
      result = match(pc->x, sp, bol, flags, out, depth+1);
1046
0
      if (result == -1)
1047
0
        return -1;
1048
0
      if (result == 1)
1049
0
        return 1;
1050
0
      pc = pc->y;
1051
0
      break;
1052
0
    case I_NLA:
1053
0
      scratch = *out;
1054
0
      result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1055
0
      if (result == -1)
1056
0
        return -1;
1057
0
      if (result == 0)
1058
0
        return 1;
1059
0
      pc = pc->y;
1060
0
      break;
1061
1062
0
    case I_ANYNL:
1063
0
      if (!*sp) return 1;
1064
0
      sp += chartorune(&c, sp);
1065
0
      pc = pc + 1;
1066
0
      break;
1067
0
    case I_ANY:
1068
0
      if (!*sp) return 1;
1069
0
      sp += chartorune(&c, sp);
1070
0
      if (isnewline(c))
1071
0
        return 1;
1072
0
      pc = pc + 1;
1073
0
      break;
1074
0
    case I_CHAR:
1075
0
      if (!*sp) return 1;
1076
0
      sp += chartorune(&c, sp);
1077
0
      if (flags & REG_ICASE)
1078
0
        c = canon(c);
1079
0
      if (c != pc->c)
1080
0
        return 1;
1081
0
      pc = pc + 1;
1082
0
      break;
1083
0
    case I_CCLASS:
1084
0
      if (!*sp) return 1;
1085
0
      sp += chartorune(&c, sp);
1086
0
      if (flags & REG_ICASE) {
1087
0
        if (!incclasscanon(pc->cc, canon(c)))
1088
0
          return 1;
1089
0
      } else {
1090
0
        if (!incclass(pc->cc, c))
1091
0
          return 1;
1092
0
      }
1093
0
      pc = pc + 1;
1094
0
      break;
1095
0
    case I_NCCLASS:
1096
0
      if (!*sp) return 1;
1097
0
      sp += chartorune(&c, sp);
1098
0
      if (flags & REG_ICASE) {
1099
0
        if (incclasscanon(pc->cc, canon(c)))
1100
0
          return 1;
1101
0
      } else {
1102
0
        if (incclass(pc->cc, c))
1103
0
          return 1;
1104
0
      }
1105
0
      pc = pc + 1;
1106
0
      break;
1107
0
    case I_REF:
1108
0
      i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1109
0
      if (flags & REG_ICASE) {
1110
0
        if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1111
0
          return 1;
1112
0
      } else {
1113
0
        if (strncmp(sp, out->sub[pc->n].sp, i))
1114
0
          return 1;
1115
0
      }
1116
0
      if (i > 0)
1117
0
        sp += i;
1118
0
      pc = pc + 1;
1119
0
      break;
1120
1121
0
    case I_BOL:
1122
0
      if (sp == bol && !(flags & REG_NOTBOL)) {
1123
0
        pc = pc + 1;
1124
0
        break;
1125
0
      }
1126
0
      if (flags & REG_NEWLINE) {
1127
0
        if (sp > bol && isnewline(sp[-1])) {
1128
0
          pc = pc + 1;
1129
0
          break;
1130
0
        }
1131
0
      }
1132
0
      return 1;
1133
0
    case I_EOL:
1134
0
      if (*sp == 0) {
1135
0
        pc = pc + 1;
1136
0
        break;
1137
0
      }
1138
0
      if (flags & REG_NEWLINE) {
1139
0
        if (isnewline(*sp)) {
1140
0
          pc = pc + 1;
1141
0
          break;
1142
0
        }
1143
0
      }
1144
0
      return 1;
1145
0
    case I_WORD:
1146
0
      i = sp > bol && iswordchar(sp[-1]);
1147
0
      i ^= iswordchar(sp[0]);
1148
0
      if (!i)
1149
0
        return 1;
1150
0
      pc = pc + 1;
1151
0
      break;
1152
0
    case I_NWORD:
1153
0
      i = sp > bol && iswordchar(sp[-1]);
1154
0
      i ^= iswordchar(sp[0]);
1155
0
      if (i)
1156
0
        return 1;
1157
0
      pc = pc + 1;
1158
0
      break;
1159
1160
0
    case I_LPAR:
1161
0
      out->sub[pc->n].sp = sp;
1162
0
      pc = pc + 1;
1163
0
      break;
1164
0
    case I_RPAR:
1165
0
      out->sub[pc->n].ep = sp;
1166
0
      pc = pc + 1;
1167
0
      break;
1168
0
    default:
1169
0
      return 1;
1170
0
    }
1171
0
  }
1172
0
}
1173
1174
int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1175
0
{
1176
0
  Resub scratch;
1177
0
  int i;
1178
1179
0
  if (!sub)
1180
0
    sub = &scratch;
1181
1182
0
  sub->nsub = prog->nsub;
1183
0
  for (i = 0; i < REG_MAXSUB; ++i)
1184
0
    sub->sub[i].sp = sub->sub[i].ep = NULL;
1185
1186
0
  return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
1187
0
}
1188
1189
#ifdef TEST
1190
int main(int argc, char **argv)
1191
{
1192
  const char *error;
1193
  const char *s;
1194
  Reprog *p;
1195
  Resub m;
1196
  int i;
1197
1198
  if (argc > 1) {
1199
    p = regcomp(argv[1], 0, &error);
1200
    if (!p) {
1201
      fprintf(stderr, "regcomp: %s\n", error);
1202
      return 1;
1203
    }
1204
1205
    if (argc > 2) {
1206
      s = argv[2];
1207
      printf("nsub = %d\n", p->nsub);
1208
      if (!regexec(p, s, &m, 0)) {
1209
        for (i = 0; i < m.nsub; ++i) {
1210
          int n = m.sub[i].ep - m.sub[i].sp;
1211
          if (n > 0)
1212
            printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1213
          else
1214
            printf("match %d: n=0 ''\n", i);
1215
        }
1216
      } else {
1217
        printf("no match\n");
1218
      }
1219
    }
1220
  }
1221
1222
  return 0;
1223
}
1224
#endif