Coverage Report

Created: 2025-06-15 06:31

/src/postgres/src/backend/regex/rege_dfa.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * DFA routines
3
 * This file is #included by regexec.c.
4
 *
5
 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6
 *
7
 * Development of this software was funded, in part, by Cray Research Inc.,
8
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9
 * Corporation, none of whom are responsible for the results.  The author
10
 * thanks all of them.
11
 *
12
 * Redistribution and use in source and binary forms -- with or without
13
 * modification -- are permitted for any purpose, provided that
14
 * redistributions in source form retain this entire copyright notice and
15
 * indicate the origin and nature of any modifications.
16
 *
17
 * I'd appreciate being given credit for this package in the documentation
18
 * of software which uses it, but that is not a requirement.
19
 *
20
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
 *
31
 * src/backend/regex/rege_dfa.c
32
 *
33
 */
34
35
/*
36
 * longest - longest-preferred matching engine
37
 *
38
 * On success, returns match endpoint address.  Returns NULL on no match.
39
 * Internal errors also return NULL, with v->err set.
40
 */
41
static chr *
42
longest(struct vars *v,
43
    struct dfa *d,
44
    chr *start,       /* where the match should start */
45
    chr *stop,        /* match must end at or before here */
46
    int *hitstopp)      /* record whether hit v->stop, if non-NULL */
47
0
{
48
0
  chr      *cp;
49
0
  chr      *realstop = (stop == v->stop) ? stop : stop + 1;
50
0
  color   co;
51
0
  struct sset *css;
52
0
  struct sset *ss;
53
0
  chr      *post;
54
0
  int     i;
55
0
  struct colormap *cm = d->cm;
56
57
  /* prevent "uninitialized variable" warnings */
58
0
  if (hitstopp != NULL)
59
0
    *hitstopp = 0;
60
61
  /* if this is a backref to a known string, just match against that */
62
0
  if (d->backno >= 0)
63
0
  {
64
0
    assert((size_t) d->backno < v->nmatch);
65
0
    if (v->pmatch[d->backno].rm_so >= 0)
66
0
    {
67
0
      cp = dfa_backref(v, d, start, start, stop, false);
68
0
      if (cp == v->stop && stop == v->stop && hitstopp != NULL)
69
0
        *hitstopp = 1;
70
0
      return cp;
71
0
    }
72
0
  }
73
74
  /* fast path for matchall NFAs */
75
0
  if (d->cnfa->flags & MATCHALL)
76
0
  {
77
0
    size_t    nchr = stop - start;
78
0
    size_t    maxmatchall = d->cnfa->maxmatchall;
79
80
0
    if (nchr < d->cnfa->minmatchall)
81
0
      return NULL;
82
0
    if (maxmatchall == DUPINF)
83
0
    {
84
0
      if (stop == v->stop && hitstopp != NULL)
85
0
        *hitstopp = 1;
86
0
    }
87
0
    else
88
0
    {
89
0
      if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
90
0
        *hitstopp = 1;
91
0
      if (nchr > maxmatchall)
92
0
        return start + maxmatchall;
93
0
    }
94
0
    return stop;
95
0
  }
96
97
  /* initialize */
98
0
  css = initialize(v, d, start);
99
0
  if (css == NULL)
100
0
    return NULL;
101
0
  cp = start;
102
103
  /* startup */
104
0
  FDEBUG(("+++ startup +++\n"));
105
0
  if (cp == v->start)
106
0
  {
107
0
    co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
108
0
    FDEBUG(("color %ld\n", (long) co));
109
0
  }
110
0
  else
111
0
  {
112
0
    co = GETCOLOR(cm, *(cp - 1));
113
0
    FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
114
0
  }
115
0
  css = miss(v, d, css, co, cp, start);
116
0
  if (css == NULL)
117
0
    return NULL;
118
0
  css->lastseen = cp;
119
120
  /*
121
   * This is the main text-scanning loop.  It seems worth having two copies
122
   * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
123
   * builds, when you're not actively tracing.
124
   */
125
#ifdef REG_DEBUG
126
  if (v->eflags & REG_FTRACE)
127
  {
128
    while (cp < realstop)
129
    {
130
      FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
131
      co = GETCOLOR(cm, *cp);
132
      FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
133
      ss = css->outs[co];
134
      if (ss == NULL)
135
      {
136
        ss = miss(v, d, css, co, cp + 1, start);
137
        if (ss == NULL)
138
          break;    /* NOTE BREAK OUT */
139
      }
140
      cp++;
141
      ss->lastseen = cp;
142
      css = ss;
143
    }
144
  }
145
  else
146
#endif
147
0
  {
148
0
    while (cp < realstop)
149
0
    {
150
0
      co = GETCOLOR(cm, *cp);
151
0
      ss = css->outs[co];
152
0
      if (ss == NULL)
153
0
      {
154
0
        ss = miss(v, d, css, co, cp + 1, start);
155
0
        if (ss == NULL)
156
0
          break;   /* NOTE BREAK OUT */
157
0
      }
158
0
      cp++;
159
0
      ss->lastseen = cp;
160
0
      css = ss;
161
0
    }
162
0
  }
163
164
0
  if (ISERR())
165
0
    return NULL;
166
167
  /* shutdown */
168
0
  FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
169
0
  if (cp == v->stop && stop == v->stop)
170
0
  {
171
0
    if (hitstopp != NULL)
172
0
      *hitstopp = 1;
173
0
    co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
174
0
    FDEBUG(("color %ld\n", (long) co));
175
0
    ss = miss(v, d, css, co, cp, start);
176
0
    if (ISERR())
177
0
      return NULL;
178
    /* special case:  match ended at eol? */
179
0
    if (ss != NULL && (ss->flags & POSTSTATE))
180
0
      return cp;
181
0
    else if (ss != NULL)
182
0
      ss->lastseen = cp; /* to be tidy */
183
0
  }
184
185
  /* find last match, if any */
186
0
  post = d->lastpost;
187
0
  for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
188
0
    if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
189
0
      (post == NULL || post < ss->lastseen))
190
0
      post = ss->lastseen;
191
0
  if (post != NULL)     /* found one */
192
0
    return post - 1;
193
194
0
  return NULL;
195
0
}
196
197
/*
198
 * shortest - shortest-preferred matching engine
199
 *
200
 * On success, returns match endpoint address.  Returns NULL on no match.
201
 * Internal errors also return NULL, with v->err set.
202
 */
203
static chr *
204
shortest(struct vars *v,
205
     struct dfa *d,
206
     chr *start,      /* where the match should start */
207
     chr *min,        /* match must end at or after here */
208
     chr *max,        /* match must end at or before here */
209
     chr **coldp,     /* store coldstart pointer here, if non-NULL */
210
     int *hitstopp)     /* record whether hit v->stop, if non-NULL */
211
0
{
212
0
  chr      *cp;
213
0
  chr      *realmin = (min == v->stop) ? min : min + 1;
214
0
  chr      *realmax = (max == v->stop) ? max : max + 1;
215
0
  color   co;
216
0
  struct sset *css;
217
0
  struct sset *ss;
218
0
  struct colormap *cm = d->cm;
219
220
  /* prevent "uninitialized variable" warnings */
221
0
  if (coldp != NULL)
222
0
    *coldp = NULL;
223
0
  if (hitstopp != NULL)
224
0
    *hitstopp = 0;
225
226
  /* if this is a backref to a known string, just match against that */
227
0
  if (d->backno >= 0)
228
0
  {
229
0
    assert((size_t) d->backno < v->nmatch);
230
0
    if (v->pmatch[d->backno].rm_so >= 0)
231
0
    {
232
0
      cp = dfa_backref(v, d, start, min, max, true);
233
0
      if (cp != NULL && coldp != NULL)
234
0
        *coldp = start;
235
      /* there is no case where we should set *hitstopp */
236
0
      return cp;
237
0
    }
238
0
  }
239
240
  /* fast path for matchall NFAs */
241
0
  if (d->cnfa->flags & MATCHALL)
242
0
  {
243
0
    size_t    nchr = min - start;
244
245
0
    if (d->cnfa->maxmatchall != DUPINF &&
246
0
      nchr > d->cnfa->maxmatchall)
247
0
      return NULL;
248
0
    if ((max - start) < d->cnfa->minmatchall)
249
0
      return NULL;
250
0
    if (nchr < d->cnfa->minmatchall)
251
0
      min = start + d->cnfa->minmatchall;
252
0
    if (coldp != NULL)
253
0
      *coldp = start;
254
    /* there is no case where we should set *hitstopp */
255
0
    return min;
256
0
  }
257
258
  /* initialize */
259
0
  css = initialize(v, d, start);
260
0
  if (css == NULL)
261
0
    return NULL;
262
0
  cp = start;
263
264
  /* startup */
265
0
  FDEBUG(("--- startup ---\n"));
266
0
  if (cp == v->start)
267
0
  {
268
0
    co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
269
0
    FDEBUG(("color %ld\n", (long) co));
270
0
  }
271
0
  else
272
0
  {
273
0
    co = GETCOLOR(cm, *(cp - 1));
274
0
    FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
275
0
  }
276
0
  css = miss(v, d, css, co, cp, start);
277
0
  if (css == NULL)
278
0
    return NULL;
279
0
  css->lastseen = cp;
280
0
  ss = css;
281
282
  /*
283
   * This is the main text-scanning loop.  It seems worth having two copies
284
   * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
285
   * builds, when you're not actively tracing.
286
   */
287
#ifdef REG_DEBUG
288
  if (v->eflags & REG_FTRACE)
289
  {
290
    while (cp < realmax)
291
    {
292
      FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
293
      co = GETCOLOR(cm, *cp);
294
      FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
295
      ss = css->outs[co];
296
      if (ss == NULL)
297
      {
298
        ss = miss(v, d, css, co, cp + 1, start);
299
        if (ss == NULL)
300
          break;    /* NOTE BREAK OUT */
301
      }
302
      cp++;
303
      ss->lastseen = cp;
304
      css = ss;
305
      if ((ss->flags & POSTSTATE) && cp >= realmin)
306
        break;      /* NOTE BREAK OUT */
307
    }
308
  }
309
  else
310
#endif
311
0
  {
312
0
    while (cp < realmax)
313
0
    {
314
0
      co = GETCOLOR(cm, *cp);
315
0
      ss = css->outs[co];
316
0
      if (ss == NULL)
317
0
      {
318
0
        ss = miss(v, d, css, co, cp + 1, start);
319
0
        if (ss == NULL)
320
0
          break;   /* NOTE BREAK OUT */
321
0
      }
322
0
      cp++;
323
0
      ss->lastseen = cp;
324
0
      css = ss;
325
0
      if ((ss->flags & POSTSTATE) && cp >= realmin)
326
0
        break;     /* NOTE BREAK OUT */
327
0
    }
328
0
  }
329
330
0
  if (ss == NULL)
331
0
    return NULL;
332
333
0
  if (coldp != NULL)     /* report last no-progress state set, if any */
334
0
    *coldp = lastcold(v, d);
335
336
0
  if ((ss->flags & POSTSTATE) && cp > min)
337
0
  {
338
0
    assert(cp >= realmin);
339
0
    cp--;
340
0
  }
341
0
  else if (cp == v->stop && max == v->stop)
342
0
  {
343
0
    co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
344
0
    FDEBUG(("color %ld\n", (long) co));
345
0
    ss = miss(v, d, css, co, cp, start);
346
    /* match might have ended at eol */
347
0
    if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
348
0
      *hitstopp = 1;
349
0
  }
350
351
0
  if (ss == NULL || !(ss->flags & POSTSTATE))
352
0
    return NULL;
353
354
0
  return cp;
355
0
}
356
357
/*
358
 * matchuntil - incremental matching engine
359
 *
360
 * This is meant for use with a search-style NFA (that is, the pattern is
361
 * known to act as though it had a leading .*).  We determine whether a
362
 * match exists starting at v->start and ending at probe.  Multiple calls
363
 * require only O(N) time not O(N^2) so long as the probe values are
364
 * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
365
 * starting a series of calls.
366
 *
367
 * Returns 1 if a match exists, 0 if not.
368
 * Internal errors also return 0, with v->err set.
369
 */
370
static int
371
matchuntil(struct vars *v,
372
       struct dfa *d,
373
       chr *probe,      /* we want to know if a match ends here */
374
       struct sset **lastcss, /* state storage across calls */
375
       chr **lastcp)    /* state storage across calls */
376
0
{
377
0
  chr      *cp = *lastcp;
378
0
  color   co;
379
0
  struct sset *css = *lastcss;
380
0
  struct sset *ss;
381
0
  struct colormap *cm = d->cm;
382
383
  /* fast path for matchall NFAs */
384
0
  if (d->cnfa->flags & MATCHALL)
385
0
  {
386
0
    size_t    nchr = probe - v->start;
387
388
0
    if (nchr < d->cnfa->minmatchall)
389
0
      return 0;
390
    /* maxmatchall will always be infinity, cf. makesearch() */
391
0
    assert(d->cnfa->maxmatchall == DUPINF);
392
0
    return 1;
393
0
  }
394
395
  /* initialize and startup, or restart, if necessary */
396
0
  if (cp == NULL || cp > probe)
397
0
  {
398
0
    cp = v->start;
399
0
    css = initialize(v, d, cp);
400
0
    if (css == NULL)
401
0
      return 0;
402
403
0
    FDEBUG((">>> startup >>>\n"));
404
0
    co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
405
0
    FDEBUG(("color %ld\n", (long) co));
406
407
0
    css = miss(v, d, css, co, cp, v->start);
408
0
    if (css == NULL)
409
0
      return 0;
410
0
    css->lastseen = cp;
411
0
  }
412
0
  else if (css == NULL)
413
0
  {
414
    /* we previously found that no match is possible beyond *lastcp */
415
0
    return 0;
416
0
  }
417
0
  ss = css;
418
419
  /*
420
   * This is the main text-scanning loop.  It seems worth having two copies
421
   * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
422
   * builds, when you're not actively tracing.
423
   */
424
#ifdef REG_DEBUG
425
  if (v->eflags & REG_FTRACE)
426
  {
427
    while (cp < probe)
428
    {
429
      FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
430
      co = GETCOLOR(cm, *cp);
431
      FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
432
      ss = css->outs[co];
433
      if (ss == NULL)
434
      {
435
        ss = miss(v, d, css, co, cp + 1, v->start);
436
        if (ss == NULL)
437
          break;    /* NOTE BREAK OUT */
438
      }
439
      cp++;
440
      ss->lastseen = cp;
441
      css = ss;
442
    }
443
  }
444
  else
445
#endif
446
0
  {
447
0
    while (cp < probe)
448
0
    {
449
0
      co = GETCOLOR(cm, *cp);
450
0
      ss = css->outs[co];
451
0
      if (ss == NULL)
452
0
      {
453
0
        ss = miss(v, d, css, co, cp + 1, v->start);
454
0
        if (ss == NULL)
455
0
          break;   /* NOTE BREAK OUT */
456
0
      }
457
0
      cp++;
458
0
      ss->lastseen = cp;
459
0
      css = ss;
460
0
    }
461
0
  }
462
463
0
  *lastcss = ss;
464
0
  *lastcp = cp;
465
466
0
  if (ss == NULL)
467
0
    return 0;       /* impossible match, or internal error */
468
469
  /* We need to process one more chr, or the EOS symbol, to check match */
470
0
  if (cp < v->stop)
471
0
  {
472
0
    FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
473
0
    co = GETCOLOR(cm, *cp);
474
0
    FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
475
0
    ss = css->outs[co];
476
0
    if (ss == NULL)
477
0
      ss = miss(v, d, css, co, cp + 1, v->start);
478
0
  }
479
0
  else
480
0
  {
481
0
    assert(cp == v->stop);
482
0
    co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
483
0
    FDEBUG(("color %ld\n", (long) co));
484
0
    ss = miss(v, d, css, co, cp, v->start);
485
0
  }
486
487
0
  if (ss == NULL || !(ss->flags & POSTSTATE))
488
0
    return 0;
489
490
0
  return 1;
491
0
}
492
493
/*
494
 * dfa_backref - find best match length for a known backref string
495
 *
496
 * When the backref's referent is already available, we can deliver an exact
497
 * answer with considerably less work than running the backref node's NFA.
498
 *
499
 * Return match endpoint for longest or shortest valid repeated match,
500
 * or NULL if there is no valid match.
501
 *
502
 * Should be in sync with cbrdissect(), although that has the different task
503
 * of checking a match to a predetermined section of the string.
504
 */
505
static chr *
506
dfa_backref(struct vars *v,
507
      struct dfa *d,
508
      chr *start,     /* where the match should start */
509
      chr *min,     /* match must end at or after here */
510
      chr *max,     /* match must end at or before here */
511
      bool shortest)
512
0
{
513
0
  int     n = d->backno;
514
0
  int     backmin = d->backmin;
515
0
  int     backmax = d->backmax;
516
0
  size_t    numreps;
517
0
  size_t    minreps;
518
0
  size_t    maxreps;
519
0
  size_t    brlen;
520
0
  chr      *brstring;
521
0
  chr      *p;
522
523
  /* get the backreferenced string (caller should have checked this) */
524
0
  if (v->pmatch[n].rm_so == -1)
525
0
    return NULL;
526
0
  brstring = v->start + v->pmatch[n].rm_so;
527
0
  brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
528
529
  /* special-case zero-length backreference to avoid divide by zero */
530
0
  if (brlen == 0)
531
0
  {
532
    /*
533
     * matches only a zero-length string, but any number of repetitions
534
     * can be considered to be present
535
     */
536
0
    if (min == start && backmin <= backmax)
537
0
      return start;
538
0
    return NULL;
539
0
  }
540
541
  /*
542
   * convert min and max into numbers of possible repetitions of the backref
543
   * string, rounding appropriately
544
   */
545
0
  if (min <= start)
546
0
    minreps = 0;
547
0
  else
548
0
    minreps = (min - start - 1) / brlen + 1;
549
0
  maxreps = (max - start) / brlen;
550
551
  /* apply bounds, then see if there is any allowed match length */
552
0
  if (minreps < backmin)
553
0
    minreps = backmin;
554
0
  if (backmax != DUPINF && maxreps > backmax)
555
0
    maxreps = backmax;
556
0
  if (maxreps < minreps)
557
0
    return NULL;
558
559
  /* quick exit if zero-repetitions match is valid and preferred */
560
0
  if (shortest && minreps == 0)
561
0
    return start;
562
563
  /* okay, compare the actual string contents */
564
0
  p = start;
565
0
  numreps = 0;
566
0
  while (numreps < maxreps)
567
0
  {
568
0
    if ((*v->g->compare) (brstring, p, brlen) != 0)
569
0
      break;
570
0
    p += brlen;
571
0
    numreps++;
572
0
    if (shortest && numreps >= minreps)
573
0
      break;
574
0
  }
575
576
0
  if (numreps >= minreps)
577
0
    return p;
578
0
  return NULL;
579
0
}
580
581
/*
582
 * lastcold - determine last point at which no progress had been made
583
 */
584
static chr *          /* endpoint, or NULL */
585
lastcold(struct vars *v,
586
     struct dfa *d)
587
0
{
588
0
  struct sset *ss;
589
0
  chr      *nopr;
590
0
  int     i;
591
592
0
  nopr = d->lastnopr;
593
0
  if (nopr == NULL)
594
0
    nopr = v->start;
595
0
  for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
596
0
    if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
597
0
      nopr = ss->lastseen;
598
0
  return nopr;
599
0
}
600
601
/*
602
 * newdfa - set up a fresh DFA
603
 *
604
 * Returns NULL (and sets v->err) on failure.
605
 */
606
static struct dfa *
607
newdfa(struct vars *v,
608
     struct cnfa *cnfa,
609
     struct colormap *cm,
610
     struct smalldfa *sml)  /* preallocated space, may be NULL */
611
0
{
612
0
  struct dfa *d;
613
0
  size_t    nss = cnfa->nstates * 2;
614
0
  int     wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
615
0
  bool    ismalloced = false;
616
617
0
  assert(cnfa != NULL && cnfa->nstates != 0);
618
619
0
  if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
620
0
  {
621
0
    assert(wordsper == 1);
622
0
    if (sml == NULL)
623
0
    {
624
0
      sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
625
0
      if (sml == NULL)
626
0
      {
627
0
        ERR(REG_ESPACE);
628
0
        return NULL;
629
0
      }
630
0
      ismalloced = true;
631
0
    }
632
0
    d = &sml->dfa;
633
0
    d->ssets = sml->ssets;
634
0
    d->statesarea = sml->statesarea;
635
0
    d->work = &d->statesarea[nss];
636
0
    d->outsarea = sml->outsarea;
637
0
    d->incarea = sml->incarea;
638
0
    d->ismalloced = ismalloced;
639
0
    d->arraysmalloced = false;  /* not separately allocated, anyway */
640
0
  }
641
0
  else
642
0
  {
643
0
    d = (struct dfa *) MALLOC(sizeof(struct dfa));
644
0
    if (d == NULL)
645
0
    {
646
0
      ERR(REG_ESPACE);
647
0
      return NULL;
648
0
    }
649
0
    d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
650
0
    d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
651
0
                      sizeof(unsigned));
652
0
    d->work = &d->statesarea[nss * wordsper];
653
0
    d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
654
0
                        sizeof(struct sset *));
655
0
    d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
656
0
                      sizeof(struct arcp));
657
0
    d->ismalloced = true;
658
0
    d->arraysmalloced = true;
659
    /* now freedfa() will behave sanely */
660
0
    if (d->ssets == NULL || d->statesarea == NULL ||
661
0
      d->outsarea == NULL || d->incarea == NULL)
662
0
    {
663
0
      freedfa(d);
664
0
      ERR(REG_ESPACE);
665
0
      return NULL;
666
0
    }
667
0
  }
668
669
0
  d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
670
0
  d->nssused = 0;
671
0
  d->nstates = cnfa->nstates;
672
0
  d->ncolors = cnfa->ncolors;
673
0
  d->wordsper = wordsper;
674
0
  d->cnfa = cnfa;
675
0
  d->cm = cm;
676
0
  d->lastpost = NULL;
677
0
  d->lastnopr = NULL;
678
0
  d->search = d->ssets;
679
0
  d->backno = -1;       /* may be set by caller */
680
0
  d->backmin = d->backmax = 0;
681
682
  /* initialization of sset fields is done as needed */
683
684
0
  return d;
685
0
}
686
687
/*
688
 * freedfa - free a DFA
689
 */
690
static void
691
freedfa(struct dfa *d)
692
0
{
693
0
  if (d->arraysmalloced)
694
0
  {
695
0
    if (d->ssets != NULL)
696
0
      FREE(d->ssets);
697
0
    if (d->statesarea != NULL)
698
0
      FREE(d->statesarea);
699
0
    if (d->outsarea != NULL)
700
0
      FREE(d->outsarea);
701
0
    if (d->incarea != NULL)
702
0
      FREE(d->incarea);
703
0
  }
704
705
0
  if (d->ismalloced)
706
0
    FREE(d);
707
0
}
708
709
/*
710
 * hash - construct a hash code for a bitvector
711
 *
712
 * There are probably better ways, but they're more expensive.
713
 */
714
static unsigned
715
hash(unsigned *uv,
716
   int n)
717
0
{
718
0
  int     i;
719
0
  unsigned  h;
720
721
0
  h = 0;
722
0
  for (i = 0; i < n; i++)
723
0
    h ^= uv[i];
724
0
  return h;
725
0
}
726
727
/*
728
 * initialize - hand-craft a cache entry for startup, otherwise get ready
729
 */
730
static struct sset *
731
initialize(struct vars *v,
732
       struct dfa *d,
733
       chr *start)
734
0
{
735
0
  struct sset *ss;
736
0
  int     i;
737
738
  /* is previous one still there? */
739
0
  if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
740
0
    ss = &d->ssets[0];
741
0
  else
742
0
  {             /* no, must (re)build it */
743
0
    ss = getvacant(v, d, start, start);
744
0
    if (ss == NULL)
745
0
      return NULL;
746
0
    for (i = 0; i < d->wordsper; i++)
747
0
      ss->states[i] = 0;
748
0
    BSET(ss->states, d->cnfa->pre);
749
0
    ss->hash = HASH(ss->states, d->wordsper);
750
0
    assert(d->cnfa->pre != d->cnfa->post);
751
0
    ss->flags = STARTER | LOCKED | NOPROGRESS;
752
    /* lastseen dealt with below */
753
0
  }
754
755
0
  for (i = 0; i < d->nssused; i++)
756
0
    d->ssets[i].lastseen = NULL;
757
0
  ss->lastseen = start;   /* maybe untrue, but harmless */
758
0
  d->lastpost = NULL;
759
0
  d->lastnopr = NULL;
760
0
  return ss;
761
0
}
762
763
/*
764
 * miss - handle a stateset cache miss
765
 *
766
 * css is the current stateset, co is the color of the current input character,
767
 * cp points to the character after that (which is where we may need to test
768
 * LACONs).  start does not affect matching behavior but is needed for pickss'
769
 * heuristics about which stateset cache entry to replace.
770
 *
771
 * Ordinarily, returns the address of the next stateset (the one that is
772
 * valid after consuming the input character).  Returns NULL if no valid
773
 * NFA states remain, ie we have a certain match failure.
774
 * Internal errors also return NULL, with v->err set.
775
 */
776
static struct sset *
777
miss(struct vars *v,
778
   struct dfa *d,
779
   struct sset *css,
780
   color co,
781
   chr *cp,         /* next chr */
782
   chr *start)        /* where the attempt got started */
783
0
{
784
0
  struct cnfa *cnfa = d->cnfa;
785
0
  int     i;
786
0
  unsigned  h;
787
0
  struct carc *ca;
788
0
  struct sset *p;
789
0
  int     ispseudocolor;
790
0
  int     ispost;
791
0
  int     noprogress;
792
0
  int     gotstate;
793
0
  int     dolacons;
794
0
  int     sawlacons;
795
796
  /* for convenience, we can be called even if it might not be a miss */
797
0
  if (css->outs[co] != NULL)
798
0
  {
799
0
    FDEBUG(("hit\n"));
800
0
    return css->outs[co];
801
0
  }
802
0
  FDEBUG(("miss\n"));
803
804
  /*
805
   * Checking for operation cancel in the inner text search loop seems
806
   * unduly expensive.  As a compromise, check during cache misses.
807
   */
808
0
  INTERRUPT(v->re);
809
810
  /*
811
   * What set of states would we end up in after consuming the co character?
812
   * We first consider PLAIN arcs that consume the character, and then look
813
   * to see what LACON arcs could be traversed after consuming it.
814
   */
815
0
  for (i = 0; i < d->wordsper; i++)
816
0
    d->work[i] = 0;     /* build new stateset bitmap in d->work */
817
0
  ispseudocolor = d->cm->cd[co].flags & PSEUDO;
818
0
  ispost = 0;
819
0
  noprogress = 1;
820
0
  gotstate = 0;
821
0
  for (i = 0; i < d->nstates; i++)
822
0
    if (ISBSET(css->states, i))
823
0
      for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
824
0
        if (ca->co == co ||
825
0
          (ca->co == RAINBOW && !ispseudocolor))
826
0
        {
827
0
          BSET(d->work, ca->to);
828
0
          gotstate = 1;
829
0
          if (ca->to == cnfa->post)
830
0
            ispost = 1;
831
0
          if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
832
0
            noprogress = 0;
833
0
          FDEBUG(("%d -> %d\n", i, ca->to));
834
0
        }
835
0
  if (!gotstate)
836
0
    return NULL;     /* character cannot reach any new state */
837
0
  dolacons = (cnfa->flags & HASLACONS);
838
0
  sawlacons = 0;
839
  /* outer loop handles transitive closure of reachable-by-LACON states */
840
0
  while (dolacons)
841
0
  {
842
0
    dolacons = 0;
843
0
    for (i = 0; i < d->nstates; i++)
844
0
      if (ISBSET(d->work, i))
845
0
        for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
846
0
        {
847
0
          if (ca->co < cnfa->ncolors)
848
0
            continue; /* not a LACON arc */
849
0
          if (ISBSET(d->work, ca->to))
850
0
            continue; /* arc would be a no-op anyway */
851
0
          sawlacons = 1;  /* this LACON affects our result */
852
0
          if (!lacon(v, cnfa, cp, ca->co))
853
0
          {
854
0
            if (ISERR())
855
0
              return NULL;
856
0
            continue; /* LACON arc cannot be traversed */
857
0
          }
858
0
          if (ISERR())
859
0
            return NULL;
860
0
          BSET(d->work, ca->to);
861
0
          dolacons = 1;
862
0
          if (ca->to == cnfa->post)
863
0
            ispost = 1;
864
0
          if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
865
0
            noprogress = 0;
866
0
          FDEBUG(("%d :> %d\n", i, ca->to));
867
0
        }
868
0
  }
869
0
  h = HASH(d->work, d->wordsper);
870
871
  /* Is this stateset already in the cache? */
872
0
  for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
873
0
    if (HIT(h, d->work, p, d->wordsper))
874
0
    {
875
0
      FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
876
0
      break;        /* NOTE BREAK OUT */
877
0
    }
878
0
  if (i == 0)
879
0
  {             /* nope, need a new cache entry */
880
0
    p = getvacant(v, d, cp, start);
881
0
    if (p == NULL)
882
0
      return NULL;
883
0
    assert(p != css);
884
0
    for (i = 0; i < d->wordsper; i++)
885
0
      p->states[i] = d->work[i];
886
0
    p->hash = h;
887
0
    p->flags = (ispost) ? POSTSTATE : 0;
888
0
    if (noprogress)
889
0
      p->flags |= NOPROGRESS;
890
    /* lastseen to be dealt with by caller */
891
0
  }
892
893
  /*
894
   * Link new stateset to old, unless a LACON affected the result, in which
895
   * case we don't create the link.  That forces future transitions across
896
   * this same arc (same prior stateset and character color) to come through
897
   * miss() again, so that we can recheck the LACON(s), which might or might
898
   * not pass since context will be different.
899
   */
900
0
  if (!sawlacons)
901
0
  {
902
0
    FDEBUG(("c%d[%d]->c%d\n",
903
0
        (int) (css - d->ssets), co, (int) (p - d->ssets)));
904
0
    css->outs[co] = p;
905
0
    css->inchain[co] = p->ins;
906
0
    p->ins.ss = css;
907
0
    p->ins.co = co;
908
0
  }
909
0
  return p;
910
0
}
911
912
/*
913
 * lacon - lookaround-constraint checker for miss()
914
 */
915
static int            /* predicate:  constraint satisfied? */
916
lacon(struct vars *v,
917
    struct cnfa *pcnfa,   /* parent cnfa */
918
    chr *cp,
919
    color co)         /* "color" of the lookaround constraint */
920
0
{
921
0
  int     n;
922
0
  struct subre *sub;
923
0
  struct dfa *d;
924
0
  chr      *end;
925
0
  int     satisfied;
926
927
  /* Since this is recursive, it could be driven to stack overflow */
928
0
  if (STACK_TOO_DEEP(v->re))
929
0
  {
930
0
    ERR(REG_ETOOBIG);
931
0
    return 0;
932
0
  }
933
934
0
  n = co - pcnfa->ncolors;
935
0
  assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
936
0
  FDEBUG(("=== testing lacon %d\n", n));
937
0
  sub = &v->g->lacons[n];
938
0
  d = getladfa(v, n);
939
0
  if (d == NULL)
940
0
    return 0;
941
0
  if (LATYPE_IS_AHEAD(sub->latype))
942
0
  {
943
    /* used to use longest() here, but shortest() could be much cheaper */
944
0
    end = shortest(v, d, cp, cp, v->stop,
945
0
             (chr **) NULL, (int *) NULL);
946
0
    satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
947
0
  }
948
0
  else
949
0
  {
950
    /*
951
     * To avoid doing O(N^2) work when repeatedly testing a lookbehind
952
     * constraint in an N-character string, we use matchuntil() which can
953
     * cache the DFA state across calls.  We only need to restart if the
954
     * probe point decreases, which is not common.  The NFA we're using is
955
     * a search NFA, so it doesn't mind scanning over stuff before the
956
     * nominal match.
957
     */
958
0
    satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
959
0
    if (!LATYPE_IS_POS(sub->latype))
960
0
      satisfied = !satisfied;
961
0
  }
962
0
  FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
963
0
  return satisfied;
964
0
}
965
966
/*
967
 * getvacant - get a vacant state set
968
 *
969
 * This routine clears out the inarcs and outarcs, but does not otherwise
970
 * clear the innards of the state set -- that's up to the caller.
971
 */
972
static struct sset *
973
getvacant(struct vars *v,
974
      struct dfa *d,
975
      chr *cp,
976
      chr *start)
977
0
{
978
0
  int     i;
979
0
  struct sset *ss;
980
0
  struct sset *p;
981
0
  struct arcp ap;
982
0
  color   co;
983
984
0
  ss = pickss(v, d, cp, start);
985
0
  if (ss == NULL)
986
0
    return NULL;
987
0
  assert(!(ss->flags & LOCKED));
988
989
  /* clear out its inarcs, including self-referential ones */
990
0
  ap = ss->ins;
991
0
  while ((p = ap.ss) != NULL)
992
0
  {
993
0
    co = ap.co;
994
0
    FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
995
0
    p->outs[co] = NULL;
996
0
    ap = p->inchain[co];
997
0
    p->inchain[co].ss = NULL; /* paranoia */
998
0
  }
999
0
  ss->ins.ss = NULL;
1000
1001
  /* take it off the inarc chains of the ssets reached by its outarcs */
1002
0
  for (i = 0; i < d->ncolors; i++)
1003
0
  {
1004
0
    p = ss->outs[i];
1005
0
    assert(p != ss);    /* not self-referential */
1006
0
    if (p == NULL)
1007
0
      continue;     /* NOTE CONTINUE */
1008
0
    FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
1009
0
    if (p->ins.ss == ss && p->ins.co == i)
1010
0
      p->ins = ss->inchain[i];
1011
0
    else
1012
0
    {
1013
0
      struct arcp lastap = {NULL, 0};
1014
1015
0
      assert(p->ins.ss != NULL);
1016
0
      for (ap = p->ins; ap.ss != NULL &&
1017
0
         !(ap.ss == ss && ap.co == i);
1018
0
         ap = ap.ss->inchain[ap.co])
1019
0
        lastap = ap;
1020
0
      assert(ap.ss != NULL);
1021
0
      lastap.ss->inchain[lastap.co] = ss->inchain[i];
1022
0
    }
1023
0
    ss->outs[i] = NULL;
1024
0
    ss->inchain[i].ss = NULL;
1025
0
  }
1026
1027
  /* if ss was a success state, may need to remember location */
1028
0
  if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
1029
0
    (d->lastpost == NULL || d->lastpost < ss->lastseen))
1030
0
    d->lastpost = ss->lastseen;
1031
1032
  /* likewise for a no-progress state */
1033
0
  if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
1034
0
    (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
1035
0
    d->lastnopr = ss->lastseen;
1036
1037
0
  return ss;
1038
0
}
1039
1040
/*
1041
 * pickss - pick the next stateset to be used
1042
 */
1043
static struct sset *
1044
pickss(struct vars *v,
1045
     struct dfa *d,
1046
     chr *cp,
1047
     chr *start)
1048
0
{
1049
0
  int     i;
1050
0
  struct sset *ss;
1051
0
  struct sset *end;
1052
0
  chr      *ancient;
1053
1054
  /* shortcut for cases where cache isn't full */
1055
0
  if (d->nssused < d->nssets)
1056
0
  {
1057
0
    i = d->nssused;
1058
0
    d->nssused++;
1059
0
    ss = &d->ssets[i];
1060
0
    FDEBUG(("new c%d\n", i));
1061
    /* set up innards */
1062
0
    ss->states = &d->statesarea[i * d->wordsper];
1063
0
    ss->flags = 0;
1064
0
    ss->ins.ss = NULL;
1065
0
    ss->ins.co = WHITE;   /* give it some value */
1066
0
    ss->outs = &d->outsarea[i * d->ncolors];
1067
0
    ss->inchain = &d->incarea[i * d->ncolors];
1068
0
    for (i = 0; i < d->ncolors; i++)
1069
0
    {
1070
0
      ss->outs[i] = NULL;
1071
0
      ss->inchain[i].ss = NULL;
1072
0
    }
1073
0
    return ss;
1074
0
  }
1075
1076
  /* look for oldest, or old enough anyway */
1077
0
  if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
1078
0
    ancient = cp - d->nssets * 2 / 3;
1079
0
  else
1080
0
    ancient = start;
1081
0
  for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
1082
0
    if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
1083
0
      !(ss->flags & LOCKED))
1084
0
    {
1085
0
      d->search = ss + 1;
1086
0
      FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1087
0
      return ss;
1088
0
    }
1089
0
  for (ss = d->ssets, end = d->search; ss < end; ss++)
1090
0
    if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
1091
0
      !(ss->flags & LOCKED))
1092
0
    {
1093
0
      d->search = ss + 1;
1094
0
      FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1095
0
      return ss;
1096
0
    }
1097
1098
  /* nobody's old enough?!? -- something's really wrong */
1099
0
  FDEBUG(("cannot find victim to replace!\n"));
1100
0
  ERR(REG_ASSERT);
1101
0
  return NULL;
1102
0
}