Coverage Report

Created: 2026-06-13 07:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/pcre/pcre2lib/pcre2_study.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2023 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains functions for scanning a compiled pattern and
42
collecting data (e.g. minimum matching length). */
43
44
45
#ifdef HAVE_CONFIG_H
46
#include "config.h"
47
#endif
48
49
#include "pcre2_internal.h"
50
51
/* The maximum remembered capturing brackets minimum. */
52
53
1.58k
#define MAX_CACHE_BACKREF 128
54
55
/* Set a bit in the starting code unit bit map. */
56
57
16.6k
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58
59
/* Returns from set_start_bits() */
60
61
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62
63
64
/*************************************************
65
*   Find the minimum subject length for a group  *
66
*************************************************/
67
68
/* Scan a parenthesized group and compute the minimum length of subject that
69
is needed to match it. This is a lower bound; it does not mean there is a
70
string of that length that matches. In UTF mode, the result is in characters
71
rather than code units. The field in a compiled pattern for storing the minimum
72
length is 16-bits long (on the grounds that anything longer than that is
73
pathological), so we give up when we reach that amount. This also means that
74
integer overflow for really crazy patterns cannot happen.
75
76
Backreference minimum lengths are cached to speed up multiple references. This
77
function is called only when the highest back reference in the pattern is less
78
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79
caching vector. The zeroth element contains the number of the highest set
80
value.
81
82
Arguments:
83
  re              compiled pattern block
84
  code            pointer to start of group (the bracket)
85
  startcode       pointer to start of the whole pattern's code
86
  utf             UTF flag
87
  recurses        chain of recurse_check to catch mutual recursion
88
  countptr        pointer to call count (to catch over complexity)
89
  backref_cache   vector for caching back references.
90
91
This function is no longer called when the pattern contains (*ACCEPT); however,
92
the old code for returning -1 is retained, just in case.
93
94
Returns:   the minimum length
95
           -1 \C in UTF-8 mode
96
              or (*ACCEPT)
97
              or pattern too complicated
98
           -2 internal error (missing capturing bracket)
99
           -3 internal error (opcode not listed)
100
*/
101
102
static int
103
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104
  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105
  int *backref_cache)
106
4.06k
{
107
4.06k
int length = -1;
108
4.06k
int branchlength = 0;
109
4.06k
int prev_cap_recno = -1;
110
4.06k
int prev_cap_d = 0;
111
4.06k
int prev_recurse_recno = -1;
112
4.06k
int prev_recurse_d = 0;
113
4.06k
uint32_t once_fudge = 0;
114
4.06k
BOOL had_recurse = FALSE;
115
4.06k
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116
4.06k
PCRE2_SPTR nextbranch = code + GET(code, 1);
117
4.06k
PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
118
4.06k
recurse_check this_recurse;
119
120
/* If this is a "could be empty" group, its minimum length is 0. */
121
122
4.06k
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123
124
/* Skip over capturing bracket number */
125
126
3.28k
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127
128
/* A large and/or complex regex can take too long to process. */
129
130
3.28k
if ((*countptr)++ > 1000) return -1;
131
132
/* Scan along the opcodes for this branch. If we get to the end of the branch,
133
check the length against that of the other branches. If the accumulated length
134
passes 16-bits, reset to that value and skip the rest of the branch. */
135
136
3.28k
for (;;)
137
324k
  {
138
324k
  int d, min, recno;
139
324k
  PCRE2_UCHAR op, *cs, *ce;
140
141
324k
  if (branchlength >= UINT16_MAX)
142
0
    {
143
0
    branchlength = UINT16_MAX;
144
0
    cc = (PCRE2_UCHAR *)nextbranch;
145
0
    }
146
147
324k
  op = *cc;
148
324k
  switch (op)
149
324k
    {
150
0
    case OP_COND:
151
0
    case OP_SCOND:
152
153
    /* If there is only one branch in a condition, the implied branch has zero
154
    length, so we don't add anything. This covers the DEFINE "condition"
155
    automatically. If there are two branches we can treat it the same as any
156
    other non-capturing subpattern. */
157
158
0
    cs = cc + GET(cc, 1);
159
0
    if (*cs != OP_ALT)
160
0
      {
161
0
      cc = cs + 1 + LINK_SIZE;
162
0
      break;
163
0
      }
164
0
    goto PROCESS_NON_CAPTURE;
165
166
38
    case OP_BRA:
167
    /* There's a special case of OP_BRA, when it is wrapped round a repeated
168
    OP_RECURSE. We'd like to process the latter at this level so that
169
    remembering the value works for repeated cases. So we do nothing, but
170
    set a fudge value to skip over the OP_KET after the recurse. */
171
172
38
    if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173
0
      {
174
0
      once_fudge = 1 + LINK_SIZE;
175
0
      cc += 1 + LINK_SIZE;
176
0
      break;
177
0
      }
178
    /* Fall through */
179
180
45
    case OP_ONCE:
181
45
    case OP_SCRIPT_RUN:
182
46
    case OP_SBRA:
183
46
    case OP_BRAPOS:
184
47
    case OP_SBRAPOS:
185
47
    PROCESS_NON_CAPTURE:
186
47
    d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187
47
      backref_cache);
188
47
    if (d < 0) return d;
189
47
    branchlength += d;
190
255
    do cc += GET(cc, 1); while (*cc == OP_ALT);
191
47
    cc += 1 + LINK_SIZE;
192
47
    break;
193
194
    /* To save time for repeated capturing subpatterns, we remember the
195
    length of the previous one. Unfortunately we can't do the same for
196
    the unnumbered ones above. Nor can we do this if (?| is present in the
197
    pattern because captures with the same number are not then identical. */
198
199
1.40k
    case OP_CBRA:
200
1.72k
    case OP_SCBRA:
201
1.86k
    case OP_CBRAPOS:
202
2.23k
    case OP_SCBRAPOS:
203
2.23k
    recno = (int)GET2(cc, 1+LINK_SIZE);
204
2.23k
    if (dupcapused || recno != prev_cap_recno)
205
2.23k
      {
206
2.23k
      prev_cap_recno = recno;
207
2.23k
      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208
2.23k
        backref_cache);
209
2.23k
      if (prev_cap_d < 0) return prev_cap_d;
210
2.23k
      }
211
2.23k
    branchlength += prev_cap_d;
212
2.87k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
213
2.23k
    cc += 1 + LINK_SIZE;
214
2.23k
    break;
215
216
    /* ACCEPT makes things far too complicated; we have to give up. In fact,
217
    from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218
    used. However, leave the code in place, just in case. */
219
220
0
    case OP_ACCEPT:
221
0
    case OP_ASSERT_ACCEPT:
222
0
    return -1;
223
224
    /* Reached end of a branch; if it's a ket it is the end of a nested
225
    call. If it's ALT it is an alternation in a nested call. If it is END it's
226
    the end of the outer call. All can be handled by the same code. If the
227
    length of any branch is zero, there is no need to scan any subsequent
228
    branches. */
229
230
9.90k
    case OP_ALT:
231
12.9k
    case OP_KET:
232
12.9k
    case OP_KETRMAX:
233
12.9k
    case OP_KETRMIN:
234
13.1k
    case OP_KETRPOS:
235
13.1k
    case OP_END:
236
13.1k
    if (length < 0 || (!had_recurse && branchlength < length))
237
4.79k
      length = branchlength;
238
13.1k
    if (op != OP_ALT || length == 0) return length;
239
9.88k
    nextbranch = cc + GET(cc, 1);
240
9.88k
    cc += 1 + LINK_SIZE;
241
9.88k
    branchlength = 0;
242
9.88k
    had_recurse = FALSE;
243
9.88k
    break;
244
245
    /* Skip over assertive subpatterns */
246
247
20
    case OP_ASSERT:
248
23
    case OP_ASSERT_NOT:
249
27
    case OP_ASSERTBACK:
250
27
    case OP_ASSERTBACK_NOT:
251
45
    case OP_ASSERT_NA:
252
54
    case OP_ASSERTBACK_NA:
253
248
    do cc += GET(cc, 1); while (*cc == OP_ALT);
254
    /* Fall through */
255
256
    /* Skip over things that don't match chars */
257
258
54
    case OP_REVERSE:
259
54
    case OP_VREVERSE:
260
54
    case OP_CREF:
261
54
    case OP_DNCREF:
262
54
    case OP_RREF:
263
54
    case OP_DNRREF:
264
54
    case OP_FALSE:
265
54
    case OP_TRUE:
266
54
    case OP_CALLOUT:
267
120
    case OP_SOD:
268
156
    case OP_SOM:
269
182
    case OP_EOD:
270
206
    case OP_EODN:
271
2.61k
    case OP_CIRC:
272
2.85k
    case OP_CIRCM:
273
3.73k
    case OP_DOLL:
274
3.77k
    case OP_DOLLM:
275
3.94k
    case OP_NOT_WORD_BOUNDARY:
276
4.23k
    case OP_WORD_BOUNDARY:
277
4.33k
    case OP_NOT_UCP_WORD_BOUNDARY:
278
4.37k
    case OP_UCP_WORD_BOUNDARY:
279
4.37k
    cc += PRIV(OP_lengths)[*cc];
280
4.37k
    break;
281
282
0
    case OP_CALLOUT_STR:
283
0
    cc += GET(cc, 1 + 2*LINK_SIZE);
284
0
    break;
285
286
    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
287
288
21
    case OP_BRAZERO:
289
22
    case OP_BRAMINZERO:
290
22
    case OP_BRAPOSZERO:
291
22
    case OP_SKIPZERO:
292
22
    cc += PRIV(OP_lengths)[*cc];
293
24
    do cc += GET(cc, 1); while (*cc == OP_ALT);
294
22
    cc += 1 + LINK_SIZE;
295
22
    break;
296
297
    /* Handle literal characters and + repetitions */
298
299
172k
    case OP_CHAR:
300
258k
    case OP_CHARI:
301
258k
    case OP_NOT:
302
258k
    case OP_NOTI:
303
258k
    case OP_PLUS:
304
258k
    case OP_PLUSI:
305
258k
    case OP_MINPLUS:
306
259k
    case OP_MINPLUSI:
307
261k
    case OP_POSPLUS:
308
263k
    case OP_POSPLUSI:
309
263k
    case OP_NOTPLUS:
310
263k
    case OP_NOTPLUSI:
311
263k
    case OP_NOTMINPLUS:
312
263k
    case OP_NOTMINPLUSI:
313
263k
    case OP_NOTPOSPLUS:
314
263k
    case OP_NOTPOSPLUSI:
315
263k
    branchlength++;
316
263k
    cc += 2;
317
263k
#ifdef SUPPORT_UNICODE
318
263k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
319
263k
#endif
320
263k
    break;
321
322
1.02k
    case OP_TYPEPLUS:
323
1.58k
    case OP_TYPEMINPLUS:
324
2.43k
    case OP_TYPEPOSPLUS:
325
2.43k
    branchlength++;
326
2.43k
    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
327
2.43k
    break;
328
329
    /* Handle exact repetitions. The count is already in characters, but we
330
    may need to skip over a multibyte character in UTF mode.  */
331
332
7
    case OP_EXACT:
333
16
    case OP_EXACTI:
334
16
    case OP_NOTEXACT:
335
16
    case OP_NOTEXACTI:
336
16
    branchlength += GET2(cc,1);
337
16
    cc += 2 + IMM2_SIZE;
338
16
#ifdef SUPPORT_UNICODE
339
16
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
340
16
#endif
341
16
    break;
342
343
0
    case OP_TYPEEXACT:
344
0
    branchlength += GET2(cc,1);
345
0
    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
346
0
      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
347
0
    break;
348
349
    /* Handle single-char non-literal matchers */
350
351
776
    case OP_PROP:
352
990
    case OP_NOTPROP:
353
990
    cc += 2;
354
    /* Fall through */
355
356
1.17k
    case OP_NOT_DIGIT:
357
1.47k
    case OP_DIGIT:
358
1.54k
    case OP_NOT_WHITESPACE:
359
1.71k
    case OP_WHITESPACE:
360
1.82k
    case OP_NOT_WORDCHAR:
361
4.23k
    case OP_WORDCHAR:
362
10.6k
    case OP_ANY:
363
10.7k
    case OP_ALLANY:
364
11.0k
    case OP_EXTUNI:
365
11.1k
    case OP_HSPACE:
366
11.3k
    case OP_NOT_HSPACE:
367
11.6k
    case OP_VSPACE:
368
11.8k
    case OP_NOT_VSPACE:
369
11.8k
    branchlength++;
370
11.8k
    cc++;
371
11.8k
    break;
372
373
    /* "Any newline" might match two characters, but it also might match just
374
    one. */
375
376
1.17k
    case OP_ANYNL:
377
1.17k
    branchlength += 1;
378
1.17k
    cc++;
379
1.17k
    break;
380
381
    /* The single-byte matcher means we can't proceed in UTF mode. (In
382
    non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
383
    appear, but leave the code, just in case.) */
384
385
34
    case OP_ANYBYTE:
386
34
#ifdef SUPPORT_UNICODE
387
34
    if (utf) return -1;
388
0
#endif
389
0
    branchlength++;
390
0
    cc++;
391
0
    break;
392
393
    /* For repeated character types, we have to test for \p and \P, which have
394
    an extra two bytes of parameters. */
395
396
1.12k
    case OP_TYPESTAR:
397
1.13k
    case OP_TYPEMINSTAR:
398
2.64k
    case OP_TYPEQUERY:
399
2.89k
    case OP_TYPEMINQUERY:
400
2.92k
    case OP_TYPEPOSSTAR:
401
5.72k
    case OP_TYPEPOSQUERY:
402
5.72k
    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
403
5.72k
    cc += PRIV(OP_lengths)[op];
404
5.72k
    break;
405
406
0
    case OP_TYPEUPTO:
407
0
    case OP_TYPEMINUPTO:
408
0
    case OP_TYPEPOSUPTO:
409
0
    if (cc[1 + IMM2_SIZE] == OP_PROP
410
0
      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
411
0
    cc += PRIV(OP_lengths)[op];
412
0
    break;
413
414
    /* Check a class for variable quantification */
415
416
3.36k
    case OP_CLASS:
417
4.58k
    case OP_NCLASS:
418
4.58k
#ifdef SUPPORT_WIDE_CHARS
419
5.18k
    case OP_XCLASS:
420
    /* The original code caused an unsigned overflow in 64 bit systems,
421
    so now we use a conditional statement. */
422
5.18k
    if (op == OP_XCLASS)
423
600
      cc += GET(cc, 1);
424
4.58k
    else
425
4.58k
      cc += PRIV(OP_lengths)[OP_CLASS];
426
#else
427
    cc += PRIV(OP_lengths)[OP_CLASS];
428
#endif
429
430
5.18k
    switch (*cc)
431
5.18k
      {
432
266
      case OP_CRPLUS:
433
337
      case OP_CRMINPLUS:
434
643
      case OP_CRPOSPLUS:
435
643
      branchlength++;
436
      /* Fall through */
437
438
1.25k
      case OP_CRSTAR:
439
1.80k
      case OP_CRMINSTAR:
440
2.18k
      case OP_CRQUERY:
441
2.49k
      case OP_CRMINQUERY:
442
3.26k
      case OP_CRPOSSTAR:
443
3.59k
      case OP_CRPOSQUERY:
444
3.59k
      cc++;
445
3.59k
      break;
446
447
0
      case OP_CRRANGE:
448
0
      case OP_CRMINRANGE:
449
0
      case OP_CRPOSRANGE:
450
0
      branchlength += GET2(cc,1);
451
0
      cc += 1 + 2 * IMM2_SIZE;
452
0
      break;
453
454
1.59k
      default:
455
1.59k
      branchlength++;
456
1.59k
      break;
457
5.18k
      }
458
5.18k
    break;
459
460
    /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
461
    way: we find the minimum length for the subpattern. A recursion
462
    (backreference or subroutine) causes an a flag to be set that causes the
463
    length of this branch to be ignored. The logic is that a recursion can only
464
    make sense if there is another alternative that stops the recursing. That
465
    will provide the minimum length (when no recursion happens).
466
467
    If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
468
    matches an empty string (by default it causes a matching failure), so in
469
    that case we must set the minimum length to zero.
470
471
    For backreferenes, if duplicate numbers are present in the pattern we check
472
    for a reference to a duplicate. If it is, we don't know which version will
473
    be referenced, so we have to set the minimum length to zero. */
474
475
    /* Duplicate named pattern back reference. */
476
477
5.18k
    case OP_DNREF:
478
0
    case OP_DNREFI:
479
0
    if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
480
0
      {
481
0
      int count = GET2(cc, 1+IMM2_SIZE);
482
0
      PCRE2_UCHAR *slot =
483
0
        (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
484
0
          GET2(cc, 1) * re->name_entry_size;
485
486
0
      d = INT_MAX;
487
488
      /* Scan all groups with the same name; find the shortest. */
489
490
0
      while (count-- > 0)
491
0
        {
492
0
        int dd, i;
493
0
        recno = GET2(slot, 0);
494
495
0
        if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
496
0
          dd = backref_cache[recno];
497
0
        else
498
0
          {
499
0
          ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
500
0
          if (cs == NULL) return -2;
501
0
          do ce += GET(ce, 1); while (*ce == OP_ALT);
502
503
0
          dd = 0;
504
0
          if (!dupcapused ||
505
0
              (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
506
0
            {
507
0
            if (cc > cs && cc < ce)    /* Simple recursion */
508
0
              {
509
0
              had_recurse = TRUE;
510
0
              }
511
0
            else
512
0
              {
513
0
              recurse_check *r = recurses;
514
0
              for (r = recurses; r != NULL; r = r->prev)
515
0
                if (r->group == cs) break;
516
0
              if (r != NULL)           /* Mutual recursion */
517
0
                {
518
0
                had_recurse = TRUE;
519
0
                }
520
0
              else
521
0
                {
522
0
                this_recurse.prev = recurses;  /* No recursion */
523
0
                this_recurse.group = cs;
524
0
                dd = find_minlength(re, cs, startcode, utf, &this_recurse,
525
0
                  countptr, backref_cache);
526
0
                if (dd < 0) return dd;
527
0
                }
528
0
              }
529
0
            }
530
531
0
          backref_cache[recno] = dd;
532
0
          for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
533
0
          backref_cache[0] = recno;
534
0
          }
535
536
0
        if (dd < d) d = dd;
537
0
        if (d <= 0) break;    /* No point looking at any more */
538
0
        slot += re->name_entry_size;
539
0
        }
540
0
      }
541
0
    else d = 0;
542
0
    cc += 1 + 2*IMM2_SIZE;
543
0
    goto REPEAT_BACK_REFERENCE;
544
545
    /* Single back reference by number. References by name are converted to by
546
    number when there is no duplication. */
547
548
696
    case OP_REF:
549
1.12k
    case OP_REFI:
550
1.12k
    recno = GET2(cc, 1);
551
1.12k
    if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
552
917
      d = backref_cache[recno];
553
207
    else
554
207
      {
555
207
      int i;
556
207
      d = 0;
557
558
207
      if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
559
207
        {
560
207
        ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
561
207
        if (cs == NULL) return -2;
562
315
        do ce += GET(ce, 1); while (*ce == OP_ALT);
563
564
207
        if (!dupcapused ||
565
0
            (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
566
207
          {
567
207
          if (cc > cs && cc < ce)    /* Simple recursion */
568
15
            {
569
15
            had_recurse = TRUE;
570
15
            }
571
192
          else
572
192
            {
573
192
            recurse_check *r = recurses;
574
192
            for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
575
192
            if (r != NULL)           /* Mutual recursion */
576
0
              {
577
0
              had_recurse = TRUE;
578
0
              }
579
192
            else                     /* No recursion */
580
192
              {
581
192
              this_recurse.prev = recurses;
582
192
              this_recurse.group = cs;
583
192
              d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
584
192
                backref_cache);
585
192
              if (d < 0) return d;
586
192
              }
587
192
            }
588
207
          }
589
207
        }
590
591
207
      backref_cache[recno] = d;
592
1.05k
      for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
593
207
      backref_cache[0] = recno;
594
207
      }
595
596
1.12k
    cc += 1 + IMM2_SIZE;
597
598
    /* Handle repeated back references */
599
600
1.12k
    REPEAT_BACK_REFERENCE:
601
1.12k
    switch (*cc)
602
1.12k
      {
603
0
      case OP_CRSTAR:
604
0
      case OP_CRMINSTAR:
605
3
      case OP_CRQUERY:
606
3
      case OP_CRMINQUERY:
607
3
      case OP_CRPOSSTAR:
608
3
      case OP_CRPOSQUERY:
609
3
      min = 0;
610
3
      cc++;
611
3
      break;
612
613
1
      case OP_CRPLUS:
614
1
      case OP_CRMINPLUS:
615
1
      case OP_CRPOSPLUS:
616
1
      min = 1;
617
1
      cc++;
618
1
      break;
619
620
0
      case OP_CRRANGE:
621
0
      case OP_CRMINRANGE:
622
0
      case OP_CRPOSRANGE:
623
0
      min = GET2(cc, 1);
624
0
      cc += 1 + 2 * IMM2_SIZE;
625
0
      break;
626
627
1.12k
      default:
628
1.12k
      min = 1;
629
1.12k
      break;
630
1.12k
      }
631
632
     /* Take care not to overflow: (1) min and d are ints, so check that their
633
     product is not greater than INT_MAX. (2) branchlength is limited to
634
     UINT16_MAX (checked at the top of the loop). */
635
636
1.12k
    if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
637
0
      branchlength = UINT16_MAX;
638
1.12k
    else branchlength += min * d;
639
1.12k
    break;
640
641
    /* Recursion always refers to the first occurrence of a subpattern with a
642
    given number. Therefore, we can always make use of caching, even when the
643
    pattern contains multiple subpatterns with the same number. */
644
645
2
    case OP_RECURSE:
646
2
    cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
647
2
    recno = GET2(cs, 1+LINK_SIZE);
648
2
    if (recno == prev_recurse_recno)
649
0
      {
650
0
      branchlength += prev_recurse_d;
651
0
      }
652
2
    else
653
2
      {
654
4
      do ce += GET(ce, 1); while (*ce == OP_ALT);
655
2
      if (cc > cs && cc < ce)    /* Simple recursion */
656
1
        had_recurse = TRUE;
657
1
      else
658
1
        {
659
1
        recurse_check *r = recurses;
660
1
        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
661
1
        if (r != NULL)          /* Mutual recursion */
662
0
          had_recurse = TRUE;
663
1
        else
664
1
          {
665
1
          this_recurse.prev = recurses;
666
1
          this_recurse.group = cs;
667
1
          prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
668
1
            countptr, backref_cache);
669
1
          if (prev_recurse_d < 0) return prev_recurse_d;
670
1
          prev_recurse_recno = recno;
671
1
          branchlength += prev_recurse_d;
672
1
          }
673
1
        }
674
2
      }
675
2
    cc += 1 + LINK_SIZE + once_fudge;
676
2
    once_fudge = 0;
677
2
    break;
678
679
    /* Anything else does not or need not match a character. We can get the
680
    item's length from the table, but for those that can match zero occurrences
681
    of a character, we must take special action for UTF-8 characters. As it
682
    happens, the "NOT" versions of these opcodes are used at present only for
683
    ASCII characters, so they could be omitted from this list. However, in
684
    future that may change, so we include them here so as not to leave a
685
    gotcha for a future maintainer. */
686
687
0
    case OP_UPTO:
688
0
    case OP_UPTOI:
689
0
    case OP_NOTUPTO:
690
0
    case OP_NOTUPTOI:
691
0
    case OP_MINUPTO:
692
0
    case OP_MINUPTOI:
693
0
    case OP_NOTMINUPTO:
694
0
    case OP_NOTMINUPTOI:
695
0
    case OP_POSUPTO:
696
0
    case OP_POSUPTOI:
697
0
    case OP_NOTPOSUPTO:
698
0
    case OP_NOTPOSUPTOI:
699
700
436
    case OP_STAR:
701
504
    case OP_STARI:
702
510
    case OP_NOTSTAR:
703
510
    case OP_NOTSTARI:
704
549
    case OP_MINSTAR:
705
662
    case OP_MINSTARI:
706
662
    case OP_NOTMINSTAR:
707
662
    case OP_NOTMINSTARI:
708
1.96k
    case OP_POSSTAR:
709
2.76k
    case OP_POSSTARI:
710
2.76k
    case OP_NOTPOSSTAR:
711
2.76k
    case OP_NOTPOSSTARI:
712
713
3.89k
    case OP_QUERY:
714
4.77k
    case OP_QUERYI:
715
4.82k
    case OP_NOTQUERY:
716
4.99k
    case OP_NOTQUERYI:
717
5.54k
    case OP_MINQUERY:
718
6.12k
    case OP_MINQUERYI:
719
6.14k
    case OP_NOTMINQUERY:
720
6.20k
    case OP_NOTMINQUERYI:
721
11.1k
    case OP_POSQUERY:
722
13.8k
    case OP_POSQUERYI:
723
13.8k
    case OP_NOTPOSQUERY:
724
13.8k
    case OP_NOTPOSQUERYI:
725
726
13.8k
    cc += PRIV(OP_lengths)[op];
727
13.8k
#ifdef SUPPORT_UNICODE
728
13.8k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
729
13.8k
#endif
730
13.8k
    break;
731
732
    /* Skip these, but we need to add in the name length. */
733
734
7
    case OP_MARK:
735
7
    case OP_COMMIT_ARG:
736
7
    case OP_PRUNE_ARG:
737
7
    case OP_SKIP_ARG:
738
7
    case OP_THEN_ARG:
739
7
    cc += PRIV(OP_lengths)[op] + cc[1];
740
7
    break;
741
742
    /* The remaining opcodes are just skipped over. */
743
744
0
    case OP_CLOSE:
745
0
    case OP_COMMIT:
746
0
    case OP_FAIL:
747
0
    case OP_PRUNE:
748
7
    case OP_SET_SOM:
749
7
    case OP_SKIP:
750
7
    case OP_THEN:
751
7
    cc += PRIV(OP_lengths)[op];
752
7
    break;
753
754
    /* This should not occur: we list all opcodes explicitly so that when
755
    new ones get added they are properly considered. */
756
757
0
    default:
758
0
    return -3;
759
324k
    }
760
324k
  }
761
/* Control never gets here */
762
3.28k
}
763
764
765
766
/*************************************************
767
*      Set a bit and maybe its alternate case    *
768
*************************************************/
769
770
/* Given a character, set its first code unit's bit in the table, and also the
771
corresponding bit for the other version of a letter if we are caseless.
772
773
Arguments:
774
  re            points to the regex block
775
  p             points to the first code unit of the character
776
  caseless      TRUE if caseless
777
  utf           TRUE for UTF mode
778
  ucp           TRUE for UCP mode
779
780
Returns:        pointer after the character
781
*/
782
783
static PCRE2_SPTR
784
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
785
  BOOL ucp)
786
9.37k
{
787
9.37k
uint32_t c = *p++;   /* First code unit */
788
789
9.37k
(void)utf;           /* Stop compiler warnings when UTF not supported */
790
9.37k
(void)ucp;
791
792
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
793
0xff. */
794
795
#if PCRE2_CODE_UNIT_WIDTH != 8
796
if (c > 0xff) SET_BIT(0xff); else
797
#endif
798
799
9.37k
SET_BIT(c);
800
801
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
802
the end of the character, even when caseless. */
803
804
9.37k
#ifdef SUPPORT_UNICODE
805
9.37k
if (utf)
806
662
  {
807
662
#if PCRE2_CODE_UNIT_WIDTH == 8
808
662
  if (c >= 0xc0) GETUTF8INC(c, p);
809
#elif PCRE2_CODE_UNIT_WIDTH == 16
810
  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
811
#endif
812
662
  }
813
9.37k
#endif  /* SUPPORT_UNICODE */
814
815
/* If caseless, handle the other case of the character. */
816
817
9.37k
if (caseless)
818
4.17k
  {
819
4.17k
#ifdef SUPPORT_UNICODE
820
4.17k
  if (utf || ucp)
821
555
    {
822
555
    c = UCD_OTHERCASE(c);
823
555
#if PCRE2_CODE_UNIT_WIDTH == 8
824
555
    if (utf)
825
555
      {
826
555
      PCRE2_UCHAR buff[6];
827
555
      (void)PRIV(ord2utf)(c, buff);
828
555
      SET_BIT(buff[0]);
829
555
      }
830
0
    else if (c < 256) SET_BIT(c);
831
#else  /* 16-bit or 32-bit mode */
832
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
833
#endif
834
555
    }
835
836
3.62k
  else
837
3.62k
#endif  /* SUPPORT_UNICODE */
838
839
  /* Not UTF or UCP */
840
841
3.62k
  if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
842
4.17k
  }
843
844
9.37k
return p;
845
9.37k
}
846
847
848
849
/*************************************************
850
*     Set bits for a positive character type     *
851
*************************************************/
852
853
/* This function sets starting bits for a character type. In UTF-8 mode, we can
854
only do a direct setting for bytes less than 128, as otherwise there can be
855
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
856
environment, the tables will only recognize ASCII characters anyway, but in at
857
least one Windows environment, some higher bytes bits were set in the tables.
858
So we deal with that case by considering the UTF-8 encoding.
859
860
Arguments:
861
  re             the regex block
862
  cbit type      the type of character wanted
863
  table_limit    32 for non-UTF-8; 16 for UTF-8
864
865
Returns:         nothing
866
*/
867
868
static void
869
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
870
2.19k
{
871
2.19k
uint32_t c;
872
72.5k
for (c = 0; c < table_limit; c++)
873
70.3k
  re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
874
2.19k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
875
2.19k
if (table_limit == 32) return;
876
0
for (c = 128; c < 256; c++)
877
0
  {
878
0
  if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
879
0
    {
880
0
    PCRE2_UCHAR buff[6];
881
0
    (void)PRIV(ord2utf)(c, buff);
882
0
    SET_BIT(buff[0]);
883
0
    }
884
0
  }
885
0
#endif  /* UTF-8 */
886
0
}
887
888
889
/*************************************************
890
*     Set bits for a negative character type     *
891
*************************************************/
892
893
/* This function sets starting bits for a negative character type such as \D.
894
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
895
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
896
Unlike in the positive case, where we can set appropriate starting bits for
897
specific high-valued UTF-8 characters, in this case we have to set the bits for
898
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
899
0xc0 (192) for simplicity.
900
901
Arguments:
902
  re             the regex block
903
  cbit type      the type of character wanted
904
  table_limit    32 for non-UTF-8; 16 for UTF-8
905
906
Returns:         nothing
907
*/
908
909
static void
910
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
911
133
{
912
133
uint32_t c;
913
4.38k
for (c = 0; c < table_limit; c++)
914
4.25k
  re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
915
133
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
916
133
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
917
133
#endif
918
133
}
919
920
921
922
/*************************************************
923
*      Create bitmap of starting code units      *
924
*************************************************/
925
926
/* This function scans a compiled unanchored expression recursively and
927
attempts to build a bitmap of the set of possible starting code units whose
928
values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
929
the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
930
we pass a value of 16 rather than 32 as the final argument. (See comments in
931
those functions for the reason.)
932
933
The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
934
(a*)b where the group provides some optional starting code units but scanning
935
must continue at the outer level to find at least one mandatory code unit. At
936
the outermost level, this function fails unless the result is SSB_DONE.
937
938
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
939
issues.
940
941
Arguments:
942
  re           points to the compiled regex block
943
  code         points to an expression
944
  utf          TRUE if in UTF mode
945
  ucp          TRUE if in UCP mode
946
  depthptr     pointer to recurse depth
947
948
Returns:       SSB_FAIL     => Failed to find any starting code units
949
               SSB_DONE     => Found mandatory starting code units
950
               SSB_CONTINUE => Found optional starting code units
951
               SSB_UNKNOWN  => Hit an unrecognized opcode
952
               SSB_TOODEEP  => Recursion is too deep
953
*/
954
955
static int
956
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
957
  int *depthptr)
958
2.70k
{
959
2.70k
uint32_t c;
960
2.70k
int yield = SSB_DONE;
961
962
2.70k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
963
2.70k
int table_limit = utf? 16:32;
964
#else
965
int table_limit = 32;
966
#endif
967
968
2.70k
*depthptr += 1;
969
2.70k
if (*depthptr > 1000) return SSB_TOODEEP;
970
971
2.70k
do
972
13.2k
  {
973
13.2k
  BOOL try_next = TRUE;
974
13.2k
  PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
975
976
13.2k
  if (*code == OP_CBRA || *code == OP_SCBRA ||
977
12.3k
      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
978
979
32.1k
  while (try_next)    /* Loop for items in this branch */
980
20.6k
    {
981
20.6k
    int rc;
982
20.6k
    PCRE2_SPTR ncode;
983
20.6k
    uint8_t *classmap = NULL;
984
20.6k
#ifdef SUPPORT_WIDE_CHARS
985
20.6k
    PCRE2_UCHAR xclassflags;
986
20.6k
#endif
987
988
20.6k
    switch(*tcode)
989
20.6k
      {
990
      /* If we reach something we don't understand, it means a new opcode has
991
      been created that hasn't been added to this function. Hopefully this
992
      problem will be discovered during testing. */
993
994
0
      default:
995
0
      return SSB_UNKNOWN;
996
997
      /* Fail for a valid opcode that implies no starting bits. */
998
999
0
      case OP_ACCEPT:
1000
0
      case OP_ASSERT_ACCEPT:
1001
101
      case OP_ALLANY:
1002
190
      case OP_ANY:
1003
198
      case OP_ANYBYTE:
1004
202
      case OP_CIRCM:
1005
202
      case OP_CLOSE:
1006
202
      case OP_COMMIT:
1007
202
      case OP_COMMIT_ARG:
1008
202
      case OP_COND:
1009
202
      case OP_CREF:
1010
202
      case OP_FALSE:
1011
202
      case OP_TRUE:
1012
202
      case OP_DNCREF:
1013
202
      case OP_DNREF:
1014
202
      case OP_DNREFI:
1015
202
      case OP_DNRREF:
1016
226
      case OP_DOLL:
1017
230
      case OP_DOLLM:
1018
230
      case OP_END:
1019
235
      case OP_EOD:
1020
237
      case OP_EODN:
1021
276
      case OP_EXTUNI:
1022
276
      case OP_FAIL:
1023
277
      case OP_MARK:
1024
290
      case OP_NOT:
1025
290
      case OP_NOTEXACT:
1026
290
      case OP_NOTEXACTI:
1027
301
      case OP_NOTI:
1028
301
      case OP_NOTMINPLUS:
1029
301
      case OP_NOTMINPLUSI:
1030
302
      case OP_NOTMINQUERY:
1031
302
      case OP_NOTMINQUERYI:
1032
302
      case OP_NOTMINSTAR:
1033
302
      case OP_NOTMINSTARI:
1034
302
      case OP_NOTMINUPTO:
1035
302
      case OP_NOTMINUPTOI:
1036
308
      case OP_NOTPLUS:
1037
316
      case OP_NOTPLUSI:
1038
316
      case OP_NOTPOSPLUS:
1039
317
      case OP_NOTPOSPLUSI:
1040
317
      case OP_NOTPOSQUERY:
1041
318
      case OP_NOTPOSQUERYI:
1042
318
      case OP_NOTPOSSTAR:
1043
318
      case OP_NOTPOSSTARI:
1044
318
      case OP_NOTPOSUPTO:
1045
318
      case OP_NOTPOSUPTOI:
1046
401
      case OP_NOTPROP:
1047
401
      case OP_NOTQUERY:
1048
444
      case OP_NOTQUERYI:
1049
446
      case OP_NOTSTAR:
1050
446
      case OP_NOTSTARI:
1051
446
      case OP_NOTUPTO:
1052
446
      case OP_NOTUPTOI:
1053
504
      case OP_NOT_HSPACE:
1054
581
      case OP_NOT_VSPACE:
1055
581
      case OP_PRUNE:
1056
581
      case OP_PRUNE_ARG:
1057
585
      case OP_RECURSE:
1058
589
      case OP_REF:
1059
589
      case OP_REFI:
1060
591
      case OP_REVERSE:
1061
591
      case OP_VREVERSE:
1062
591
      case OP_RREF:
1063
591
      case OP_SCOND:
1064
595
      case OP_SET_SOM:
1065
595
      case OP_SKIP:
1066
595
      case OP_SKIP_ARG:
1067
607
      case OP_SOD:
1068
615
      case OP_SOM:
1069
615
      case OP_THEN:
1070
615
      case OP_THEN_ARG:
1071
615
      return SSB_FAIL;
1072
1073
      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1074
      uses OP_CIRCM). Skip over it. */
1075
1076
330
      case OP_CIRC:
1077
330
      tcode += PRIV(OP_lengths)[OP_CIRC];
1078
330
      break;
1079
1080
      /* A "real" property test implies no starting bits, but the fake property
1081
      PT_CLIST identifies a list of characters. These lists are short, as they
1082
      are used for characters with more than one "other case", so there is no
1083
      point in recognizing them for OP_NOTPROP. */
1084
1085
47
      case OP_PROP:
1086
47
      if (tcode[1] != PT_CLIST) return SSB_FAIL;
1087
9
        {
1088
9
        const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1089
36
        while ((c = *p++) < NOTACHAR)
1090
27
          {
1091
27
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1092
27
          if (utf)
1093
27
            {
1094
27
            PCRE2_UCHAR buff[6];
1095
27
            (void)PRIV(ord2utf)(c, buff);
1096
27
            c = buff[0];
1097
27
            }
1098
27
#endif
1099
27
          if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1100
27
          }
1101
9
        }
1102
9
      try_next = FALSE;
1103
9
      break;
1104
1105
      /* We can ignore word boundary tests. */
1106
1107
7
      case OP_WORD_BOUNDARY:
1108
61
      case OP_NOT_WORD_BOUNDARY:
1109
87
      case OP_UCP_WORD_BOUNDARY:
1110
171
      case OP_NOT_UCP_WORD_BOUNDARY:
1111
171
      tcode++;
1112
171
      break;
1113
1114
      /* For a positive lookahead assertion, inspect what immediately follows,
1115
      ignoring intermediate assertions and callouts. If the next item is one
1116
      that sets a mandatory character, skip this assertion. Otherwise, treat it
1117
      the same as other bracket groups. */
1118
1119
5
      case OP_ASSERT:
1120
23
      case OP_ASSERT_NA:
1121
23
      ncode = tcode + GET(tcode, 1);
1122
120
      while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1123
23
      ncode += 1 + LINK_SIZE;
1124
1125
      /* Skip irrelevant items */
1126
1127
46
      for (BOOL done = FALSE; !done;)
1128
23
        {
1129
23
        switch (*ncode)
1130
23
          {
1131
0
          case OP_ASSERT:
1132
0
          case OP_ASSERT_NOT:
1133
0
          case OP_ASSERTBACK:
1134
0
          case OP_ASSERTBACK_NOT:
1135
0
          case OP_ASSERT_NA:
1136
0
          case OP_ASSERTBACK_NA:
1137
0
          ncode += GET(ncode, 1);
1138
0
          while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1139
0
          ncode += 1 + LINK_SIZE;
1140
0
          break;
1141
1142
0
          case OP_WORD_BOUNDARY:
1143
0
          case OP_NOT_WORD_BOUNDARY:
1144
0
          case OP_UCP_WORD_BOUNDARY:
1145
0
          case OP_NOT_UCP_WORD_BOUNDARY:
1146
0
          ncode++;
1147
0
          break;
1148
1149
0
          case OP_CALLOUT:
1150
0
          ncode += PRIV(OP_lengths)[OP_CALLOUT];
1151
0
          break;
1152
1153
0
          case OP_CALLOUT_STR:
1154
0
          ncode += GET(ncode, 1 + 2*LINK_SIZE);
1155
0
          break;
1156
1157
23
          default:
1158
23
          done = TRUE;
1159
23
          break;
1160
23
          }
1161
23
        }
1162
1163
      /* Now check the next significant item. */
1164
1165
23
      switch(*ncode)
1166
23
        {
1167
14
        default:
1168
14
        break;
1169
1170
14
        case OP_PROP:
1171
0
        if (ncode[1] != PT_CLIST) break;
1172
        /* Fall through */
1173
0
        case OP_ANYNL:
1174
9
        case OP_CHAR:
1175
9
        case OP_CHARI:
1176
9
        case OP_EXACT:
1177
9
        case OP_EXACTI:
1178
9
        case OP_HSPACE:
1179
9
        case OP_MINPLUS:
1180
9
        case OP_MINPLUSI:
1181
9
        case OP_PLUS:
1182
9
        case OP_PLUSI:
1183
9
        case OP_POSPLUS:
1184
9
        case OP_POSPLUSI:
1185
9
        case OP_VSPACE:
1186
        /* Note that these types will only be present in non-UCP mode. */
1187
9
        case OP_DIGIT:
1188
9
        case OP_NOT_DIGIT:
1189
9
        case OP_WORDCHAR:
1190
9
        case OP_NOT_WORDCHAR:
1191
9
        case OP_WHITESPACE:
1192
9
        case OP_NOT_WHITESPACE:
1193
9
        tcode = ncode;
1194
9
        continue;   /* With the following significant opcode */
1195
23
        }
1196
      /* Fall through */
1197
1198
      /* For a group bracket or a positive assertion without an immediately
1199
      following mandatory setting, recurse to set bits from within the
1200
      subpattern. If it can't find anything, we have to give up. If it finds
1201
      some mandatory character(s), we are done for this branch. Otherwise,
1202
      carry on scanning after the subpattern. */
1203
1204
48
      case OP_BRA:
1205
49
      case OP_SBRA:
1206
635
      case OP_CBRA:
1207
904
      case OP_SCBRA:
1208
904
      case OP_BRAPOS:
1209
905
      case OP_SBRAPOS:
1210
971
      case OP_CBRAPOS:
1211
1.17k
      case OP_SCBRAPOS:
1212
1.17k
      case OP_ONCE:
1213
1.17k
      case OP_SCRIPT_RUN:
1214
1.17k
      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1215
1.17k
      if (rc == SSB_DONE)
1216
190
        {
1217
190
        try_next = FALSE;
1218
190
        }
1219
988
      else if (rc == SSB_CONTINUE)
1220
980
        {
1221
1.17k
        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1222
980
        tcode += 1 + LINK_SIZE;
1223
980
        }
1224
8
      else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1225
1.17k
      break;
1226
1227
      /* If we hit ALT or KET, it means we haven't found anything mandatory in
1228
      this branch, though we might have found something optional. For ALT, we
1229
      continue with the next alternative, but we have to arrange that the final
1230
      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1231
      return SSB_CONTINUE: if this is the top level, that indicates failure,
1232
      but after a nested subpattern, it causes scanning to continue. */
1233
1234
2.13k
      case OP_ALT:
1235
2.13k
      yield = SSB_CONTINUE;
1236
2.13k
      try_next = FALSE;
1237
2.13k
      break;
1238
1239
500
      case OP_KET:
1240
770
      case OP_KETRMAX:
1241
770
      case OP_KETRMIN:
1242
972
      case OP_KETRPOS:
1243
972
      return SSB_CONTINUE;
1244
1245
      /* Skip over callout */
1246
1247
0
      case OP_CALLOUT:
1248
0
      tcode += PRIV(OP_lengths)[OP_CALLOUT];
1249
0
      break;
1250
1251
0
      case OP_CALLOUT_STR:
1252
0
      tcode += GET(tcode, 1 + 2*LINK_SIZE);
1253
0
      break;
1254
1255
      /* Skip over lookbehind and negative lookahead assertions */
1256
1257
3
      case OP_ASSERT_NOT:
1258
5
      case OP_ASSERTBACK:
1259
5
      case OP_ASSERTBACK_NOT:
1260
10
      case OP_ASSERTBACK_NA:
1261
93
      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1262
10
      tcode += 1 + LINK_SIZE;
1263
10
      break;
1264
1265
      /* BRAZERO does the bracket, but carries on. */
1266
1267
5
      case OP_BRAZERO:
1268
5
      case OP_BRAMINZERO:
1269
5
      case OP_BRAPOSZERO:
1270
5
      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1271
5
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1272
2
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1273
2
      tcode += 1 + LINK_SIZE;
1274
2
      break;
1275
1276
      /* SKIPZERO skips the bracket. */
1277
1278
0
      case OP_SKIPZERO:
1279
0
      tcode++;
1280
0
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1281
0
      tcode += 1 + LINK_SIZE;
1282
0
      break;
1283
1284
      /* Single-char * or ? sets the bit and tries the next item */
1285
1286
46
      case OP_STAR:
1287
54
      case OP_MINSTAR:
1288
264
      case OP_POSSTAR:
1289
501
      case OP_QUERY:
1290
778
      case OP_MINQUERY:
1291
1.29k
      case OP_POSQUERY:
1292
1.29k
      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1293
1.29k
      break;
1294
1295
15
      case OP_STARI:
1296
71
      case OP_MINSTARI:
1297
582
      case OP_POSSTARI:
1298
933
      case OP_QUERYI:
1299
1.10k
      case OP_MINQUERYI:
1300
1.50k
      case OP_POSQUERYI:
1301
1.50k
      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1302
1.50k
      break;
1303
1304
      /* Single-char upto sets the bit and tries the next */
1305
1306
0
      case OP_UPTO:
1307
0
      case OP_MINUPTO:
1308
0
      case OP_POSUPTO:
1309
0
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1310
0
      break;
1311
1312
0
      case OP_UPTOI:
1313
0
      case OP_MINUPTOI:
1314
0
      case OP_POSUPTOI:
1315
0
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1316
0
      break;
1317
1318
      /* At least one single char sets the bit and stops */
1319
1320
0
      case OP_EXACT:
1321
0
      tcode += IMM2_SIZE;
1322
      /* Fall through */
1323
3.64k
      case OP_CHAR:
1324
3.65k
      case OP_PLUS:
1325
3.66k
      case OP_MINPLUS:
1326
3.90k
      case OP_POSPLUS:
1327
3.90k
      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1328
3.90k
      try_next = FALSE;
1329
3.90k
      break;
1330
1331
4
      case OP_EXACTI:
1332
4
      tcode += IMM2_SIZE;
1333
      /* Fall through */
1334
2.61k
      case OP_CHARI:
1335
2.63k
      case OP_PLUSI:
1336
2.64k
      case OP_MINPLUSI:
1337
2.66k
      case OP_POSPLUSI:
1338
2.66k
      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1339
2.66k
      try_next = FALSE;
1340
2.66k
      break;
1341
1342
      /* Special spacing and line-terminating items. These recognize specific
1343
      lists of characters. The difference between VSPACE and ANYNL is that the
1344
      latter can match the two-character CRLF sequence, but that is not
1345
      relevant for finding the first character, so their code here is
1346
      identical. */
1347
1348
76
      case OP_HSPACE:
1349
76
      SET_BIT(CHAR_HT);
1350
76
      SET_BIT(CHAR_SPACE);
1351
1352
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1353
      the bits for 0xA0 and for code units >= 255, independently of UTF. */
1354
1355
#if PCRE2_CODE_UNIT_WIDTH != 8
1356
      SET_BIT(0xA0);
1357
      SET_BIT(0xFF);
1358
#else
1359
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1360
      units of horizontal space characters. */
1361
1362
76
#ifdef SUPPORT_UNICODE
1363
76
      if (utf)
1364
2
        {
1365
2
        SET_BIT(0xC2);  /* For U+00A0 */
1366
2
        SET_BIT(0xE1);  /* For U+1680, U+180E */
1367
2
        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1368
2
        SET_BIT(0xE3);  /* For U+3000 */
1369
2
        }
1370
74
      else
1371
74
#endif
1372
      /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1373
      the code is EBCDIC. */
1374
74
        {
1375
74
#ifndef EBCDIC
1376
74
        SET_BIT(0xA0);
1377
74
#endif  /* Not EBCDIC */
1378
74
        }
1379
76
#endif  /* 8-bit support */
1380
1381
76
      try_next = FALSE;
1382
76
      break;
1383
1384
240
      case OP_ANYNL:
1385
381
      case OP_VSPACE:
1386
381
      SET_BIT(CHAR_LF);
1387
381
      SET_BIT(CHAR_VT);
1388
381
      SET_BIT(CHAR_FF);
1389
381
      SET_BIT(CHAR_CR);
1390
1391
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1392
      the bits for NEL and for code units >= 255, independently of UTF. */
1393
1394
#if PCRE2_CODE_UNIT_WIDTH != 8
1395
      SET_BIT(CHAR_NEL);
1396
      SET_BIT(0xFF);
1397
#else
1398
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1399
      units of vertical space characters. */
1400
1401
381
#ifdef SUPPORT_UNICODE
1402
381
      if (utf)
1403
144
        {
1404
144
        SET_BIT(0xC2);  /* For U+0085 (NEL) */
1405
144
        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1406
144
        }
1407
237
      else
1408
237
#endif
1409
      /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1410
237
        {
1411
237
        SET_BIT(CHAR_NEL);
1412
237
        }
1413
381
#endif  /* 8-bit support */
1414
1415
381
      try_next = FALSE;
1416
381
      break;
1417
1418
      /* Single character types set the bits and stop. Note that if PCRE2_UCP
1419
      is set, we do not see these opcodes because \d etc are converted to
1420
      properties. Therefore, these apply in the case when only characters less
1421
      than 256 are recognized to match the types. */
1422
1423
3
      case OP_NOT_DIGIT:
1424
3
      set_nottype_bits(re, cbit_digit, table_limit);
1425
3
      try_next = FALSE;
1426
3
      break;
1427
1428
51
      case OP_DIGIT:
1429
51
      set_type_bits(re, cbit_digit, table_limit);
1430
51
      try_next = FALSE;
1431
51
      break;
1432
1433
21
      case OP_NOT_WHITESPACE:
1434
21
      set_nottype_bits(re, cbit_space, table_limit);
1435
21
      try_next = FALSE;
1436
21
      break;
1437
1438
132
      case OP_WHITESPACE:
1439
132
      set_type_bits(re, cbit_space, table_limit);
1440
132
      try_next = FALSE;
1441
132
      break;
1442
1443
91
      case OP_NOT_WORDCHAR:
1444
91
      set_nottype_bits(re, cbit_word, table_limit);
1445
91
      try_next = FALSE;
1446
91
      break;
1447
1448
1.12k
      case OP_WORDCHAR:
1449
1.12k
      set_type_bits(re, cbit_word, table_limit);
1450
1.12k
      try_next = FALSE;
1451
1.12k
      break;
1452
1453
      /* One or more character type fudges the pointer and restarts, knowing
1454
      it will hit a single character type and stop there. */
1455
1456
273
      case OP_TYPEPLUS:
1457
419
      case OP_TYPEMINPLUS:
1458
626
      case OP_TYPEPOSPLUS:
1459
626
      tcode++;
1460
626
      break;
1461
1462
0
      case OP_TYPEEXACT:
1463
0
      tcode += 1 + IMM2_SIZE;
1464
0
      break;
1465
1466
      /* Zero or more repeats of character types set the bits and then
1467
      try again. */
1468
1469
0
      case OP_TYPEUPTO:
1470
0
      case OP_TYPEMINUPTO:
1471
0
      case OP_TYPEPOSUPTO:
1472
0
      tcode += IMM2_SIZE;  /* Fall through */
1473
1474
13
      case OP_TYPESTAR:
1475
23
      case OP_TYPEMINSTAR:
1476
31
      case OP_TYPEPOSSTAR:
1477
316
      case OP_TYPEQUERY:
1478
375
      case OP_TYPEMINQUERY:
1479
1.09k
      case OP_TYPEPOSQUERY:
1480
1.09k
      switch(tcode[1])
1481
1.09k
        {
1482
43
        default:
1483
51
        case OP_ANY:
1484
51
        case OP_ALLANY:
1485
51
        return SSB_FAIL;
1486
1487
6
        case OP_HSPACE:
1488
6
        SET_BIT(CHAR_HT);
1489
6
        SET_BIT(CHAR_SPACE);
1490
1491
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1492
        the bits for 0xA0 and for code units >= 255, independently of UTF. */
1493
1494
#if PCRE2_CODE_UNIT_WIDTH != 8
1495
        SET_BIT(0xA0);
1496
        SET_BIT(0xFF);
1497
#else
1498
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1499
        units of horizontal space characters. */
1500
1501
6
#ifdef SUPPORT_UNICODE
1502
6
        if (utf)
1503
0
          {
1504
0
          SET_BIT(0xC2);  /* For U+00A0 */
1505
0
          SET_BIT(0xE1);  /* For U+1680, U+180E */
1506
0
          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1507
0
          SET_BIT(0xE3);  /* For U+3000 */
1508
0
          }
1509
6
        else
1510
6
#endif
1511
        /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1512
        the code is EBCDIC. */
1513
6
          {
1514
6
#ifndef EBCDIC
1515
6
          SET_BIT(0xA0);
1516
6
#endif  /* Not EBCDIC */
1517
6
          }
1518
6
#endif  /* 8-bit support */
1519
6
        break;
1520
1521
60
        case OP_ANYNL:
1522
133
        case OP_VSPACE:
1523
133
        SET_BIT(CHAR_LF);
1524
133
        SET_BIT(CHAR_VT);
1525
133
        SET_BIT(CHAR_FF);
1526
133
        SET_BIT(CHAR_CR);
1527
1528
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1529
        the bits for NEL and for code units >= 255, independently of UTF. */
1530
1531
#if PCRE2_CODE_UNIT_WIDTH != 8
1532
        SET_BIT(CHAR_NEL);
1533
        SET_BIT(0xFF);
1534
#else
1535
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1536
        units of vertical space characters. */
1537
1538
133
#ifdef SUPPORT_UNICODE
1539
133
        if (utf)
1540
72
          {
1541
72
          SET_BIT(0xC2);  /* For U+0085 (NEL) */
1542
72
          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1543
72
          }
1544
61
        else
1545
61
#endif
1546
        /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1547
61
          {
1548
61
          SET_BIT(CHAR_NEL);
1549
61
          }
1550
133
#endif  /* 8-bit support */
1551
133
        break;
1552
1553
0
        case OP_NOT_DIGIT:
1554
0
        set_nottype_bits(re, cbit_digit, table_limit);
1555
0
        break;
1556
1557
193
        case OP_DIGIT:
1558
193
        set_type_bits(re, cbit_digit, table_limit);
1559
193
        break;
1560
1561
16
        case OP_NOT_WHITESPACE:
1562
16
        set_nottype_bits(re, cbit_space, table_limit);
1563
16
        break;
1564
1565
85
        case OP_WHITESPACE:
1566
85
        set_type_bits(re, cbit_space, table_limit);
1567
85
        break;
1568
1569
2
        case OP_NOT_WORDCHAR:
1570
2
        set_nottype_bits(re, cbit_word, table_limit);
1571
2
        break;
1572
1573
612
        case OP_WORDCHAR:
1574
612
        set_type_bits(re, cbit_word, table_limit);
1575
612
        break;
1576
1.09k
        }
1577
1578
1.04k
      tcode += 2;
1579
1.04k
      break;
1580
1581
      /* Extended class: if there are any property checks, or if this is a
1582
      negative XCLASS without a map, give up. If there are no property checks,
1583
      there must be wide characters on the XCLASS list, because otherwise an
1584
      XCLASS would not have been created. This means that code points >= 255
1585
      are potential starters. In the UTF-8 case we can scan them and set bits
1586
      for the relevant leading bytes. */
1587
1588
0
#ifdef SUPPORT_WIDE_CHARS
1589
125
      case OP_XCLASS:
1590
125
      xclassflags = tcode[1 + LINK_SIZE];
1591
125
      if ((xclassflags & XCL_HASPROP) != 0 ||
1592
85
          (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1593
40
        return SSB_FAIL;
1594
1595
      /* We have a positive XCLASS or a negative one without a map. Set up the
1596
      map pointer if there is one, and fall through. */
1597
1598
85
      classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1599
85
        (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1600
1601
      /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1602
      then jump to handle the map. */
1603
1604
85
#if PCRE2_CODE_UNIT_WIDTH == 8
1605
85
      if (utf && (xclassflags & XCL_NOT) == 0)
1606
35
        {
1607
35
        PCRE2_UCHAR b, e;
1608
35
        PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1609
35
        tcode += GET(tcode, 1);
1610
1611
140
        for (;;) switch (*p++)
1612
140
          {
1613
71
          case XCL_SINGLE:
1614
71
          b = *p++;
1615
194
          while ((*p & 0xc0) == 0x80) p++;
1616
71
          re->start_bitmap[b/8] |= (1u << (b&7));
1617
71
          break;
1618
1619
34
          case XCL_RANGE:
1620
34
          b = *p++;
1621
97
          while ((*p & 0xc0) == 0x80) p++;
1622
34
          e = *p++;
1623
105
          while ((*p & 0xc0) == 0x80) p++;
1624
259
          for (; b <= e; b++)
1625
225
            re->start_bitmap[b/8] |= (1u << (b&7));
1626
34
          break;
1627
1628
35
          case XCL_END:
1629
35
          goto HANDLE_CLASSMAP;
1630
1631
0
          default:
1632
0
          return SSB_UNKNOWN;   /* Internal error, should not occur */
1633
140
          }
1634
35
        }
1635
50
#endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1636
50
#endif  /* SUPPORT_WIDE_CHARS */
1637
1638
      /* It seems that the fall through comment must be outside the #ifdef if
1639
      it is to avoid the gcc compiler warning. */
1640
1641
      /* Fall through */
1642
1643
      /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1644
      in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1645
      because it starts a character with a value > 255. In 8-bit non-UTF mode,
1646
      there is no difference between CLASS and NCLASS. In all other wide
1647
      character modes, set the 0xFF bit to indicate code units >= 255. */
1648
1649
600
      case OP_NCLASS:
1650
600
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1651
600
      if (utf)
1652
309
        {
1653
309
        re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1654
309
        memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1655
309
        }
1656
#elif PCRE2_CODE_UNIT_WIDTH != 8
1657
      SET_BIT(0xFF);                             /* For characters >= 255 */
1658
#endif
1659
      /* Fall through */
1660
1661
      /* Enter here for a positive non-XCLASS. If we have fallen through from
1662
      an XCLASS, classmap will already be set; just advance the code pointer.
1663
      Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1664
1665
2.09k
      case OP_CLASS:
1666
2.09k
      if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1667
2.04k
        {
1668
2.04k
        classmap = (uint8_t *)(++tcode);
1669
2.04k
        tcode += 32 / sizeof(PCRE2_UCHAR);
1670
2.04k
        }
1671
1672
      /* When wide characters are supported, classmap may be NULL. In UTF-8
1673
      (sic) mode, the bits in a class bit map correspond to character values,
1674
      not to byte values. However, the bit map we are constructing is for byte
1675
      values. So we have to do a conversion for characters whose code point is
1676
      greater than 127. In fact, there are only two possible starting bytes for
1677
      characters in the range 128 - 255. */
1678
1679
2.09k
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1680
2.13k
      HANDLE_CLASSMAP:
1681
2.13k
#endif
1682
2.13k
      if (classmap != NULL)
1683
2.13k
        {
1684
2.13k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1685
2.13k
        if (utf)
1686
479
          {
1687
8.14k
          for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1688
22.4k
          for (c = 128; c < 256; c++)
1689
21.9k
            {
1690
21.9k
            if ((classmap[c/8] & (1u << (c&7))) != 0)
1691
626
              {
1692
626
              int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1693
626
              re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1694
626
              c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1695
626
              }
1696
21.9k
            }
1697
479
          }
1698
1.65k
        else
1699
1.65k
#endif
1700
        /* In all modes except UTF-8, the two bit maps are compatible. */
1701
1702
1.65k
          {
1703
54.6k
          for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1704
1.65k
          }
1705
2.13k
        }
1706
1707
      /* Act on what follows the class. For a zero minimum repeat, continue;
1708
      otherwise stop processing. */
1709
1710
2.13k
      switch (*tcode)
1711
2.13k
        {
1712
295
        case OP_CRSTAR:
1713
572
        case OP_CRMINSTAR:
1714
710
        case OP_CRQUERY:
1715
848
        case OP_CRMINQUERY:
1716
1.21k
        case OP_CRPOSSTAR:
1717
1.40k
        case OP_CRPOSQUERY:
1718
1.40k
        tcode++;
1719
1.40k
        break;
1720
1721
0
        case OP_CRRANGE:
1722
0
        case OP_CRMINRANGE:
1723
0
        case OP_CRPOSRANGE:
1724
0
        if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1725
0
          else try_next = FALSE;
1726
0
        break;
1727
1728
731
        default:
1729
731
        try_next = FALSE;
1730
731
        break;
1731
2.13k
        }
1732
2.13k
      break; /* End of class handling case */
1733
20.6k
      }      /* End of switch for opcodes */
1734
20.6k
    }        /* End of try_next loop */
1735
1736
11.5k
  code += GET(code, 1);   /* Advance to next branch */
1737
11.5k
  }
1738
11.5k
while (*code == OP_ALT);
1739
1740
980
return yield;
1741
2.70k
}
1742
1743
1744
1745
/*************************************************
1746
*          Study a compiled expression           *
1747
*************************************************/
1748
1749
/* This function is handed a compiled expression that it must study to produce
1750
information that will speed up the matching.
1751
1752
Argument:
1753
  re       points to the compiled expression
1754
1755
Returns:   0 normally; non-zero should never normally occur
1756
           1 unknown opcode in set_start_bits
1757
           2 missing capturing bracket
1758
           3 unknown opcode in find_minlength
1759
*/
1760
1761
int
1762
PRIV(study)(pcre2_real_code *re)
1763
1.65k
{
1764
1.65k
int count = 0;
1765
1.65k
PCRE2_UCHAR *code;
1766
1.65k
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1767
1.65k
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1768
1769
/* Find start of compiled code */
1770
1771
1.65k
code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1772
1.65k
  re->name_entry_size * re->name_count;
1773
1774
/* For a pattern that has a first code unit, or a multiline pattern that
1775
matches only at "line start", there is no point in seeking a list of starting
1776
code units. */
1777
1778
1.65k
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1779
1.52k
  {
1780
1.52k
  int depth = 0;
1781
1.52k
  int rc = set_start_bits(re, code, utf, ucp, &depth);
1782
1.52k
  if (rc == SSB_UNKNOWN) return 1;
1783
1784
  /* If a list of starting code units was set up, scan the list to see if only
1785
  one or two were listed. Having only one listed is rare because usually a
1786
  single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1787
  If two are listed, see if they are caseless versions of the same character;
1788
  if so we can replace the list with a caseless first code unit. This gives
1789
  better performance and is plausibly worth doing for patterns such as [Ww]ord
1790
  or (word|WORD). */
1791
1792
1.52k
  if (rc == SSB_DONE)
1793
737
    {
1794
737
    int i;
1795
737
    int a = -1;
1796
737
    int b = -1;
1797
737
    uint8_t *p = re->start_bitmap;
1798
737
    uint32_t flags = PCRE2_FIRSTMAPSET;
1799
1800
3.85k
    for (i = 0; i < 256; p++, i += 8)
1801
3.85k
      {
1802
3.85k
      uint8_t x = *p;
1803
3.85k
      if (x != 0)
1804
1.10k
        {
1805
1.10k
        int c;
1806
1.10k
        uint8_t y = x & (~x + 1);   /* Least significant bit */
1807
1.10k
        if (y != x) goto DONE;      /* More than one bit set */
1808
1809
        /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1810
        all wide characters", so we cannot use it here. */
1811
1812
#if PCRE2_CODE_UNIT_WIDTH != 8
1813
        if (i == 248 && x == 0x80) goto DONE;
1814
#endif
1815
1816
        /* Compute the character value */
1817
1818
645
        c = i;
1819
645
        switch (x)
1820
645
          {
1821
259
          case 1:   break;
1822
115
          case 2:   c += 1; break;  case 4:  c += 2; break;
1823
61
          case 8:   c += 3; break;  case 16: c += 4; break;
1824
62
          case 32:  c += 5; break;  case 64: c += 6; break;
1825
43
          case 128: c += 7; break;
1826
645
          }
1827
1828
        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1829
        mode, only values < 128 can be used. In all the other cases, c is a
1830
        character value. */
1831
1832
645
#if PCRE2_CODE_UNIT_WIDTH == 8
1833
645
        if (utf && c > 127) goto DONE;
1834
639
#endif
1835
639
        if (a < 0) a = c;   /* First one found, save in a */
1836
277
        else if (b < 0)     /* Second one found */
1837
269
          {
1838
269
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1839
1840
269
#ifdef SUPPORT_UNICODE
1841
269
          if (utf || ucp)
1842
31
            {
1843
31
            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
1844
26
            if (c > 127) d = UCD_OTHERCASE(c);
1845
26
            }
1846
264
#endif  /* SUPPORT_UNICODE */
1847
1848
264
          if (d != a) goto DONE;   /* Not the other case of a */
1849
10
          b = c;                   /* Save second in b */
1850
10
          }
1851
8
        else goto DONE;   /* More than two characters found */
1852
639
        }
1853
3.85k
      }
1854
1855
    /* Replace the start code unit bits with a first code unit, but only if it
1856
    is not the same as a required later code unit. This is because a search for
1857
    a required code unit starts after an explicit first code unit, but at a
1858
    code unit found from the bitmap. Patterns such as /a*a/ don't work
1859
    if both the start unit and required unit are the same. */
1860
1861
7
    if (a >= 0 &&
1862
7
        (
1863
7
        (re->flags & PCRE2_LASTSET) == 0 ||
1864
5
          (
1865
5
          re->last_codeunit != (uint32_t)a &&
1866
3
          (b < 0 || re->last_codeunit != (uint32_t)b)
1867
5
          )
1868
7
        ))
1869
5
      {
1870
5
      re->first_codeunit = a;
1871
5
      flags = PCRE2_FIRSTSET;
1872
5
      if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
1873
5
      }
1874
1875
737
    DONE:
1876
737
    re->flags |= flags;
1877
737
    }
1878
1.52k
  }
1879
1880
/* Find the minimum length of subject string. If the pattern can match an empty
1881
string, the minimum length is already known. If the pattern contains (*ACCEPT)
1882
all bets are off, and we don't even try to find a minimum length. If there are
1883
more back references than the size of the vector we are going to cache them in,
1884
do nothing. A pattern that complicated will probably take a long time to
1885
analyze and may in any case turn out to be too complicated. Note that back
1886
reference minima are held as 16-bit numbers. */
1887
1888
1.65k
if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
1889
1.58k
     re->top_backref <= MAX_CACHE_BACKREF)
1890
1.58k
  {
1891
1.58k
  int min;
1892
1.58k
  int backref_cache[MAX_CACHE_BACKREF+1];
1893
1.58k
  backref_cache[0] = 0;    /* Highest one that is set */
1894
1.58k
  min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1895
1.58k
  switch(min)
1896
1.58k
    {
1897
34
    case -1:  /* \C in UTF mode or over-complex regex */
1898
34
    break;    /* Leave minlength unchanged (will be zero) */
1899
1900
0
    case -2:
1901
0
    return 2; /* missing capturing bracket */
1902
1903
0
    case -3:
1904
0
    return 3; /* unrecognized opcode */
1905
1906
1.55k
    default:
1907
1.55k
    re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
1908
1.55k
    break;
1909
1.58k
    }
1910
1.58k
  }
1911
1912
1.65k
return 0;
1913
1.65k
}
1914
1915
/* End of pcre2_study.c */