Coverage Report

Created: 2026-06-10 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libgit2/deps/pcre2/pcre2_study.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains functions for scanning a compiled pattern and
43
collecting data (e.g. minimum matching length). */
44
45
46
#include "pcre2_internal.h"
47
48
49
50
/* The maximum remembered capturing brackets minimum. */
51
52
2.88k
#define MAX_CACHE_BACKREF 128
53
54
/* Set a bit in the starting code unit bit map. */
55
56
35.2k
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
57
58
/* Returns from set_start_bits() */
59
60
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
61
62
63
/*************************************************
64
*   Find the minimum subject length for a group  *
65
*************************************************/
66
67
/* Scan a parenthesized group and compute the minimum length of subject that
68
is needed to match it. This is a lower bound; it does not mean there is a
69
string of that length that matches. In UTF mode, the result is in characters
70
rather than code units. The field in a compiled pattern for storing the minimum
71
length is 16-bits long (on the grounds that anything longer than that is
72
pathological), so we give up when we reach that amount. This also means that
73
integer overflow for really crazy patterns cannot happen.
74
75
Backreference minimum lengths are cached to speed up multiple references. This
76
function is called only when the highest back reference in the pattern is less
77
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
78
caching vector. The zeroth element contains the number of the highest set
79
value.
80
81
Arguments:
82
  re              compiled pattern block
83
  code            pointer to start of group (the bracket)
84
  startcode       pointer to start of the whole pattern's code
85
  utf             UTF flag
86
  recurses        chain of recurse_check to catch mutual recursion
87
  countptr        pointer to call count (to catch over complexity)
88
  backref_cache   vector for caching back references.
89
90
This function is no longer called when the pattern contains (*ACCEPT); however,
91
the old code for returning -1 is retained, just in case.
92
93
Returns:   the minimum length
94
           -1 \C in UTF-8 mode
95
              or (*ACCEPT)
96
              or pattern too complicated
97
           -2 internal error (missing capturing bracket)
98
           -3 internal error (opcode not listed)
99
*/
100
101
static int
102
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
103
  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
104
  int *backref_cache)
105
16.1k
{
106
16.1k
int length = -1;
107
16.1k
int branchlength = 0;
108
16.1k
int prev_cap_recno = -1;
109
16.1k
int prev_cap_d = 0;
110
16.1k
int prev_recurse_recno = -1;
111
16.1k
int prev_recurse_d = 0;
112
16.1k
uint32_t once_fudge = 0;
113
16.1k
BOOL had_recurse = FALSE;
114
16.1k
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
115
16.1k
PCRE2_SPTR nextbranch = code + GET(code, 1);
116
16.1k
PCRE2_SPTR cc = code + 1 + LINK_SIZE;
117
16.1k
recurse_check this_recurse;
118
119
/* If this is a "could be empty" group, its minimum length is 0. */
120
121
16.1k
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
122
123
/* Skip over capturing bracket number */
124
125
15.9k
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
126
127
/* A large and/or complex regex can take too long to process. */
128
129
15.9k
if ((*countptr)++ > 1000) return -1;
130
131
/* Scan along the opcodes for this branch. If we get to the end of the branch,
132
check the length against that of the other branches. If the accumulated length
133
passes 16-bits, reset to that value and skip the rest of the branch. */
134
135
15.9k
for (;;)
136
178k
  {
137
178k
  int d, min, recno;
138
178k
  PCRE2_UCHAR op;
139
178k
  PCRE2_SPTR cs, ce;
140
141
178k
  if (branchlength >= (int)UINT16_MAX)
142
3
    {
143
3
    branchlength = UINT16_MAX;
144
3
    cc = nextbranch;
145
3
    }
146
147
178k
  op = *cc;
148
178k
  switch (op)
149
178k
    {
150
1.34k
    case OP_COND:
151
1.35k
    case OP_SCOND:
152
153
    /* If there is only one branch in a condition, the implied branch has zero
154
    length, so we don't add anything. This covers the DEFINE "condition"
155
    automatically. If there are two branches we can treat it the same as any
156
    other non-capturing subpattern. */
157
158
1.35k
    cs = cc + GET(cc, 1);
159
1.35k
    if (*cs != OP_ALT)
160
1.22k
      {
161
1.22k
      cc = cs + 1 + LINK_SIZE;
162
1.22k
      break;
163
1.22k
      }
164
128
    goto PROCESS_NON_CAPTURE;
165
166
3.09k
    case OP_BRA:
167
    /* There's a special case of OP_BRA, when it is wrapped round a repeated
168
    OP_RECURSE. We'd like to process the latter at this level so that
169
    remembering the value works for repeated cases. So we do nothing, but
170
    set a fudge value to skip over the OP_KET after the recurse. */
171
172
3.09k
    if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173
0
      {
174
0
      once_fudge = 1 + LINK_SIZE;
175
0
      cc += 1 + LINK_SIZE;
176
0
      break;
177
0
      }
178
3.09k
    PCRE2_FALLTHROUGH /* Fall through */
179
3.09k
180
4.24k
    case OP_ONCE:
181
4.24k
    case OP_SCRIPT_RUN:
182
4.27k
    case OP_SBRA:
183
4.44k
    case OP_BRAPOS:
184
4.56k
    case OP_SBRAPOS:
185
4.68k
    PROCESS_NON_CAPTURE:
186
4.68k
    d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187
4.68k
      backref_cache);
188
4.68k
    if (d < 0) return d;
189
4.57k
    branchlength += d;
190
5.24k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
191
4.57k
    cc += 1 + LINK_SIZE;
192
4.57k
    break;
193
194
    /* To save time for repeated capturing subpatterns, we remember the
195
    length of the previous one. Unfortunately we can't do the same for
196
    the unnumbered ones above. Nor can we do this if (?| is present in the
197
    pattern because captures with the same number are not then identical. */
198
199
103k
    case OP_CBRA:
200
103k
    case OP_SCBRA:
201
104k
    case OP_CBRAPOS:
202
104k
    case OP_SCBRAPOS:
203
104k
    recno = (int)GET2(cc, 1+LINK_SIZE);
204
104k
    if (dupcapused || recno != prev_cap_recno)
205
8.30k
      {
206
8.30k
      prev_cap_recno = recno;
207
8.30k
      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208
8.30k
        backref_cache);
209
8.30k
      if (prev_cap_d < 0) return prev_cap_d;
210
8.30k
      }
211
104k
    branchlength += prev_cap_d;
212
105k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
213
104k
    cc += 1 + LINK_SIZE;
214
104k
    break;
215
216
    /* ACCEPT makes things far too complicated; we have to give up. In fact,
217
    from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218
    used. However, leave the code in place, just in case. */
219
220
0
    case OP_ACCEPT:
221
0
    case OP_ASSERT_ACCEPT:
222
0
    return -1;
223
224
    /* Reached end of a branch; if it's a ket it is the end of a nested
225
    call. If it's ALT it is an alternation in a nested call. If it is END it's
226
    the end of the outer call. All can be handled by the same code. If the
227
    length of any branch is zero, there is no need to scan any subsequent
228
    branches. */
229
230
1.65k
    case OP_ALT:
231
16.7k
    case OP_KET:
232
17.0k
    case OP_KETRMAX:
233
17.0k
    case OP_KETRMIN:
234
17.3k
    case OP_KETRPOS:
235
17.3k
    case OP_END:
236
17.3k
    if (length < 0 || (!had_recurse && branchlength < length))
237
16.4k
      length = branchlength;
238
17.3k
    if (op != OP_ALT || length == 0) return length;
239
1.58k
    nextbranch = cc + GET(cc, 1);
240
1.58k
    cc += 1 + LINK_SIZE;
241
1.58k
    branchlength = 0;
242
1.58k
    had_recurse = FALSE;
243
1.58k
    break;
244
245
    /* Skip over assertive subpatterns */
246
247
156
    case OP_ASSERT:
248
423
    case OP_ASSERT_NOT:
249
542
    case OP_ASSERTBACK:
250
573
    case OP_ASSERTBACK_NOT:
251
573
    case OP_ASSERT_NA:
252
573
    case OP_ASSERT_SCS:
253
573
    case OP_ASSERTBACK_NA:
254
591
    do cc += GET(cc, 1); while (*cc == OP_ALT);
255
573
    PCRE2_FALLTHROUGH /* Fall through */
256
257
    /* Skip over things that don't match chars */
258
259
573
    case OP_REVERSE:
260
573
    case OP_VREVERSE:
261
582
    case OP_CREF:
262
582
    case OP_DNCREF:
263
700
    case OP_RREF:
264
700
    case OP_DNRREF:
265
700
    case OP_FALSE:
266
700
    case OP_TRUE:
267
1.14k
    case OP_CALLOUT:
268
1.21k
    case OP_SOD:
269
1.21k
    case OP_SOM:
270
1.29k
    case OP_EOD:
271
1.29k
    case OP_EODN:
272
1.52k
    case OP_CIRC:
273
1.60k
    case OP_CIRCM:
274
1.64k
    case OP_DOLL:
275
2.03k
    case OP_DOLLM:
276
2.12k
    case OP_NOT_WORD_BOUNDARY:
277
2.37k
    case OP_WORD_BOUNDARY:
278
2.37k
    case OP_NOT_UCP_WORD_BOUNDARY:
279
2.37k
    case OP_UCP_WORD_BOUNDARY:
280
2.37k
    cc += PRIV(OP_lengths)[*cc];
281
2.37k
    break;
282
283
0
    case OP_CALLOUT_STR:
284
0
    cc += GET(cc, 1 + 2*LINK_SIZE);
285
0
    break;
286
287
    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
288
289
1.35k
    case OP_BRAZERO:
290
1.36k
    case OP_BRAMINZERO:
291
1.37k
    case OP_BRAPOSZERO:
292
1.38k
    case OP_SKIPZERO:
293
1.38k
    cc += PRIV(OP_lengths)[*cc];
294
1.39k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
295
1.38k
    cc += 1 + LINK_SIZE;
296
1.38k
    break;
297
298
    /* Handle literal characters and + repetitions */
299
300
23.1k
    case OP_CHAR:
301
24.4k
    case OP_CHARI:
302
24.6k
    case OP_NOT:
303
24.7k
    case OP_NOTI:
304
25.4k
    case OP_PLUS:
305
25.4k
    case OP_PLUSI:
306
25.4k
    case OP_MINPLUS:
307
25.4k
    case OP_MINPLUSI:
308
25.9k
    case OP_POSPLUS:
309
26.3k
    case OP_POSPLUSI:
310
26.3k
    case OP_NOTPLUS:
311
26.7k
    case OP_NOTPLUSI:
312
26.7k
    case OP_NOTMINPLUS:
313
26.7k
    case OP_NOTMINPLUSI:
314
26.7k
    case OP_NOTPOSPLUS:
315
26.8k
    case OP_NOTPOSPLUSI:
316
26.8k
    branchlength++;
317
26.8k
    cc += 2;
318
26.8k
#ifdef SUPPORT_UNICODE
319
26.8k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
320
26.8k
#endif
321
26.8k
    break;
322
323
245
    case OP_TYPEPLUS:
324
262
    case OP_TYPEMINPLUS:
325
323
    case OP_TYPEPOSPLUS:
326
323
    branchlength++;
327
323
    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
328
323
    break;
329
330
    /* Handle exact repetitions. The count is already in characters, but we
331
    may need to skip over a multibyte character in UTF mode.  */
332
333
484
    case OP_EXACT:
334
1.05k
    case OP_EXACTI:
335
1.08k
    case OP_NOTEXACT:
336
1.18k
    case OP_NOTEXACTI:
337
1.18k
    branchlength += GET2(cc,1);
338
1.18k
    cc += 2 + IMM2_SIZE;
339
1.18k
#ifdef SUPPORT_UNICODE
340
1.18k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
341
1.18k
#endif
342
1.18k
    break;
343
344
238
    case OP_TYPEEXACT:
345
238
    branchlength += GET2(cc,1);
346
238
    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
347
238
      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
348
238
    break;
349
350
    /* Handle single-char non-literal matchers */
351
352
0
    case OP_PROP:
353
0
    case OP_NOTPROP:
354
0
    cc += 2;
355
0
    PCRE2_FALLTHROUGH /* Fall through */
356
357
18
    case OP_NOT_DIGIT:
358
42
    case OP_DIGIT:
359
65
    case OP_NOT_WHITESPACE:
360
91
    case OP_WHITESPACE:
361
112
    case OP_NOT_WORDCHAR:
362
128
    case OP_WORDCHAR:
363
591
    case OP_ANY:
364
714
    case OP_ALLANY:
365
715
    case OP_EXTUNI:
366
730
    case OP_HSPACE:
367
751
    case OP_NOT_HSPACE:
368
766
    case OP_VSPACE:
369
798
    case OP_NOT_VSPACE:
370
798
    branchlength++;
371
798
    cc++;
372
798
    break;
373
374
    /* "Any newline" might match two characters, but it also might match just
375
    one. */
376
377
21
    case OP_ANYNL:
378
21
    branchlength += 1;
379
21
    cc++;
380
21
    break;
381
382
    /* The single-byte matcher means we can't proceed in UTF mode. (In
383
    non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
384
    appear, but leave the code, just in case.) */
385
386
0
    case OP_ANYBYTE:
387
0
#ifdef SUPPORT_UNICODE
388
0
    if (utf) return -1;
389
0
#endif
390
0
    branchlength++;
391
0
    cc++;
392
0
    break;
393
394
    /* For repeated character types, we have to test for \p and \P, which have
395
    an extra two bytes of parameters. */
396
397
150
    case OP_TYPESTAR:
398
153
    case OP_TYPEMINSTAR:
399
279
    case OP_TYPEQUERY:
400
293
    case OP_TYPEMINQUERY:
401
377
    case OP_TYPEPOSSTAR:
402
434
    case OP_TYPEPOSQUERY:
403
434
    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
404
434
    cc += PRIV(OP_lengths)[op];
405
434
    break;
406
407
95
    case OP_TYPEUPTO:
408
100
    case OP_TYPEMINUPTO:
409
112
    case OP_TYPEPOSUPTO:
410
112
    if (cc[1 + IMM2_SIZE] == OP_PROP
411
112
      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
412
112
    cc += PRIV(OP_lengths)[op];
413
112
    break;
414
415
    /* Check a class for variable quantification */
416
417
1.35k
    case OP_CLASS:
418
2.00k
    case OP_NCLASS:
419
2.00k
#ifdef SUPPORT_WIDE_CHARS
420
2.00k
    case OP_XCLASS:
421
2.00k
    case OP_ECLASS:
422
    /* The original code caused an unsigned overflow in 64 bit systems,
423
    so now we use a conditional statement. */
424
2.00k
    if (op == OP_XCLASS || op == OP_ECLASS)
425
0
      cc += GET(cc, 1);
426
2.00k
    else
427
2.00k
#endif
428
2.00k
      cc += PRIV(OP_lengths)[OP_CLASS];
429
430
2.00k
    switch (*cc)
431
2.00k
      {
432
29
      case OP_CRPLUS:
433
38
      case OP_CRMINPLUS:
434
399
      case OP_CRPOSPLUS:
435
399
      branchlength++;
436
399
      PCRE2_FALLTHROUGH /* Fall through */
437
438
414
      case OP_CRSTAR:
439
415
      case OP_CRMINSTAR:
440
502
      case OP_CRQUERY:
441
516
      case OP_CRMINQUERY:
442
539
      case OP_CRPOSSTAR:
443
786
      case OP_CRPOSQUERY:
444
786
      cc++;
445
786
      break;
446
447
6
      case OP_CRRANGE:
448
11
      case OP_CRMINRANGE:
449
55
      case OP_CRPOSRANGE:
450
55
      branchlength += GET2(cc,1);
451
55
      cc += 1 + 2 * IMM2_SIZE;
452
55
      break;
453
454
1.16k
      default:
455
1.16k
      branchlength++;
456
1.16k
      break;
457
2.00k
      }
458
2.00k
    break;
459
460
    /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
461
    way: we find the minimum length for the subpattern. A recursion
462
    (backreference or subroutine) causes an a flag to be set that causes the
463
    length of this branch to be ignored. The logic is that a recursion can only
464
    make sense if there is another alternative that stops the recursing. That
465
    will provide the minimum length (when no recursion happens).
466
467
    If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
468
    matches an empty string (by default it causes a matching failure), so in
469
    that case we must set the minimum length to zero.
470
471
    For backreferenes, if duplicate numbers are present in the pattern we check
472
    for a reference to a duplicate. If it is, we don't know which version will
473
    be referenced, so we have to set the minimum length to zero. */
474
475
    /* Duplicate named pattern back reference. */
476
477
2.00k
    case OP_DNREF:
478
772
    case OP_DNREFI:
479
772
    if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
480
772
      {
481
772
      int count = GET2(cc, 1+IMM2_SIZE);
482
772
      PCRE2_SPTR slot =
483
772
        (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +
484
772
          GET2(cc, 1) * re->name_entry_size;
485
486
772
      d = INT_MAX;
487
488
      /* Scan all groups with the same name; find the shortest. */
489
490
786
      while (count-- > 0)
491
782
        {
492
782
        int dd, i;
493
782
        recno = GET2(slot, 0);
494
495
782
        if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
496
720
          dd = backref_cache[recno];
497
62
        else
498
62
          {
499
62
          ce = cs = PRIV(find_bracket)(startcode, utf, recno);
500
62
          if (cs == NULL) return -2;
501
64
          do ce += GET(ce, 1); while (*ce == OP_ALT);
502
503
62
          dd = 0;
504
62
          if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
505
62
            {
506
62
            if (cc > cs && cc < ce)    /* Simple recursion */
507
38
              {
508
38
              had_recurse = TRUE;
509
38
              }
510
24
            else
511
24
              {
512
24
              recurse_check *r = recurses;
513
24
              for (r = recurses; r != NULL; r = r->prev)
514
0
                if (r->group == cs) break;
515
24
              if (r != NULL)           /* Mutual recursion */
516
0
                {
517
0
                had_recurse = TRUE;
518
0
                }
519
24
              else
520
24
                {
521
24
                this_recurse.prev = recurses;  /* No recursion */
522
24
                this_recurse.group = cs;
523
24
                dd = find_minlength(re, cs, startcode, utf, &this_recurse,
524
24
                  countptr, backref_cache);
525
24
                if (dd < 0) return dd;
526
24
                }
527
24
              }
528
62
            }
529
530
62
          backref_cache[recno] = dd;
531
287
          for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
532
62
          backref_cache[0] = recno;
533
62
          }
534
535
782
        if (dd < d) d = dd;
536
782
        if (d <= 0) break;    /* No point looking at any more */
537
14
        slot += re->name_entry_size;
538
14
        }
539
772
      }
540
0
    else d = 0;
541
772
    cc += PRIV(OP_lengths)[*cc];
542
772
    goto REPEAT_BACK_REFERENCE;
543
544
    /* Single back reference by number. References by name are converted to by
545
    number when there is no duplication. */
546
547
256
    case OP_REF:
548
256
    case OP_REFI:
549
256
    recno = GET2(cc, 1);
550
256
    if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
551
148
      d = backref_cache[recno];
552
108
    else
553
108
      {
554
108
      int i;
555
108
      d = 0;
556
557
108
      if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
558
108
        {
559
108
        ce = cs = PRIV(find_bracket)(startcode, utf, recno);
560
108
        if (cs == NULL) return -2;
561
134
        do ce += GET(ce, 1); while (*ce == OP_ALT);
562
563
108
        if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
564
108
          {
565
108
          if (cc > cs && cc < ce)    /* Simple recursion */
566
88
            {
567
88
            had_recurse = TRUE;
568
88
            }
569
20
          else
570
20
            {
571
20
            recurse_check *r = recurses;
572
20
            for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
573
20
            if (r != NULL)           /* Mutual recursion */
574
0
              {
575
0
              had_recurse = TRUE;
576
0
              }
577
20
            else                     /* No recursion */
578
20
              {
579
20
              this_recurse.prev = recurses;
580
20
              this_recurse.group = cs;
581
20
              d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
582
20
                backref_cache);
583
20
              if (d < 0) return d;
584
20
              }
585
20
            }
586
108
          }
587
108
        }
588
589
108
      backref_cache[recno] = d;
590
173
      for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
591
108
      backref_cache[0] = recno;
592
108
      }
593
594
256
    cc += PRIV(OP_lengths)[*cc];
595
596
    /* Handle repeated back references */
597
598
1.02k
    REPEAT_BACK_REFERENCE:
599
1.02k
    switch (*cc)
600
1.02k
      {
601
0
      case OP_CRSTAR:
602
0
      case OP_CRMINSTAR:
603
184
      case OP_CRQUERY:
604
192
      case OP_CRMINQUERY:
605
192
      case OP_CRPOSSTAR:
606
192
      case OP_CRPOSQUERY:
607
192
      min = 0;
608
192
      cc++;
609
192
      break;
610
611
3
      case OP_CRPLUS:
612
3
      case OP_CRMINPLUS:
613
3
      case OP_CRPOSPLUS:
614
3
      min = 1;
615
3
      cc++;
616
3
      break;
617
618
1
      case OP_CRRANGE:
619
1
      case OP_CRMINRANGE:
620
1
      case OP_CRPOSRANGE:
621
1
      min = GET2(cc, 1);
622
1
      cc += 1 + 2 * IMM2_SIZE;
623
1
      break;
624
625
832
      default:
626
832
      min = 1;
627
832
      break;
628
1.02k
      }
629
630
    /* Take care not to overflow: (1) min and d are ints, so check that their
631
    product is not greater than INT_MAX. (2) branchlength is limited to
632
    UINT16_MAX (checked at the top of the loop). */
633
634
1.02k
    if ((d > 0 && (INT_MAX/d) < min) || (int)UINT16_MAX - branchlength < min*d)
635
0
      branchlength = UINT16_MAX;
636
1.02k
    else branchlength += min * d;
637
1.02k
    break;
638
639
    /* Recursion always refers to the first occurrence of a subpattern with a
640
    given number. Therefore, we can always make use of caching, even when the
641
    pattern contains multiple subpatterns with the same number. */
642
643
11.2k
    case OP_RECURSE:
644
11.2k
    cs = ce = startcode + GET(cc, 1);
645
11.2k
    recno = GET2(cs, 1+LINK_SIZE);
646
11.2k
    if (recno == prev_recurse_recno)
647
10.5k
      {
648
10.5k
      branchlength += prev_recurse_d;
649
10.5k
      }
650
770
    else
651
770
      {
652
786
      do ce += GET(ce, 1); while (*ce == OP_ALT);
653
770
      if (cc > cs && cc < ce)    /* Simple recursion */
654
572
        had_recurse = TRUE;
655
198
      else
656
198
        {
657
198
        recurse_check *r = recurses;
658
224
        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
659
198
        if (r != NULL)          /* Mutual recursion */
660
6
          had_recurse = TRUE;
661
192
        else
662
192
          {
663
192
          this_recurse.prev = recurses;
664
192
          this_recurse.group = cs;
665
192
          prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
666
192
            countptr, backref_cache);
667
192
          if (prev_recurse_d < 0) return prev_recurse_d;
668
192
          prev_recurse_recno = recno;
669
192
          branchlength += prev_recurse_d;
670
192
          }
671
198
        }
672
770
      }
673
11.2k
    cc += 1 + LINK_SIZE + once_fudge;
674
11.2k
    once_fudge = 0;
675
11.2k
    break;
676
677
    /* Anything else does not or need not match a character. We can get the
678
    item's length from the table, but for those that can match zero occurrences
679
    of a character, we must take special action for UTF-8 characters. As it
680
    happens, the "NOT" versions of these opcodes are used at present only for
681
    ASCII characters, so they could be omitted from this list. However, in
682
    future that may change, so we include them here so as not to leave a
683
    gotcha for a future maintainer. */
684
685
62
    case OP_UPTO:
686
62
    case OP_UPTOI:
687
63
    case OP_NOTUPTO:
688
65
    case OP_NOTUPTOI:
689
66
    case OP_MINUPTO:
690
66
    case OP_MINUPTOI:
691
66
    case OP_NOTMINUPTO:
692
66
    case OP_NOTMINUPTOI:
693
214
    case OP_POSUPTO:
694
312
    case OP_POSUPTOI:
695
408
    case OP_NOTPOSUPTO:
696
525
    case OP_NOTPOSUPTOI:
697
698
703
    case OP_STAR:
699
708
    case OP_STARI:
700
834
    case OP_NOTSTAR:
701
887
    case OP_NOTSTARI:
702
887
    case OP_MINSTAR:
703
887
    case OP_MINSTARI:
704
887
    case OP_NOTMINSTAR:
705
888
    case OP_NOTMINSTARI:
706
1.14k
    case OP_POSSTAR:
707
1.18k
    case OP_POSSTARI:
708
1.18k
    case OP_NOTPOSSTAR:
709
1.18k
    case OP_NOTPOSSTARI:
710
711
1.43k
    case OP_QUERY:
712
1.43k
    case OP_QUERYI:
713
1.46k
    case OP_NOTQUERY:
714
1.49k
    case OP_NOTQUERYI:
715
1.50k
    case OP_MINQUERY:
716
1.50k
    case OP_MINQUERYI:
717
1.50k
    case OP_NOTMINQUERY:
718
1.50k
    case OP_NOTMINQUERYI:
719
1.77k
    case OP_POSQUERY:
720
1.84k
    case OP_POSQUERYI:
721
1.85k
    case OP_NOTPOSQUERY:
722
1.85k
    case OP_NOTPOSQUERYI:
723
724
1.85k
    cc += PRIV(OP_lengths)[op];
725
1.85k
#ifdef SUPPORT_UNICODE
726
1.85k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
727
1.85k
#endif
728
1.85k
    break;
729
730
    /* Skip these, but we need to add in the name length. */
731
732
241
    case OP_MARK:
733
241
    case OP_COMMIT_ARG:
734
305
    case OP_PRUNE_ARG:
735
390
    case OP_SKIP_ARG:
736
472
    case OP_THEN_ARG:
737
472
    cc += PRIV(OP_lengths)[op] + cc[1];
738
472
    break;
739
740
    /* The remaining opcodes are just skipped over. */
741
742
0
    case OP_CLOSE:
743
0
    case OP_COMMIT:
744
335
    case OP_FAIL:
745
335
    case OP_PRUNE:
746
337
    case OP_SET_SOM:
747
353
    case OP_SKIP:
748
375
    case OP_THEN:
749
375
    cc += PRIV(OP_lengths)[op];
750
375
    break;
751
752
    /* This should not occur: we list all opcodes explicitly so that when
753
    new ones get added they are properly considered. */
754
755
    /* LCOV_EXCL_START */
756
0
    default:
757
0
    PCRE2_DEBUG_UNREACHABLE();
758
0
    return -3;
759
    /* LCOV_EXCL_STOP */
760
178k
    }
761
178k
  }
762
763
/* LCOV_EXCL_START */
764
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
765
0
return -3;                 /* Avoid compiler warnings */
766
/* LCOV_EXCL_STOP */
767
15.9k
}
768
769
770
771
/*************************************************
772
*      Set a bit and maybe its alternate case    *
773
*************************************************/
774
775
/* Given a character, set its first code unit's bit in the table, and also the
776
corresponding bit for the other version of a letter if we are caseless.
777
778
Arguments:
779
  re            points to the regex block
780
  p             points to the first code unit of the character
781
  caseless      TRUE if caseless
782
  utf           TRUE for UTF mode
783
  ucp           TRUE for UCP mode
784
785
Returns:        pointer after the character
786
*/
787
788
static PCRE2_SPTR
789
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
790
  BOOL ucp)
791
25.1k
{
792
25.1k
uint32_t c = *p++;   /* First code unit */
793
794
25.1k
(void)utf;           /* Stop compiler warnings when UTF not supported */
795
25.1k
(void)ucp;
796
797
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
798
0xff. */
799
800
#if PCRE2_CODE_UNIT_WIDTH != 8
801
if (c > 0xff) SET_BIT(0xff); else
802
#endif
803
804
25.1k
SET_BIT(c);
805
806
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
807
the end of the character, even when caseless. */
808
809
25.1k
#ifdef SUPPORT_UNICODE
810
25.1k
if (utf)
811
0
  {
812
0
#if PCRE2_CODE_UNIT_WIDTH == 8
813
0
  if (c >= 0xc0) GETUTF8INC(c, p);
814
#elif PCRE2_CODE_UNIT_WIDTH == 16
815
  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
816
#endif
817
0
  }
818
25.1k
#endif  /* SUPPORT_UNICODE */
819
820
/* If caseless, handle the other case of the character. */
821
822
25.1k
if (caseless)
823
3.21k
  {
824
3.21k
#ifdef SUPPORT_UNICODE
825
3.21k
  if (utf || ucp)
826
0
    {
827
0
    c = UCD_OTHERCASE(c);
828
0
#if PCRE2_CODE_UNIT_WIDTH == 8
829
0
    if (utf)
830
0
      {
831
0
      PCRE2_UCHAR buff[6];
832
0
      (void)PRIV(ord2utf)(c, buff);
833
0
      SET_BIT(buff[0]);
834
0
      }
835
0
    else if (c < 256) SET_BIT(c);
836
#else  /* 16-bit or 32-bit mode */
837
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
838
#endif
839
0
    }
840
841
3.21k
  else
842
3.21k
#endif  /* SUPPORT_UNICODE */
843
844
  /* Not UTF or UCP */
845
846
3.21k
  if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
847
3.21k
  }
848
849
25.1k
return p;
850
25.1k
}
851
852
853
854
/*************************************************
855
*     Set bits for a positive character type     *
856
*************************************************/
857
858
/* This function sets starting bits for a character type. In UTF-8 mode, we can
859
only do a direct setting for bytes less than 128, as otherwise there can be
860
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
861
environment, the tables will only recognize ASCII characters anyway, but in at
862
least one Windows environment, some higher bytes bits were set in the tables.
863
So we deal with that case by considering the UTF-8 encoding.
864
865
Arguments:
866
  re             the regex block
867
  cbit type      the type of character wanted
868
  table_limit    32 for non-UTF-8; 16 for UTF-8
869
870
Returns:         nothing
871
*/
872
873
static void
874
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
875
2.61k
{
876
2.61k
uint32_t c;
877
86.1k
for (c = 0; c < table_limit; c++)
878
83.5k
  re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
879
2.61k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
880
2.61k
if (table_limit == 32) return;
881
0
for (c = 128; c < 256; c++)
882
0
  {
883
0
  if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
884
0
    {
885
0
    PCRE2_UCHAR buff[6];
886
0
    (void)PRIV(ord2utf)(c, buff);
887
0
    SET_BIT(buff[0]);
888
0
    }
889
0
  }
890
0
#endif  /* UTF-8 */
891
0
}
892
893
894
/*************************************************
895
*     Set bits for a negative character type     *
896
*************************************************/
897
898
/* This function sets starting bits for a negative character type such as \D.
899
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
900
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
901
Unlike in the positive case, where we can set appropriate starting bits for
902
specific high-valued UTF-8 characters, in this case we have to set the bits for
903
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
904
0xc0 (192) for simplicity.
905
906
Arguments:
907
  re             the regex block
908
  cbit type      the type of character wanted
909
  table_limit    32 for non-UTF-8; 16 for UTF-8
910
911
Returns:         nothing
912
*/
913
914
static void
915
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
916
1.71k
{
917
1.71k
uint32_t c;
918
56.6k
for (c = 0; c < table_limit; c++)
919
54.9k
  re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
920
1.71k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
921
1.71k
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
922
1.71k
#endif
923
1.71k
}
924
925
926
927
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
928
/*************************************************
929
*     Set starting bits for a character list.    *
930
*************************************************/
931
932
/* This function sets starting bits for a character list. It enumerates
933
all characters and character ranges in the character list, and sets
934
the starting bits accordingly.
935
936
Arguments:
937
  code           pointer to the code
938
  start_bitmap   pointer to the starting bitmap
939
940
Returns:         nothing
941
*/
942
static void
943
study_char_list(PCRE2_SPTR code, uint8_t *start_bitmap,
944
  const uint8_t *char_lists_end)
945
0
{
946
0
uint32_t type, list_ind;
947
0
uint32_t char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
948
0
uint32_t range_start = ~(uint32_t)0, range_end = 0;
949
0
const uint8_t *next_char;
950
0
PCRE2_UCHAR start_buffer[6], end_buffer[6];
951
0
PCRE2_UCHAR start, end;
952
953
/* Only needed in 8-bit mode at the moment. */
954
0
type = (uint32_t)(code[0] << 8) | code[1];
955
0
code += 2;
956
957
/* Align characters. */
958
0
next_char = char_lists_end - (GET(code, 0) << 1);
959
0
type &= XCL_TYPE_MASK;
960
0
list_ind = 0;
961
962
0
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
963
0
  range_start = XCL_CHAR_LIST_LOW_16_START;
964
965
0
while (type > 0)
966
0
  {
967
0
  uint32_t item_count = type & XCL_ITEM_COUNT_MASK;
968
969
0
  if (item_count == XCL_ITEM_COUNT_MASK)
970
0
    {
971
0
    if (list_ind <= 1)
972
0
      {
973
0
      item_count = *(const uint16_t*)next_char;
974
0
      next_char += 2;
975
0
      }
976
0
    else
977
0
      {
978
0
      item_count = *(const uint32_t*)next_char;
979
0
      next_char += 4;
980
0
      }
981
0
    }
982
983
0
  while (item_count > 0)
984
0
    {
985
0
    if (list_ind <= 1)
986
0
      {
987
0
      range_end = *(const uint16_t*)next_char;
988
0
      next_char += 2;
989
0
      }
990
0
    else
991
0
      {
992
0
      range_end = *(const uint32_t*)next_char;
993
0
      next_char += 4;
994
0
      }
995
996
0
    if ((range_end & XCL_CHAR_END) != 0)
997
0
      {
998
0
      range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);
999
1000
0
      PRIV(ord2utf)(range_end, end_buffer);
1001
0
      end = end_buffer[0];
1002
1003
0
      if (range_start < range_end)
1004
0
        {
1005
0
        PRIV(ord2utf)(range_start, start_buffer);
1006
0
        for (start = start_buffer[0]; start <= end; start++)
1007
0
          start_bitmap[start / 8] |= (1u << (start & 7));
1008
0
        }
1009
0
      else
1010
0
        start_bitmap[end / 8] |= (1u << (end & 7));
1011
1012
0
      range_start = ~(uint32_t)0;
1013
0
      }
1014
0
    else
1015
0
      range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);
1016
1017
0
    item_count--;
1018
0
    }
1019
1020
0
  list_ind++;
1021
0
  type >>= XCL_TYPE_BIT_LEN;
1022
1023
0
  if (range_start == ~(uint32_t)0)
1024
0
    {
1025
0
    if ((type & XCL_BEGIN_WITH_RANGE) != 0)
1026
0
      {
1027
      /* In 8 bit mode XCL_CHAR_LIST_HIGH_32_START is not possible. */
1028
0
      if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;
1029
0
      else range_start = XCL_CHAR_LIST_LOW_32_START;
1030
0
      }
1031
0
    }
1032
0
  else if ((type & XCL_BEGIN_WITH_RANGE) == 0)
1033
0
    {
1034
0
    PRIV(ord2utf)(range_start, start_buffer);
1035
1036
    /* In 8 bit mode XCL_CHAR_LIST_LOW_32_END and
1037
    XCL_CHAR_LIST_HIGH_32_END are not possible. */
1038
0
    if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;
1039
0
    else range_end = XCL_CHAR_LIST_HIGH_16_END;
1040
1041
0
    PRIV(ord2utf)(range_end, end_buffer);
1042
0
    end = end_buffer[0];
1043
1044
0
    for (start = start_buffer[0]; start <= end; start++)
1045
0
      start_bitmap[start / 8] |= (1u << (start & 7));
1046
1047
0
    range_start = ~(uint32_t)0;
1048
0
    }
1049
1050
  /* In 8 bit mode XCL_CHAR_LIST_HIGH_32_ADD is not possible. */
1051
0
  if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
1052
0
  else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
1053
0
  }
1054
0
}
1055
#endif
1056
1057
1058
1059
/*************************************************
1060
*      Create bitmap of starting code units      *
1061
*************************************************/
1062
1063
/* This function scans a compiled unanchored expression recursively and
1064
attempts to build a bitmap of the set of possible starting code units whose
1065
values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
1066
the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
1067
we pass a value of 16 rather than 32 as the final argument. (See comments in
1068
those functions for the reason.)
1069
1070
The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
1071
(a*)b where the group provides some optional starting code units but scanning
1072
must continue at the outer level to find at least one mandatory code unit. At
1073
the outermost level, this function fails unless the result is SSB_DONE.
1074
1075
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
1076
issues.
1077
1078
Arguments:
1079
  re           points to the compiled regex block
1080
  code         points to an expression
1081
  utf          TRUE if in UTF mode
1082
  ucp          TRUE if in UCP mode
1083
  depthptr     pointer to recurse depth
1084
1085
Returns:       SSB_FAIL     => Failed to find any starting code units
1086
               SSB_DONE     => Found mandatory starting code units
1087
               SSB_CONTINUE => Found optional starting code units
1088
               SSB_UNKNOWN  => Hit an unrecognized opcode
1089
               SSB_TOODEEP  => Recursion is too deep
1090
*/
1091
1092
static int
1093
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
1094
  int *depthptr)
1095
101k
{
1096
101k
uint32_t c;
1097
101k
int yield = SSB_DONE;
1098
101k
BOOL done;
1099
1100
101k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1101
101k
int table_limit = utf? 16:32;
1102
#else
1103
int table_limit = 32;
1104
#endif
1105
1106
101k
*depthptr += 1;
1107
101k
if (*depthptr > 1000) return SSB_TOODEEP;
1108
1109
101k
do
1110
130k
  {
1111
130k
  BOOL try_next = TRUE;
1112
130k
  PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
1113
1114
130k
  if (*code == OP_CBRA || *code == OP_SCBRA ||
1115
78.5k
      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
1116
1117
282k
  while (try_next)    /* Loop for items in this branch */
1118
247k
    {
1119
247k
    int rc;
1120
247k
    PCRE2_SPTR ncode;
1121
247k
    const uint8_t *classmap = NULL;
1122
247k
#ifdef SUPPORT_WIDE_CHARS
1123
247k
    PCRE2_UCHAR xclassflags;
1124
247k
#endif
1125
1126
247k
    switch(*tcode)
1127
247k
      {
1128
      /* If we reach something we don't understand, it means a new opcode has
1129
      been created that hasn't been added to this function. Hopefully this
1130
      problem will be discovered during testing. */
1131
1132
0
      default:
1133
0
      return SSB_UNKNOWN;
1134
1135
      /* Fail for a valid opcode that implies no starting bits. */
1136
1137
11
      case OP_ACCEPT:
1138
11
      case OP_ASSERT_ACCEPT:
1139
27
      case OP_ALLANY:
1140
405
      case OP_ANY:
1141
405
      case OP_ANYBYTE:
1142
407
      case OP_CIRCM:
1143
424
      case OP_CLOSE:
1144
424
      case OP_COMMIT:
1145
424
      case OP_COMMIT_ARG:
1146
676
      case OP_COND:
1147
677
      case OP_CREF:
1148
677
      case OP_FALSE:
1149
677
      case OP_TRUE:
1150
678
      case OP_DNCREF:
1151
733
      case OP_DNREF:
1152
740
      case OP_DNREFI:
1153
740
      case OP_DNRREF:
1154
811
      case OP_DOLL:
1155
815
      case OP_DOLLM:
1156
815
      case OP_END:
1157
827
      case OP_EOD:
1158
836
      case OP_EODN:
1159
837
      case OP_EXTUNI:
1160
848
      case OP_FAIL:
1161
880
      case OP_MARK:
1162
912
      case OP_NOT:
1163
922
      case OP_NOTEXACT:
1164
935
      case OP_NOTEXACTI:
1165
951
      case OP_NOTI:
1166
959
      case OP_NOTMINPLUS:
1167
967
      case OP_NOTMINPLUSI:
1168
967
      case OP_NOTMINQUERY:
1169
967
      case OP_NOTMINQUERYI:
1170
967
      case OP_NOTMINSTAR:
1171
967
      case OP_NOTMINSTARI:
1172
967
      case OP_NOTMINUPTO:
1173
967
      case OP_NOTMINUPTOI:
1174
979
      case OP_NOTPLUS:
1175
995
      case OP_NOTPLUSI:
1176
1.01k
      case OP_NOTPOSPLUS:
1177
1.01k
      case OP_NOTPOSPLUSI:
1178
1.02k
      case OP_NOTPOSQUERY:
1179
1.02k
      case OP_NOTPOSQUERYI:
1180
1.03k
      case OP_NOTPOSSTAR:
1181
1.03k
      case OP_NOTPOSSTARI:
1182
1.03k
      case OP_NOTPOSUPTO:
1183
1.03k
      case OP_NOTPOSUPTOI:
1184
1.03k
      case OP_NOTPROP:
1185
1.04k
      case OP_NOTQUERY:
1186
1.04k
      case OP_NOTQUERYI:
1187
1.05k
      case OP_NOTSTAR:
1188
1.05k
      case OP_NOTSTARI:
1189
1.05k
      case OP_NOTUPTO:
1190
1.05k
      case OP_NOTUPTOI:
1191
1.07k
      case OP_NOT_HSPACE:
1192
1.10k
      case OP_NOT_VSPACE:
1193
1.10k
      case OP_PRUNE:
1194
1.12k
      case OP_PRUNE_ARG:
1195
1.37k
      case OP_RECURSE:
1196
1.43k
      case OP_REF:
1197
1.43k
      case OP_REFI:
1198
1.43k
      case OP_REVERSE:
1199
1.43k
      case OP_VREVERSE:
1200
1.43k
      case OP_RREF:
1201
1.47k
      case OP_SCOND:
1202
1.47k
      case OP_SET_SOM:
1203
1.47k
      case OP_SKIP:
1204
1.49k
      case OP_SKIP_ARG:
1205
1.51k
      case OP_SOD:
1206
1.52k
      case OP_SOM:
1207
1.52k
      case OP_THEN:
1208
1.54k
      case OP_THEN_ARG:
1209
1.54k
      return SSB_FAIL;
1210
1211
      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1212
      uses OP_CIRCM). Skip over it. */
1213
1214
1.02k
      case OP_CIRC:
1215
1.02k
      tcode += PRIV(OP_lengths)[OP_CIRC];
1216
1.02k
      break;
1217
1218
      /* A "real" property test implies no starting bits, but the fake property
1219
      PT_CLIST identifies a list of characters. These lists are short, as they
1220
      are used for characters with more than one "other case", so there is no
1221
      point in recognizing them for OP_NOTPROP. */
1222
1223
0
      case OP_PROP:
1224
0
      if (tcode[1] != PT_CLIST) return SSB_FAIL;
1225
0
        {
1226
0
        const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1227
0
        while ((c = *p++) < NOTACHAR)
1228
0
          {
1229
0
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1230
0
          if (utf)
1231
0
            {
1232
0
            PCRE2_UCHAR buff[6];
1233
0
            (void)PRIV(ord2utf)(c, buff);
1234
0
            c = buff[0];
1235
0
            }
1236
0
#endif
1237
0
          if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1238
0
          }
1239
0
        }
1240
0
      try_next = FALSE;
1241
0
      break;
1242
1243
      /* We can ignore word boundary tests. */
1244
1245
282
      case OP_WORD_BOUNDARY:
1246
568
      case OP_NOT_WORD_BOUNDARY:
1247
568
      case OP_UCP_WORD_BOUNDARY:
1248
568
      case OP_NOT_UCP_WORD_BOUNDARY:
1249
568
      tcode++;
1250
568
      break;
1251
1252
      /* For a positive lookahead assertion, inspect what immediately follows,
1253
      ignoring intermediate assertions and callouts. If the next item is one
1254
      that sets a mandatory character, skip this assertion. Otherwise, treat it
1255
      the same as other bracket groups. */
1256
1257
3.97k
      case OP_ASSERT:
1258
3.97k
      case OP_ASSERT_NA:
1259
3.97k
      ncode = tcode + GET(tcode, 1);
1260
6.54k
      while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1261
3.97k
      ncode += 1 + LINK_SIZE;
1262
1263
      /* Skip irrelevant items */
1264
1265
13.3M
      for (done = FALSE; !done;)
1266
13.2M
        {
1267
13.2M
        switch (*ncode)
1268
13.2M
          {
1269
13.2M
          case OP_ASSERT:
1270
13.2M
          case OP_ASSERT_NOT:
1271
13.2M
          case OP_ASSERTBACK:
1272
13.2M
          case OP_ASSERTBACK_NOT:
1273
13.2M
          case OP_ASSERT_NA:
1274
13.2M
          case OP_ASSERTBACK_NA:
1275
13.2M
          case OP_ASSERT_SCS:
1276
13.2M
          ncode += GET(ncode, 1);
1277
13.2M
          while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1278
13.2M
          ncode += 1 + LINK_SIZE;
1279
13.2M
          break;
1280
1281
32
          case OP_WORD_BOUNDARY:
1282
32
          case OP_NOT_WORD_BOUNDARY:
1283
32
          case OP_UCP_WORD_BOUNDARY:
1284
32
          case OP_NOT_UCP_WORD_BOUNDARY:
1285
32
          ncode++;
1286
32
          break;
1287
1288
4
          case OP_CALLOUT:
1289
4
          ncode += PRIV(OP_lengths)[OP_CALLOUT];
1290
4
          break;
1291
1292
0
          case OP_CALLOUT_STR:
1293
0
          ncode += GET(ncode, 1 + 2*LINK_SIZE);
1294
0
          break;
1295
1296
3.97k
          default:
1297
3.97k
          done = TRUE;
1298
3.97k
          break;
1299
13.2M
          }
1300
13.2M
        }
1301
1302
      /* Now check the next significant item. */
1303
1304
3.97k
      switch(*ncode)
1305
3.97k
        {
1306
3.87k
        default:
1307
3.87k
        break;
1308
1309
3.87k
        case OP_PROP:
1310
0
        if (ncode[1] != PT_CLIST) break;
1311
0
        PCRE2_FALLTHROUGH /* Fall through */
1312
0
        case OP_ANYNL:
1313
8
        case OP_CHAR:
1314
104
        case OP_CHARI:
1315
104
        case OP_EXACT:
1316
104
        case OP_EXACTI:
1317
104
        case OP_HSPACE:
1318
104
        case OP_MINPLUS:
1319
104
        case OP_MINPLUSI:
1320
104
        case OP_PLUS:
1321
104
        case OP_PLUSI:
1322
104
        case OP_POSPLUS:
1323
104
        case OP_POSPLUSI:
1324
104
        case OP_VSPACE:
1325
        /* Note that these types will only be present in non-UCP mode. */
1326
104
        case OP_DIGIT:
1327
104
        case OP_NOT_DIGIT:
1328
104
        case OP_WORDCHAR:
1329
104
        case OP_NOT_WORDCHAR:
1330
104
        case OP_WHITESPACE:
1331
104
        case OP_NOT_WHITESPACE:
1332
104
        tcode = ncode;
1333
104
        continue;   /* With the following significant opcode */
1334
3.97k
        }
1335
3.87k
      PCRE2_FALLTHROUGH /* Fall through */
1336
3.87k
1337
3.87k
      /* For a group bracket or a positive assertion without an immediately
1338
3.87k
      following mandatory setting, recurse to set bits from within the
1339
3.87k
      subpattern. If it can't find anything, we have to give up. If it finds
1340
3.87k
      some mandatory character(s), we are done for this branch. Otherwise,
1341
3.87k
      carry on scanning after the subpattern. */
1342
3.87k
1343
7.88k
      case OP_BRA:
1344
8.18k
      case OP_SBRA:
1345
80.7k
      case OP_CBRA:
1346
82.0k
      case OP_SCBRA:
1347
82.2k
      case OP_BRAPOS:
1348
82.6k
      case OP_SBRAPOS:
1349
83.0k
      case OP_CBRAPOS:
1350
84.1k
      case OP_SCBRAPOS:
1351
86.9k
      case OP_ONCE:
1352
86.9k
      case OP_SCRIPT_RUN:
1353
86.9k
      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1354
86.9k
      if (rc == SSB_DONE)
1355
2.75k
        {
1356
2.75k
        try_next = FALSE;
1357
2.75k
        }
1358
84.1k
      else if (rc == SSB_CONTINUE)
1359
82.6k
        {
1360
107k
        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1361
82.6k
        tcode += 1 + LINK_SIZE;
1362
82.6k
        }
1363
1.57k
      else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1364
85.3k
      break;
1365
1366
      /* If we hit ALT or KET, it means we haven't found anything mandatory in
1367
      this branch, though we might have found something optional. For ALT, we
1368
      continue with the next alternative, but we have to arrange that the final
1369
      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1370
      return SSB_CONTINUE: if this is the top level, that indicates failure,
1371
      but after a nested subpattern, it causes scanning to continue. */
1372
1373
85.3k
      case OP_ALT:
1374
15.3k
      yield = SSB_CONTINUE;
1375
15.3k
      try_next = FALSE;
1376
15.3k
      break;
1377
1378
83.8k
      case OP_KET:
1379
85.8k
      case OP_KETRMAX:
1380
86.8k
      case OP_KETRMIN:
1381
88.9k
      case OP_KETRPOS:
1382
88.9k
      return SSB_CONTINUE;
1383
1384
      /* Skip over callout */
1385
1386
595
      case OP_CALLOUT:
1387
595
      tcode += PRIV(OP_lengths)[OP_CALLOUT];
1388
595
      break;
1389
1390
0
      case OP_CALLOUT_STR:
1391
0
      tcode += GET(tcode, 1 + 2*LINK_SIZE);
1392
0
      break;
1393
1394
      /* Skip over lookbehind, negative lookahead, and scan substring
1395
      assertions */
1396
1397
1.33k
      case OP_ASSERT_NOT:
1398
2.13k
      case OP_ASSERTBACK:
1399
4.01k
      case OP_ASSERTBACK_NOT:
1400
4.01k
      case OP_ASSERTBACK_NA:
1401
4.01k
      case OP_ASSERT_SCS:
1402
5.46k
      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1403
4.01k
      tcode += 1 + LINK_SIZE;
1404
4.01k
      break;
1405
1406
      /* BRAZERO does the bracket, but carries on. */
1407
1408
8.53k
      case OP_BRAZERO:
1409
9.16k
      case OP_BRAMINZERO:
1410
9.78k
      case OP_BRAPOSZERO:
1411
9.78k
      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1412
9.78k
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1413
6.05k
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1414
5.71k
      tcode += 1 + LINK_SIZE;
1415
5.71k
      break;
1416
1417
      /* SKIPZERO skips the bracket. */
1418
1419
348
      case OP_SKIPZERO:
1420
348
      tcode++;
1421
348
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1422
348
      tcode += 1 + LINK_SIZE;
1423
348
      break;
1424
1425
      /* Single-char * or ? sets the bit and tries the next item */
1426
1427
6.53k
      case OP_STAR:
1428
6.53k
      case OP_MINSTAR:
1429
6.81k
      case OP_POSSTAR:
1430
14.7k
      case OP_QUERY:
1431
14.7k
      case OP_MINQUERY:
1432
15.2k
      case OP_POSQUERY:
1433
15.2k
      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1434
15.2k
      break;
1435
1436
1
      case OP_STARI:
1437
1
      case OP_MINSTARI:
1438
7
      case OP_POSSTARI:
1439
259
      case OP_QUERYI:
1440
260
      case OP_MINQUERYI:
1441
278
      case OP_POSQUERYI:
1442
278
      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1443
278
      break;
1444
1445
      /* Single-char upto sets the bit and tries the next */
1446
1447
0
      case OP_UPTO:
1448
0
      case OP_MINUPTO:
1449
23
      case OP_POSUPTO:
1450
23
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1451
23
      break;
1452
1453
0
      case OP_UPTOI:
1454
0
      case OP_MINUPTOI:
1455
0
      case OP_POSUPTOI:
1456
0
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1457
0
      break;
1458
1459
      /* At least one single char sets the bit and stops */
1460
1461
877
      case OP_EXACT:
1462
877
      tcode += IMM2_SIZE;
1463
877
      PCRE2_FALLTHROUGH /* Fall through */
1464
3.99k
      case OP_CHAR:
1465
5.53k
      case OP_PLUS:
1466
6.33k
      case OP_MINPLUS:
1467
6.65k
      case OP_POSPLUS:
1468
6.65k
      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1469
6.65k
      try_next = FALSE;
1470
6.65k
      break;
1471
1472
735
      case OP_EXACTI:
1473
735
      tcode += IMM2_SIZE;
1474
735
      PCRE2_FALLTHROUGH /* Fall through */
1475
1.49k
      case OP_CHARI:
1476
1.96k
      case OP_PLUSI:
1477
2.48k
      case OP_MINPLUSI:
1478
2.94k
      case OP_POSPLUSI:
1479
2.94k
      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1480
2.94k
      try_next = FALSE;
1481
2.94k
      break;
1482
1483
      /* Special spacing and line-terminating items. These recognize specific
1484
      lists of characters. The difference between VSPACE and ANYNL is that the
1485
      latter can match the two-character CRLF sequence, but that is not
1486
      relevant for finding the first character, so their code here is
1487
      identical. */
1488
1489
304
      case OP_HSPACE:
1490
304
      SET_BIT(CHAR_HT);
1491
304
      SET_BIT(CHAR_SPACE);
1492
1493
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1494
      the bits for NBSP and for code units >= 255, independently of UTF. */
1495
1496
#if PCRE2_CODE_UNIT_WIDTH != 8
1497
      SET_BIT(CHAR_NBSP);
1498
      SET_BIT(0xFF);
1499
#else
1500
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1501
      units of horizontal space characters. */
1502
1503
304
#ifdef SUPPORT_UNICODE
1504
304
      if (utf)
1505
0
        {
1506
0
        SET_BIT(0xC2);  /* For U+00A0 */
1507
0
        SET_BIT(0xE1);  /* For U+1680, U+180E */
1508
0
        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1509
0
        SET_BIT(0xE3);  /* For U+3000 */
1510
0
        }
1511
304
      else
1512
304
#endif
1513
      /* For the 8-bit library not in UTF-8 mode, set the bit for NBSP. */
1514
304
        {
1515
304
        SET_BIT(CHAR_NBSP);
1516
304
        }
1517
304
#endif  /* 8-bit support */
1518
1519
304
      try_next = FALSE;
1520
304
      break;
1521
1522
289
      case OP_ANYNL:
1523
624
      case OP_VSPACE:
1524
624
      SET_BIT(CHAR_LF);
1525
624
      SET_BIT(CHAR_VT);
1526
624
      SET_BIT(CHAR_FF);
1527
624
      SET_BIT(CHAR_CR);
1528
1529
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1530
      the bits for NEL and for code units >= 255, independently of UTF. */
1531
1532
#if PCRE2_CODE_UNIT_WIDTH != 8
1533
      SET_BIT(CHAR_NEL);
1534
      SET_BIT(0xFF);
1535
#else
1536
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1537
      units of vertical space characters. */
1538
1539
624
#ifdef SUPPORT_UNICODE
1540
624
      if (utf)
1541
0
        {
1542
0
        SET_BIT(0xC2);  /* For U+0085 (NEL) */
1543
0
        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1544
0
        }
1545
624
      else
1546
624
#endif
1547
      /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1548
624
        {
1549
624
        SET_BIT(CHAR_NEL);
1550
624
        }
1551
624
#endif  /* 8-bit support */
1552
1553
624
      try_next = FALSE;
1554
624
      break;
1555
1556
      /* Single character types set the bits and stop. Note that if PCRE2_UCP
1557
      is set, we do not see these opcodes because \d etc are converted to
1558
      properties. Therefore, these apply in the case when only characters less
1559
      than 256 are recognized to match the types. */
1560
1561
320
      case OP_NOT_DIGIT:
1562
320
      set_nottype_bits(re, cbit_digit, table_limit);
1563
320
      try_next = FALSE;
1564
320
      break;
1565
1566
541
      case OP_DIGIT:
1567
541
      set_type_bits(re, cbit_digit, table_limit);
1568
541
      try_next = FALSE;
1569
541
      break;
1570
1571
560
      case OP_NOT_WHITESPACE:
1572
560
      set_nottype_bits(re, cbit_space, table_limit);
1573
560
      try_next = FALSE;
1574
560
      break;
1575
1576
235
      case OP_WHITESPACE:
1577
235
      set_type_bits(re, cbit_space, table_limit);
1578
235
      try_next = FALSE;
1579
235
      break;
1580
1581
350
      case OP_NOT_WORDCHAR:
1582
350
      set_nottype_bits(re, cbit_word, table_limit);
1583
350
      try_next = FALSE;
1584
350
      break;
1585
1586
272
      case OP_WORDCHAR:
1587
272
      set_type_bits(re, cbit_word, table_limit);
1588
272
      try_next = FALSE;
1589
272
      break;
1590
1591
      /* One or more character type fudges the pointer and restarts, knowing
1592
      it will hit a single character type and stop there. */
1593
1594
224
      case OP_TYPEPLUS:
1595
248
      case OP_TYPEMINPLUS:
1596
295
      case OP_TYPEPOSPLUS:
1597
295
      tcode++;
1598
295
      break;
1599
1600
52
      case OP_TYPEEXACT:
1601
52
      tcode += 1 + IMM2_SIZE;
1602
52
      break;
1603
1604
      /* Zero or more repeats of character types set the bits and then
1605
      try again. */
1606
1607
18
      case OP_TYPEUPTO:
1608
29
      case OP_TYPEMINUPTO:
1609
52
      case OP_TYPEPOSUPTO:
1610
52
      tcode += IMM2_SIZE;
1611
52
      PCRE2_FALLTHROUGH /* Fall through */
1612
1613
1.60k
      case OP_TYPESTAR:
1614
1.61k
      case OP_TYPEMINSTAR:
1615
1.69k
      case OP_TYPEPOSSTAR:
1616
2.82k
      case OP_TYPEQUERY:
1617
2.84k
      case OP_TYPEMINQUERY:
1618
2.95k
      case OP_TYPEPOSQUERY:
1619
2.95k
      switch(tcode[1])
1620
2.95k
        {
1621
50
        default:
1622
219
        case OP_ANY:
1623
254
        case OP_ALLANY:
1624
254
        return SSB_FAIL;
1625
1626
207
        case OP_HSPACE:
1627
207
        SET_BIT(CHAR_HT);
1628
207
        SET_BIT(CHAR_SPACE);
1629
1630
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1631
        the bits for NBSP and for code units >= 255, independently of UTF. */
1632
1633
#if PCRE2_CODE_UNIT_WIDTH != 8
1634
        SET_BIT(CHAR_NBSP);
1635
        SET_BIT(0xFF);
1636
#else
1637
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1638
        units of horizontal space characters. */
1639
1640
207
#ifdef SUPPORT_UNICODE
1641
207
        if (utf)
1642
0
          {
1643
0
          SET_BIT(0xC2);  /* For U+00A0 */
1644
0
          SET_BIT(0xE1);  /* For U+1680, U+180E */
1645
0
          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1646
0
          SET_BIT(0xE3);  /* For U+3000 */
1647
0
          }
1648
207
        else
1649
207
#endif
1650
        /* For the 8-bit library not in UTF-8 mode, set the bit for NBSP. */
1651
207
          {
1652
207
          SET_BIT(CHAR_NBSP);
1653
207
          }
1654
207
#endif  /* 8-bit support */
1655
207
        break;
1656
1657
205
        case OP_ANYNL:
1658
444
        case OP_VSPACE:
1659
444
        SET_BIT(CHAR_LF);
1660
444
        SET_BIT(CHAR_VT);
1661
444
        SET_BIT(CHAR_FF);
1662
444
        SET_BIT(CHAR_CR);
1663
1664
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1665
        the bits for NEL and for code units >= 255, independently of UTF. */
1666
1667
#if PCRE2_CODE_UNIT_WIDTH != 8
1668
        SET_BIT(CHAR_NEL);
1669
        SET_BIT(0xFF);
1670
#else
1671
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1672
        units of vertical space characters. */
1673
1674
444
#ifdef SUPPORT_UNICODE
1675
444
        if (utf)
1676
0
          {
1677
0
          SET_BIT(0xC2);  /* For U+0085 (NEL) */
1678
0
          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1679
0
          }
1680
444
        else
1681
444
#endif
1682
        /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1683
444
          {
1684
444
          SET_BIT(CHAR_NEL);
1685
444
          }
1686
444
#endif  /* 8-bit support */
1687
444
        break;
1688
1689
141
        case OP_NOT_DIGIT:
1690
141
        set_nottype_bits(re, cbit_digit, table_limit);
1691
141
        break;
1692
1693
286
        case OP_DIGIT:
1694
286
        set_type_bits(re, cbit_digit, table_limit);
1695
286
        break;
1696
1697
161
        case OP_NOT_WHITESPACE:
1698
161
        set_nottype_bits(re, cbit_space, table_limit);
1699
161
        break;
1700
1701
151
        case OP_WHITESPACE:
1702
151
        set_type_bits(re, cbit_space, table_limit);
1703
151
        break;
1704
1705
186
        case OP_NOT_WORDCHAR:
1706
186
        set_nottype_bits(re, cbit_word, table_limit);
1707
186
        break;
1708
1709
1.12k
        case OP_WORDCHAR:
1710
1.12k
        set_type_bits(re, cbit_word, table_limit);
1711
1.12k
        break;
1712
2.95k
        }
1713
1714
2.70k
      tcode += 2;
1715
2.70k
      break;
1716
1717
      /* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. */
1718
1719
0
#ifdef SUPPORT_WIDE_CHARS
1720
0
      case OP_ECLASS:
1721
0
      return SSB_FAIL;
1722
0
#endif
1723
1724
      /* Extended class: if there are any property checks, or if this is a
1725
      negative XCLASS without a map, give up. If there are no property checks,
1726
      there must be wide characters on the XCLASS list, because otherwise an
1727
      XCLASS would not have been created. This means that code points >= 255
1728
      are potential starters. In the UTF-8 case we can scan them and set bits
1729
      for the relevant leading bytes. */
1730
1731
0
#ifdef SUPPORT_WIDE_CHARS
1732
0
      case OP_XCLASS:
1733
0
      xclassflags = tcode[1 + LINK_SIZE];
1734
0
      if ((xclassflags & XCL_HASPROP) != 0 ||
1735
0
          (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1736
0
        return SSB_FAIL;
1737
1738
      /* We have a positive XCLASS or a negative one without a map. Set up the
1739
      map pointer if there is one, and fall through. */
1740
1741
0
      classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1742
0
        (const uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1743
1744
      /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1745
      then jump to handle the map. */
1746
1747
0
#if PCRE2_CODE_UNIT_WIDTH == 8
1748
0
      if (utf && (xclassflags & XCL_NOT) == 0)
1749
0
        {
1750
0
        PCRE2_UCHAR b, e;
1751
0
        PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1752
0
        tcode += GET(tcode, 1);
1753
1754
0
        if (*p >= XCL_LIST)
1755
0
          {
1756
0
          study_char_list(p, re->start_bitmap,
1757
0
            ((const uint8_t *)re + re->code_start));
1758
0
          goto HANDLE_CLASSMAP;
1759
0
          }
1760
1761
0
        for (;;) switch (*p++)
1762
0
          {
1763
0
          case XCL_SINGLE:
1764
0
          b = *p++;
1765
0
          while ((*p & 0xc0) == 0x80) p++;
1766
0
          re->start_bitmap[b/8] |= (1u << (b&7));
1767
0
          break;
1768
1769
0
          case XCL_RANGE:
1770
0
          b = *p++;
1771
0
          while ((*p & 0xc0) == 0x80) p++;
1772
0
          e = *p++;
1773
0
          while ((*p & 0xc0) == 0x80) p++;
1774
0
          for (; b <= e; b++)
1775
0
            re->start_bitmap[b/8] |= (1u << (b&7));
1776
0
          break;
1777
1778
0
          case XCL_END:
1779
0
          goto HANDLE_CLASSMAP;
1780
1781
          /* LCOV_EXCL_START */
1782
0
          default:
1783
0
          PCRE2_DEBUG_UNREACHABLE();
1784
0
          return SSB_UNKNOWN;   /* Internal error, should not occur */
1785
          /* LCOV_EXCL_STOP */
1786
0
          }
1787
0
        }
1788
0
#endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1789
0
#endif  /* SUPPORT_WIDE_CHARS */
1790
1791
      /* It seems that the fall through comment must be outside the #ifdef if
1792
      it is to avoid the gcc compiler warning. */
1793
1794
0
      PCRE2_FALLTHROUGH /* Fall through */
1795
0
1796
0
      /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1797
0
      in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1798
0
      because it starts a character with a value > 255. In 8-bit non-UTF mode,
1799
0
      there is no difference between CLASS and NCLASS. In all other wide
1800
0
      character modes, set the 0xFF bit to indicate code units >= 255. */
1801
0
1802
750
      case OP_NCLASS:
1803
750
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1804
750
      if (utf)
1805
0
        {
1806
0
        re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1807
0
        memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1808
0
        }
1809
750
      PCRE2_FALLTHROUGH /* Fall through */
1810
#elif PCRE2_CODE_UNIT_WIDTH != 8
1811
      SET_BIT(0xFF);                             /* For characters >= 255 */
1812
      PCRE2_FALLTHROUGH /* Fall through */
1813
#endif
1814
1815
      /* Enter here for a positive non-XCLASS. If we have fallen through from
1816
      an XCLASS, classmap will already be set; just advance the code pointer.
1817
      Otherwise, set up classmap for a non-XCLASS and advance past it. */
1818
1819
7.05k
      case OP_CLASS:
1820
7.05k
      if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1821
7.05k
        {
1822
7.05k
        classmap = (const uint8_t *)(++tcode);
1823
7.05k
        tcode += 32 / sizeof(PCRE2_UCHAR);
1824
7.05k
        }
1825
1826
      /* When wide characters are supported, classmap may be NULL. In UTF-8
1827
      (sic) mode, the bits in a class bit map correspond to character values,
1828
      not to byte values. However, the bit map we are constructing is for byte
1829
      values. So we have to do a conversion for characters whose code point is
1830
      greater than 127. In fact, there are only two possible starting bytes for
1831
      characters in the range 128 - 255. */
1832
1833
7.05k
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1834
7.05k
      HANDLE_CLASSMAP:
1835
7.05k
#endif
1836
7.05k
      if (classmap != NULL)
1837
7.05k
        {
1838
7.05k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1839
7.05k
        if (utf)
1840
0
          {
1841
0
          for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1842
0
          for (c = 128; c < 256; c++)
1843
0
            {
1844
0
            if ((classmap[c/8] & (1u << (c&7))) != 0)
1845
0
              {
1846
0
              int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1847
0
              re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1848
0
              c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1849
0
              }
1850
0
            }
1851
0
          }
1852
7.05k
        else
1853
7.05k
#endif
1854
        /* In all modes except UTF-8, the two bit maps are compatible. */
1855
1856
7.05k
          {
1857
232k
          for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1858
7.05k
          }
1859
7.05k
        }
1860
1861
      /* Act on what follows the class. For a zero minimum repeat, continue;
1862
      otherwise stop processing. */
1863
1864
7.05k
      switch (*tcode)
1865
7.05k
        {
1866
393
        case OP_CRSTAR:
1867
725
        case OP_CRMINSTAR:
1868
1.46k
        case OP_CRQUERY:
1869
2.04k
        case OP_CRMINQUERY:
1870
2.58k
        case OP_CRPOSSTAR:
1871
3.24k
        case OP_CRPOSQUERY:
1872
3.24k
        tcode++;
1873
3.24k
        break;
1874
1875
596
        case OP_CRRANGE:
1876
786
        case OP_CRMINRANGE:
1877
876
        case OP_CRPOSRANGE:
1878
876
        if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1879
356
          else try_next = FALSE;
1880
876
        break;
1881
1882
2.94k
        default:
1883
2.94k
        try_next = FALSE;
1884
2.94k
        break;
1885
7.05k
        }
1886
7.05k
      break; /* End of class handling case */
1887
247k
      }      /* End of switch for opcodes */
1888
247k
    }        /* End of try_next loop */
1889
1890
34.2k
  code += GET(code, 1);   /* Advance to next branch */
1891
34.2k
  }
1892
101k
while (*code == OP_ALT);
1893
1894
5.13k
return yield;
1895
101k
}
1896
1897
1898
1899
/*************************************************
1900
*          Study a compiled expression           *
1901
*************************************************/
1902
1903
/* This function is handed a compiled expression that it must study to produce
1904
information that will speed up the matching.
1905
1906
Argument:
1907
  re       points to the compiled expression
1908
1909
Returns:   0 normally; non-zero should never normally occur
1910
           1 unknown opcode in set_start_bits
1911
           2 missing capturing bracket
1912
           3 unknown opcode in find_minlength
1913
*/
1914
1915
int
1916
PRIV(study)(pcre2_real_code *re)
1917
6.22k
{
1918
6.22k
int count = 0;
1919
6.22k
PCRE2_UCHAR *code;
1920
6.22k
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1921
6.22k
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1922
1923
/* Find start of compiled code */
1924
1925
6.22k
code = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
1926
1927
/* For a pattern that has a first code unit, or a multiline pattern that
1928
matches only at "line start", there is no point in seeking a list of starting
1929
code units. */
1930
1931
6.22k
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1932
4.84k
  {
1933
4.84k
  int depth = 0;
1934
4.84k
  int rc = set_start_bits(re, code, utf, ucp, &depth);
1935
  /* LCOV_EXCL_START */
1936
4.84k
  if (rc == SSB_UNKNOWN)
1937
0
    {
1938
0
    PCRE2_DEBUG_UNREACHABLE();
1939
0
    return 1;
1940
0
    }
1941
  /* LCOV_EXCL_STOP */
1942
1943
  /* If a list of starting code units was set up, scan the list to see if only
1944
  one or two were listed. Having only one listed is rare because usually a
1945
  single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1946
  If two are listed, see if they are caseless versions of the same character;
1947
  if so we can replace the list with a caseless first code unit. This gives
1948
  better performance and is plausibly worth doing for patterns such as [Ww]ord
1949
  or (word|WORD). */
1950
1951
4.84k
  if (rc == SSB_DONE)
1952
1.01k
    {
1953
1.01k
    int i;
1954
1.01k
    int a = -1;
1955
1.01k
    int b = -1;
1956
1.01k
    uint8_t *p = re->start_bitmap;
1957
1.01k
    uint32_t flags = PCRE2_FIRSTMAPSET;
1958
1959
11.2k
    for (i = 0; i < 256; p++, i += 8)
1960
11.1k
      {
1961
11.1k
      uint8_t x = *p;
1962
11.1k
      if (x != 0)
1963
1.44k
        {
1964
1.44k
        int c;
1965
1.44k
        uint8_t y = x & (~x + 1);   /* Least significant bit */
1966
1.44k
        if (y != x) goto DONE;      /* More than one bit set */
1967
1968
        /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1969
        all wide characters", so we cannot use it here. */
1970
1971
#if PCRE2_CODE_UNIT_WIDTH != 8
1972
        if (i == 248 && x == 0x80) goto DONE;
1973
#endif
1974
1975
        /* Compute the character value */
1976
1977
913
        c = i;
1978
913
        switch (x)
1979
913
          {
1980
129
          case 1:   break;
1981
114
          case 2:   c += 1; break;  case 4:  c += 2; break;
1982
107
          case 8:   c += 3; break;  case 16: c += 4; break;
1983
144
          case 32:  c += 5; break;  case 64: c += 6; break;
1984
141
          case 128: c += 7; break;
1985
913
          }
1986
1987
        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1988
        mode, only values < 128 can be used. In all the other cases, c is a
1989
        character value. */
1990
1991
913
#if PCRE2_CODE_UNIT_WIDTH == 8
1992
913
        if (utf && c > 127) goto DONE;
1993
913
#endif
1994
913
        if (a < 0) a = c;   /* First one found, save in a */
1995
382
        else if (b < 0)     /* Second one found */
1996
378
          {
1997
378
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1998
1999
378
#ifdef SUPPORT_UNICODE
2000
378
          if (utf || ucp)
2001
0
            {
2002
0
            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
2003
0
            if (c > 127) d = UCD_OTHERCASE(c);
2004
0
            }
2005
378
#endif  /* SUPPORT_UNICODE */
2006
2007
378
          if (d != a) goto DONE;   /* Not the other case of a */
2008
4
          b = c;                   /* Save second in b */
2009
2010
#ifdef EBCDIC
2011
          /* To match ASCII (which puts the uppercase one in a), swap a & b
2012
          if needed. This doesn't really matter, but neatens the tests. */
2013
          if (TABLE_GET((unsigned int)a, re->tables + lcc_offset, a) == a)
2014
            {
2015
            b = a;
2016
            a = c;
2017
            }
2018
#endif
2019
4
          }
2020
4
        else goto DONE;   /* More than two characters found */
2021
913
        }
2022
11.1k
      }
2023
2024
    /* Replace the start code unit bits with a first code unit. If it is the
2025
    same as a required later code unit, then clear the required later code
2026
    unit. This is because a search for a required code unit starts after an
2027
    explicit first code unit, but at a code unit found from the bitmap.
2028
    Patterns such as /a*a/ don't work if both the start unit and required
2029
    unit are the same. */
2030
2031
103
    if (a >= 0) {
2032
103
      if ((re->flags & PCRE2_LASTSET) && (re->last_codeunit == (uint32_t)a || (b >= 0 && re->last_codeunit == (uint32_t)b))) {
2033
14
        re->flags &= ~(PCRE2_LASTSET | PCRE2_LASTCASELESS);
2034
14
        re->last_codeunit = 0;
2035
14
      }
2036
103
      re->first_codeunit = a;
2037
103
      flags = PCRE2_FIRSTSET;
2038
103
      if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
2039
103
    }
2040
2041
1.01k
    DONE:
2042
1.01k
    re->flags |= flags;
2043
1.01k
    }
2044
4.84k
  }
2045
2046
/* Find the minimum length of subject string. If the pattern can match an empty
2047
string, the minimum length is already known. If the pattern contains (*ACCEPT)
2048
all bets are off, and we don't even try to find a minimum length. If there are
2049
more back references than the size of the vector we are going to cache them in,
2050
do nothing. A pattern that complicated will probably take a long time to
2051
analyze and may in any case turn out to be too complicated. Note that back
2052
reference minima are held as 16-bit numbers. */
2053
2054
6.22k
if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
2055
2.88k
     re->top_backref <= MAX_CACHE_BACKREF)
2056
2.88k
  {
2057
2.88k
  int min;
2058
2.88k
  int backref_cache[MAX_CACHE_BACKREF+1];
2059
2.88k
  backref_cache[0] = 0;    /* Highest one that is set */
2060
2.88k
  min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
2061
2.88k
  switch(min)
2062
2.88k
    {
2063
2
    case -1:  /* \C in UTF mode or over-complex regex */
2064
2
    break;    /* Leave minlength unchanged (will be zero) */
2065
2066
    /* LCOV_EXCL_START */
2067
0
    case -2:
2068
0
    PCRE2_DEBUG_UNREACHABLE();
2069
0
    return 2; /* missing capturing bracket */
2070
    /* LCOV_EXCL_STOP */
2071
2072
    /* LCOV_EXCL_START */
2073
0
    case -3:
2074
0
    PCRE2_DEBUG_UNREACHABLE();
2075
0
    return 3; /* unrecognized opcode */
2076
    /* LCOV_EXCL_STOP */
2077
2078
2.88k
    default:
2079
2.88k
    re->minlength = (min > (int)UINT16_MAX)? (int)UINT16_MAX : min;
2080
2.88k
    break;
2081
2.88k
    }
2082
2.88k
  }
2083
2084
6.22k
return 0;
2085
6.22k
}
2086
2087
/* End of pcre2_study.c */