Coverage Report

Created: 2025-06-13 06:43

/src/php-src/ext/pcre/pcre2lib/pcre2_study.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains functions for scanning a compiled pattern and
42
collecting data (e.g. minimum matching length). */
43
44
45
#ifdef HAVE_CONFIG_H
46
#include "config.h"
47
#endif
48
49
#include "pcre2_internal.h"
50
51
/* The maximum remembered capturing brackets minimum. */
52
53
1.10k
#define MAX_CACHE_BACKREF 128
54
55
/* Set a bit in the starting code unit bit map. */
56
57
17.3k
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58
59
/* Returns from set_start_bits() */
60
61
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62
63
64
/*************************************************
65
*   Find the minimum subject length for a group  *
66
*************************************************/
67
68
/* Scan a parenthesized group and compute the minimum length of subject that
69
is needed to match it. This is a lower bound; it does not mean there is a
70
string of that length that matches. In UTF mode, the result is in characters
71
rather than code units. The field in a compiled pattern for storing the minimum
72
length is 16-bits long (on the grounds that anything longer than that is
73
pathological), so we give up when we reach that amount. This also means that
74
integer overflow for really crazy patterns cannot happen.
75
76
Backreference minimum lengths are cached to speed up multiple references. This
77
function is called only when the highest back reference in the pattern is less
78
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79
caching vector. The zeroth element contains the number of the highest set
80
value.
81
82
Arguments:
83
  re              compiled pattern block
84
  code            pointer to start of group (the bracket)
85
  startcode       pointer to start of the whole pattern's code
86
  utf             UTF flag
87
  recurses        chain of recurse_check to catch mutual recursion
88
  countptr        pointer to call count (to catch over complexity)
89
  backref_cache   vector for caching back references.
90
91
This function is no longer called when the pattern contains (*ACCEPT); however,
92
the old code for returning -1 is retained, just in case.
93
94
Returns:   the minimum length
95
           -1 \C in UTF-8 mode
96
              or (*ACCEPT)
97
              or pattern too complicated
98
           -2 internal error (missing capturing bracket)
99
           -3 internal error (opcode not listed)
100
*/
101
102
static int
103
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104
  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105
  int *backref_cache)
106
7.47k
{
107
7.47k
int length = -1;
108
7.47k
int branchlength = 0;
109
7.47k
int prev_cap_recno = -1;
110
7.47k
int prev_cap_d = 0;
111
7.47k
int prev_recurse_recno = -1;
112
7.47k
int prev_recurse_d = 0;
113
7.47k
uint32_t once_fudge = 0;
114
7.47k
BOOL had_recurse = FALSE;
115
7.47k
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116
7.47k
PCRE2_SPTR nextbranch = code + GET(code, 1);
117
7.47k
PCRE2_SPTR cc = code + 1 + LINK_SIZE;
118
7.47k
recurse_check this_recurse;
119
120
/* If this is a "could be empty" group, its minimum length is 0. */
121
122
7.47k
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123
124
/* Skip over capturing bracket number */
125
126
6.99k
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127
128
/* A large and/or complex regex can take too long to process. */
129
130
6.99k
if ((*countptr)++ > 1000) return -1;
131
132
/* Scan along the opcodes for this branch. If we get to the end of the branch,
133
check the length against that of the other branches. If the accumulated length
134
passes 16-bits, reset to that value and skip the rest of the branch. */
135
136
6.99k
for (;;)
137
2.11M
  {
138
2.11M
  int d, min, recno;
139
2.11M
  PCRE2_UCHAR op;
140
2.11M
  PCRE2_SPTR cs, ce;
141
142
2.11M
  if (branchlength >= UINT16_MAX)
143
0
    {
144
0
    branchlength = UINT16_MAX;
145
0
    cc = nextbranch;
146
0
    }
147
148
2.11M
  op = *cc;
149
2.11M
  switch (op)
150
2.11M
    {
151
0
    case OP_COND:
152
0
    case OP_SCOND:
153
154
    /* If there is only one branch in a condition, the implied branch has zero
155
    length, so we don't add anything. This covers the DEFINE "condition"
156
    automatically. If there are two branches we can treat it the same as any
157
    other non-capturing subpattern. */
158
159
0
    cs = cc + GET(cc, 1);
160
0
    if (*cs != OP_ALT)
161
0
      {
162
0
      cc = cs + 1 + LINK_SIZE;
163
0
      break;
164
0
      }
165
0
    goto PROCESS_NON_CAPTURE;
166
167
98
    case OP_BRA:
168
    /* There's a special case of OP_BRA, when it is wrapped round a repeated
169
    OP_RECURSE. We'd like to process the latter at this level so that
170
    remembering the value works for repeated cases. So we do nothing, but
171
    set a fudge value to skip over the OP_KET after the recurse. */
172
173
98
    if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
174
0
      {
175
0
      once_fudge = 1 + LINK_SIZE;
176
0
      cc += 1 + LINK_SIZE;
177
0
      break;
178
0
      }
179
    /* Fall through */
180
181
99
    case OP_ONCE:
182
99
    case OP_SCRIPT_RUN:
183
99
    case OP_SBRA:
184
99
    case OP_BRAPOS:
185
99
    case OP_SBRAPOS:
186
99
    PROCESS_NON_CAPTURE:
187
99
    d = find_minlength(re, cc, startcode, utf, recurses, countptr,
188
99
      backref_cache);
189
99
    if (d < 0) return d;
190
99
    branchlength += d;
191
348
    do cc += GET(cc, 1); while (*cc == OP_ALT);
192
99
    cc += 1 + LINK_SIZE;
193
99
    break;
194
195
    /* To save time for repeated capturing subpatterns, we remember the
196
    length of the previous one. Unfortunately we can't do the same for
197
    the unnumbered ones above. Nor can we do this if (?| is present in the
198
    pattern because captures with the same number are not then identical. */
199
200
1.77k
    case OP_CBRA:
201
1.95k
    case OP_SCBRA:
202
2.07k
    case OP_CBRAPOS:
203
2.33k
    case OP_SCBRAPOS:
204
2.33k
    recno = (int)GET2(cc, 1+LINK_SIZE);
205
2.33k
    if (dupcapused || recno != prev_cap_recno)
206
2.33k
      {
207
2.33k
      prev_cap_recno = recno;
208
2.33k
      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
209
2.33k
        backref_cache);
210
2.33k
      if (prev_cap_d < 0) return prev_cap_d;
211
2.33k
      }
212
2.33k
    branchlength += prev_cap_d;
213
5.42k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
214
2.33k
    cc += 1 + LINK_SIZE;
215
2.33k
    break;
216
217
    /* ACCEPT makes things far too complicated; we have to give up. In fact,
218
    from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
219
    used. However, leave the code in place, just in case. */
220
221
0
    case OP_ACCEPT:
222
0
    case OP_ASSERT_ACCEPT:
223
0
    return -1;
224
225
    /* Reached end of a branch; if it's a ket it is the end of a nested
226
    call. If it's ALT it is an alternation in a nested call. If it is END it's
227
    the end of the outer call. All can be handled by the same code. If the
228
    length of any branch is zero, there is no need to scan any subsequent
229
    branches. */
230
231
27.7k
    case OP_ALT:
232
34.1k
    case OP_KET:
233
34.2k
    case OP_KETRMAX:
234
34.2k
    case OP_KETRMIN:
235
34.3k
    case OP_KETRPOS:
236
34.3k
    case OP_END:
237
34.3k
    if (length < 0 || (!had_recurse && branchlength < length))
238
9.34k
      length = branchlength;
239
34.3k
    if (op != OP_ALT || length == 0) return length;
240
27.3k
    nextbranch = cc + GET(cc, 1);
241
27.3k
    cc += 1 + LINK_SIZE;
242
27.3k
    branchlength = 0;
243
27.3k
    had_recurse = FALSE;
244
27.3k
    break;
245
246
    /* Skip over assertive subpatterns */
247
248
14
    case OP_ASSERT:
249
21
    case OP_ASSERT_NOT:
250
50
    case OP_ASSERTBACK:
251
50
    case OP_ASSERTBACK_NOT:
252
50
    case OP_ASSERT_NA:
253
50
    case OP_ASSERT_SCS:
254
79
    case OP_ASSERTBACK_NA:
255
467
    do cc += GET(cc, 1); while (*cc == OP_ALT);
256
    /* Fall through */
257
258
    /* Skip over things that don't match chars */
259
260
79
    case OP_REVERSE:
261
79
    case OP_VREVERSE:
262
79
    case OP_CREF:
263
79
    case OP_DNCREF:
264
79
    case OP_RREF:
265
79
    case OP_DNRREF:
266
79
    case OP_FALSE:
267
79
    case OP_TRUE:
268
79
    case OP_CALLOUT:
269
278
    case OP_SOD:
270
282
    case OP_SOM:
271
313
    case OP_EOD:
272
320
    case OP_EODN:
273
4.44k
    case OP_CIRC:
274
4.69k
    case OP_CIRCM:
275
8.77k
    case OP_DOLL:
276
8.79k
    case OP_DOLLM:
277
8.85k
    case OP_NOT_WORD_BOUNDARY:
278
9.43k
    case OP_WORD_BOUNDARY:
279
9.48k
    case OP_NOT_UCP_WORD_BOUNDARY:
280
9.48k
    case OP_UCP_WORD_BOUNDARY:
281
9.48k
    cc += PRIV(OP_lengths)[*cc];
282
9.48k
    break;
283
284
0
    case OP_CALLOUT_STR:
285
0
    cc += GET(cc, 1 + 2*LINK_SIZE);
286
0
    break;
287
288
    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
289
290
37
    case OP_BRAZERO:
291
37
    case OP_BRAMINZERO:
292
37
    case OP_BRAPOSZERO:
293
37
    case OP_SKIPZERO:
294
37
    cc += PRIV(OP_lengths)[*cc];
295
68
    do cc += GET(cc, 1); while (*cc == OP_ALT);
296
37
    cc += 1 + LINK_SIZE;
297
37
    break;
298
299
    /* Handle literal characters and + repetitions */
300
301
1.91M
    case OP_CHAR:
302
1.99M
    case OP_CHARI:
303
1.99M
    case OP_NOT:
304
1.99M
    case OP_NOTI:
305
1.99M
    case OP_PLUS:
306
1.99M
    case OP_PLUSI:
307
1.99M
    case OP_MINPLUS:
308
1.99M
    case OP_MINPLUSI:
309
1.99M
    case OP_POSPLUS:
310
1.99M
    case OP_POSPLUSI:
311
1.99M
    case OP_NOTPLUS:
312
1.99M
    case OP_NOTPLUSI:
313
1.99M
    case OP_NOTMINPLUS:
314
1.99M
    case OP_NOTMINPLUSI:
315
1.99M
    case OP_NOTPOSPLUS:
316
1.99M
    case OP_NOTPOSPLUSI:
317
1.99M
    branchlength++;
318
1.99M
    cc += 2;
319
1.99M
#ifdef SUPPORT_UNICODE
320
1.99M
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
321
1.99M
#endif
322
1.99M
    break;
323
324
954
    case OP_TYPEPLUS:
325
1.34k
    case OP_TYPEMINPLUS:
326
2.48k
    case OP_TYPEPOSPLUS:
327
2.48k
    branchlength++;
328
2.48k
    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
329
2.48k
    break;
330
331
    /* Handle exact repetitions. The count is already in characters, but we
332
    may need to skip over a multibyte character in UTF mode.  */
333
334
0
    case OP_EXACT:
335
0
    case OP_EXACTI:
336
0
    case OP_NOTEXACT:
337
0
    case OP_NOTEXACTI:
338
0
    branchlength += GET2(cc,1);
339
0
    cc += 2 + IMM2_SIZE;
340
0
#ifdef SUPPORT_UNICODE
341
0
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
342
0
#endif
343
0
    break;
344
345
0
    case OP_TYPEEXACT:
346
0
    branchlength += GET2(cc,1);
347
0
    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
348
0
      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
349
0
    break;
350
351
    /* Handle single-char non-literal matchers */
352
353
747
    case OP_PROP:
354
887
    case OP_NOTPROP:
355
887
    cc += 2;
356
    /* Fall through */
357
358
943
    case OP_NOT_DIGIT:
359
1.36k
    case OP_DIGIT:
360
1.47k
    case OP_NOT_WHITESPACE:
361
1.71k
    case OP_WHITESPACE:
362
1.79k
    case OP_NOT_WORDCHAR:
363
5.18k
    case OP_WORDCHAR:
364
14.8k
    case OP_ANY:
365
15.0k
    case OP_ALLANY:
366
15.2k
    case OP_EXTUNI:
367
15.2k
    case OP_HSPACE:
368
15.3k
    case OP_NOT_HSPACE:
369
15.5k
    case OP_VSPACE:
370
15.7k
    case OP_NOT_VSPACE:
371
15.7k
    branchlength++;
372
15.7k
    cc++;
373
15.7k
    break;
374
375
    /* "Any newline" might match two characters, but it also might match just
376
    one. */
377
378
1.39k
    case OP_ANYNL:
379
1.39k
    branchlength += 1;
380
1.39k
    cc++;
381
1.39k
    break;
382
383
    /* The single-byte matcher means we can't proceed in UTF mode. (In
384
    non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
385
    appear, but leave the code, just in case.) */
386
387
10
    case OP_ANYBYTE:
388
10
#ifdef SUPPORT_UNICODE
389
10
    if (utf) return -1;
390
0
#endif
391
0
    branchlength++;
392
0
    cc++;
393
0
    break;
394
395
    /* For repeated character types, we have to test for \p and \P, which have
396
    an extra two bytes of parameters. */
397
398
119
    case OP_TYPESTAR:
399
127
    case OP_TYPEMINSTAR:
400
2.15k
    case OP_TYPEQUERY:
401
2.40k
    case OP_TYPEMINQUERY:
402
2.41k
    case OP_TYPEPOSSTAR:
403
6.35k
    case OP_TYPEPOSQUERY:
404
6.35k
    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
405
6.35k
    cc += PRIV(OP_lengths)[op];
406
6.35k
    break;
407
408
0
    case OP_TYPEUPTO:
409
0
    case OP_TYPEMINUPTO:
410
0
    case OP_TYPEPOSUPTO:
411
0
    if (cc[1 + IMM2_SIZE] == OP_PROP
412
0
      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
413
0
    cc += PRIV(OP_lengths)[op];
414
0
    break;
415
416
    /* Check a class for variable quantification */
417
418
3.78k
    case OP_CLASS:
419
5.68k
    case OP_NCLASS:
420
5.68k
#ifdef SUPPORT_WIDE_CHARS
421
7.07k
    case OP_XCLASS:
422
7.07k
    case OP_ECLASS:
423
    /* The original code caused an unsigned overflow in 64 bit systems,
424
    so now we use a conditional statement. */
425
7.07k
    if (op == OP_XCLASS || op == OP_ECLASS)
426
1.39k
      cc += GET(cc, 1);
427
5.68k
    else
428
5.68k
#endif
429
5.68k
      cc += PRIV(OP_lengths)[OP_CLASS];
430
431
7.07k
    switch (*cc)
432
7.07k
      {
433
293
      case OP_CRPLUS:
434
457
      case OP_CRMINPLUS:
435
803
      case OP_CRPOSPLUS:
436
803
      branchlength++;
437
      /* Fall through */
438
439
1.67k
      case OP_CRSTAR:
440
2.29k
      case OP_CRMINSTAR:
441
2.95k
      case OP_CRQUERY:
442
3.53k
      case OP_CRMINQUERY:
443
4.25k
      case OP_CRPOSSTAR:
444
4.82k
      case OP_CRPOSQUERY:
445
4.82k
      cc++;
446
4.82k
      break;
447
448
0
      case OP_CRRANGE:
449
0
      case OP_CRMINRANGE:
450
0
      case OP_CRPOSRANGE:
451
0
      branchlength += GET2(cc,1);
452
0
      cc += 1 + 2 * IMM2_SIZE;
453
0
      break;
454
455
2.25k
      default:
456
2.25k
      branchlength++;
457
2.25k
      break;
458
7.07k
      }
459
7.07k
    break;
460
461
    /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
462
    way: we find the minimum length for the subpattern. A recursion
463
    (backreference or subroutine) causes an a flag to be set that causes the
464
    length of this branch to be ignored. The logic is that a recursion can only
465
    make sense if there is another alternative that stops the recursing. That
466
    will provide the minimum length (when no recursion happens).
467
468
    If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
469
    matches an empty string (by default it causes a matching failure), so in
470
    that case we must set the minimum length to zero.
471
472
    For backreferenes, if duplicate numbers are present in the pattern we check
473
    for a reference to a duplicate. If it is, we don't know which version will
474
    be referenced, so we have to set the minimum length to zero. */
475
476
    /* Duplicate named pattern back reference. */
477
478
7.07k
    case OP_DNREF:
479
0
    case OP_DNREFI:
480
0
    if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
481
0
      {
482
0
      int count = GET2(cc, 1+IMM2_SIZE);
483
0
      PCRE2_SPTR slot =
484
0
        (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +
485
0
          GET2(cc, 1) * re->name_entry_size;
486
487
0
      d = INT_MAX;
488
489
      /* Scan all groups with the same name; find the shortest. */
490
491
0
      while (count-- > 0)
492
0
        {
493
0
        int dd, i;
494
0
        recno = GET2(slot, 0);
495
496
0
        if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
497
0
          dd = backref_cache[recno];
498
0
        else
499
0
          {
500
0
          ce = cs = PRIV(find_bracket)(startcode, utf, recno);
501
0
          if (cs == NULL) return -2;
502
0
          do ce += GET(ce, 1); while (*ce == OP_ALT);
503
504
0
          dd = 0;
505
0
          if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
506
0
            {
507
0
            if (cc > cs && cc < ce)    /* Simple recursion */
508
0
              {
509
0
              had_recurse = TRUE;
510
0
              }
511
0
            else
512
0
              {
513
0
              recurse_check *r = recurses;
514
0
              for (r = recurses; r != NULL; r = r->prev)
515
0
                if (r->group == cs) break;
516
0
              if (r != NULL)           /* Mutual recursion */
517
0
                {
518
0
                had_recurse = TRUE;
519
0
                }
520
0
              else
521
0
                {
522
0
                this_recurse.prev = recurses;  /* No recursion */
523
0
                this_recurse.group = cs;
524
0
                dd = find_minlength(re, cs, startcode, utf, &this_recurse,
525
0
                  countptr, backref_cache);
526
0
                if (dd < 0) return dd;
527
0
                }
528
0
              }
529
0
            }
530
531
0
          backref_cache[recno] = dd;
532
0
          for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
533
0
          backref_cache[0] = recno;
534
0
          }
535
536
0
        if (dd < d) d = dd;
537
0
        if (d <= 0) break;    /* No point looking at any more */
538
0
        slot += re->name_entry_size;
539
0
        }
540
0
      }
541
0
    else d = 0;
542
0
    cc += PRIV(OP_lengths)[*cc];
543
0
    goto REPEAT_BACK_REFERENCE;
544
545
    /* Single back reference by number. References by name are converted to by
546
    number when there is no duplication. */
547
548
649
    case OP_REF:
549
685
    case OP_REFI:
550
685
    recno = GET2(cc, 1);
551
685
    if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
552
495
      d = backref_cache[recno];
553
190
    else
554
190
      {
555
190
      int i;
556
190
      d = 0;
557
558
190
      if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
559
190
        {
560
190
        ce = cs = PRIV(find_bracket)(startcode, utf, recno);
561
190
        if (cs == NULL) return -2;
562
279
        do ce += GET(ce, 1); while (*ce == OP_ALT);
563
564
190
        if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
565
190
          {
566
190
          if (cc > cs && cc < ce)    /* Simple recursion */
567
13
            {
568
13
            had_recurse = TRUE;
569
13
            }
570
177
          else
571
177
            {
572
177
            recurse_check *r = recurses;
573
177
            for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
574
177
            if (r != NULL)           /* Mutual recursion */
575
0
              {
576
0
              had_recurse = TRUE;
577
0
              }
578
177
            else                     /* No recursion */
579
177
              {
580
177
              this_recurse.prev = recurses;
581
177
              this_recurse.group = cs;
582
177
              d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
583
177
                backref_cache);
584
177
              if (d < 0) return d;
585
177
              }
586
177
            }
587
190
          }
588
190
        }
589
590
190
      backref_cache[recno] = d;
591
768
      for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
592
190
      backref_cache[0] = recno;
593
190
      }
594
595
685
    cc += PRIV(OP_lengths)[*cc];
596
597
    /* Handle repeated back references */
598
599
685
    REPEAT_BACK_REFERENCE:
600
685
    switch (*cc)
601
685
      {
602
0
      case OP_CRSTAR:
603
0
      case OP_CRMINSTAR:
604
1
      case OP_CRQUERY:
605
1
      case OP_CRMINQUERY:
606
1
      case OP_CRPOSSTAR:
607
1
      case OP_CRPOSQUERY:
608
1
      min = 0;
609
1
      cc++;
610
1
      break;
611
612
18
      case OP_CRPLUS:
613
18
      case OP_CRMINPLUS:
614
18
      case OP_CRPOSPLUS:
615
18
      min = 1;
616
18
      cc++;
617
18
      break;
618
619
0
      case OP_CRRANGE:
620
0
      case OP_CRMINRANGE:
621
0
      case OP_CRPOSRANGE:
622
0
      min = GET2(cc, 1);
623
0
      cc += 1 + 2 * IMM2_SIZE;
624
0
      break;
625
626
666
      default:
627
666
      min = 1;
628
666
      break;
629
685
      }
630
631
     /* Take care not to overflow: (1) min and d are ints, so check that their
632
     product is not greater than INT_MAX. (2) branchlength is limited to
633
     UINT16_MAX (checked at the top of the loop). */
634
635
685
    if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
636
0
      branchlength = UINT16_MAX;
637
685
    else branchlength += min * d;
638
685
    break;
639
640
    /* Recursion always refers to the first occurrence of a subpattern with a
641
    given number. Therefore, we can always make use of caching, even when the
642
    pattern contains multiple subpatterns with the same number. */
643
644
14.3k
    case OP_RECURSE:
645
14.3k
    cs = ce = startcode + GET(cc, 1);
646
14.3k
    recno = GET2(cs, 1+LINK_SIZE);
647
14.3k
    if (recno == prev_recurse_recno)
648
1.08k
      {
649
1.08k
      branchlength += prev_recurse_d;
650
1.08k
      }
651
13.2k
    else
652
13.2k
      {
653
73.1k
      do ce += GET(ce, 1); while (*ce == OP_ALT);
654
13.2k
      if (cc > cs && cc < ce)    /* Simple recursion */
655
2.50k
        had_recurse = TRUE;
656
10.7k
      else
657
10.7k
        {
658
10.7k
        recurse_check *r = recurses;
659
31.8k
        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
660
10.7k
        if (r != NULL)          /* Mutual recursion */
661
7.01k
          had_recurse = TRUE;
662
3.75k
        else
663
3.75k
          {
664
3.75k
          this_recurse.prev = recurses;
665
3.75k
          this_recurse.group = cs;
666
3.75k
          prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
667
3.75k
            countptr, backref_cache);
668
3.75k
          if (prev_recurse_d < 0) return prev_recurse_d;
669
3.75k
          prev_recurse_recno = recno;
670
3.75k
          branchlength += prev_recurse_d;
671
3.75k
          }
672
10.7k
        }
673
13.2k
      }
674
14.3k
    cc += 1 + LINK_SIZE + once_fudge;
675
14.3k
    once_fudge = 0;
676
14.3k
    break;
677
678
    /* Anything else does not or need not match a character. We can get the
679
    item's length from the table, but for those that can match zero occurrences
680
    of a character, we must take special action for UTF-8 characters. As it
681
    happens, the "NOT" versions of these opcodes are used at present only for
682
    ASCII characters, so they could be omitted from this list. However, in
683
    future that may change, so we include them here so as not to leave a
684
    gotcha for a future maintainer. */
685
686
0
    case OP_UPTO:
687
0
    case OP_UPTOI:
688
0
    case OP_NOTUPTO:
689
0
    case OP_NOTUPTOI:
690
0
    case OP_MINUPTO:
691
0
    case OP_MINUPTOI:
692
0
    case OP_NOTMINUPTO:
693
0
    case OP_NOTMINUPTOI:
694
0
    case OP_POSUPTO:
695
0
    case OP_POSUPTOI:
696
0
    case OP_NOTPOSUPTO:
697
0
    case OP_NOTPOSUPTOI:
698
699
363
    case OP_STAR:
700
487
    case OP_STARI:
701
491
    case OP_NOTSTAR:
702
493
    case OP_NOTSTARI:
703
565
    case OP_MINSTAR:
704
832
    case OP_MINSTARI:
705
834
    case OP_NOTMINSTAR:
706
835
    case OP_NOTMINSTARI:
707
2.09k
    case OP_POSSTAR:
708
2.84k
    case OP_POSSTARI:
709
2.84k
    case OP_NOTPOSSTAR:
710
2.84k
    case OP_NOTPOSSTARI:
711
712
4.46k
    case OP_QUERY:
713
5.82k
    case OP_QUERYI:
714
6.08k
    case OP_NOTQUERY:
715
6.39k
    case OP_NOTQUERYI:
716
7.07k
    case OP_MINQUERY:
717
7.85k
    case OP_MINQUERYI:
718
8.06k
    case OP_NOTMINQUERY:
719
8.34k
    case OP_NOTMINQUERYI:
720
15.2k
    case OP_POSQUERY:
721
17.8k
    case OP_POSQUERYI:
722
17.8k
    case OP_NOTPOSQUERY:
723
17.8k
    case OP_NOTPOSQUERYI:
724
725
17.8k
    cc += PRIV(OP_lengths)[op];
726
17.8k
#ifdef SUPPORT_UNICODE
727
17.8k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
728
17.8k
#endif
729
17.8k
    break;
730
731
    /* Skip these, but we need to add in the name length. */
732
733
49
    case OP_MARK:
734
49
    case OP_COMMIT_ARG:
735
49
    case OP_PRUNE_ARG:
736
49
    case OP_SKIP_ARG:
737
49
    case OP_THEN_ARG:
738
49
    cc += PRIV(OP_lengths)[op] + cc[1];
739
49
    break;
740
741
    /* The remaining opcodes are just skipped over. */
742
743
0
    case OP_CLOSE:
744
0
    case OP_COMMIT:
745
0
    case OP_FAIL:
746
0
    case OP_PRUNE:
747
3
    case OP_SET_SOM:
748
3
    case OP_SKIP:
749
3
    case OP_THEN:
750
3
    cc += PRIV(OP_lengths)[op];
751
3
    break;
752
753
    /* This should not occur: we list all opcodes explicitly so that when
754
    new ones get added they are properly considered. */
755
756
0
    default:
757
0
    PCRE2_DEBUG_UNREACHABLE();
758
0
    return -3;
759
2.11M
    }
760
2.11M
  }
761
762
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
763
0
return -3;                 /* Avoid compiler warnings */
764
6.99k
}
765
766
767
768
/*************************************************
769
*      Set a bit and maybe its alternate case    *
770
*************************************************/
771
772
/* Given a character, set its first code unit's bit in the table, and also the
773
corresponding bit for the other version of a letter if we are caseless.
774
775
Arguments:
776
  re            points to the regex block
777
  p             points to the first code unit of the character
778
  caseless      TRUE if caseless
779
  utf           TRUE for UTF mode
780
  ucp           TRUE for UCP mode
781
782
Returns:        pointer after the character
783
*/
784
785
static PCRE2_SPTR
786
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
787
  BOOL ucp)
788
10.6k
{
789
10.6k
uint32_t c = *p++;   /* First code unit */
790
791
10.6k
(void)utf;           /* Stop compiler warnings when UTF not supported */
792
10.6k
(void)ucp;
793
794
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
795
0xff. */
796
797
#if PCRE2_CODE_UNIT_WIDTH != 8
798
if (c > 0xff) SET_BIT(0xff); else
799
#endif
800
801
10.6k
SET_BIT(c);
802
803
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
804
the end of the character, even when caseless. */
805
806
10.6k
#ifdef SUPPORT_UNICODE
807
10.6k
if (utf)
808
498
  {
809
498
#if PCRE2_CODE_UNIT_WIDTH == 8
810
498
  if (c >= 0xc0) GETUTF8INC(c, p);
811
#elif PCRE2_CODE_UNIT_WIDTH == 16
812
  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
813
#endif
814
498
  }
815
10.6k
#endif  /* SUPPORT_UNICODE */
816
817
/* If caseless, handle the other case of the character. */
818
819
10.6k
if (caseless)
820
3.65k
  {
821
3.65k
#ifdef SUPPORT_UNICODE
822
3.65k
  if (utf || ucp)
823
479
    {
824
479
    c = UCD_OTHERCASE(c);
825
479
#if PCRE2_CODE_UNIT_WIDTH == 8
826
479
    if (utf)
827
479
      {
828
479
      PCRE2_UCHAR buff[6];
829
479
      (void)PRIV(ord2utf)(c, buff);
830
479
      SET_BIT(buff[0]);
831
479
      }
832
0
    else if (c < 256) SET_BIT(c);
833
#else  /* 16-bit or 32-bit mode */
834
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
835
#endif
836
479
    }
837
838
3.17k
  else
839
3.17k
#endif  /* SUPPORT_UNICODE */
840
841
  /* Not UTF or UCP */
842
843
3.17k
  if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
844
3.65k
  }
845
846
10.6k
return p;
847
10.6k
}
848
849
850
851
/*************************************************
852
*     Set bits for a positive character type     *
853
*************************************************/
854
855
/* This function sets starting bits for a character type. In UTF-8 mode, we can
856
only do a direct setting for bytes less than 128, as otherwise there can be
857
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
858
environment, the tables will only recognize ASCII characters anyway, but in at
859
least one Windows environment, some higher bytes bits were set in the tables.
860
So we deal with that case by considering the UTF-8 encoding.
861
862
Arguments:
863
  re             the regex block
864
  cbit type      the type of character wanted
865
  table_limit    32 for non-UTF-8; 16 for UTF-8
866
867
Returns:         nothing
868
*/
869
870
static void
871
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
872
2.81k
{
873
2.81k
uint32_t c;
874
92.9k
for (c = 0; c < table_limit; c++)
875
90.1k
  re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
876
2.81k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
877
2.81k
if (table_limit == 32) return;
878
0
for (c = 128; c < 256; c++)
879
0
  {
880
0
  if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
881
0
    {
882
0
    PCRE2_UCHAR buff[6];
883
0
    (void)PRIV(ord2utf)(c, buff);
884
0
    SET_BIT(buff[0]);
885
0
    }
886
0
  }
887
0
#endif  /* UTF-8 */
888
0
}
889
890
891
/*************************************************
892
*     Set bits for a negative character type     *
893
*************************************************/
894
895
/* This function sets starting bits for a negative character type such as \D.
896
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
897
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
898
Unlike in the positive case, where we can set appropriate starting bits for
899
specific high-valued UTF-8 characters, in this case we have to set the bits for
900
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
901
0xc0 (192) for simplicity.
902
903
Arguments:
904
  re             the regex block
905
  cbit type      the type of character wanted
906
  table_limit    32 for non-UTF-8; 16 for UTF-8
907
908
Returns:         nothing
909
*/
910
911
static void
912
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
913
272
{
914
272
uint32_t c;
915
8.97k
for (c = 0; c < table_limit; c++)
916
8.70k
  re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
917
272
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
918
272
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
919
272
#endif
920
272
}
921
922
923
924
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
925
/*************************************************
926
*     Set starting bits for a character list.    *
927
*************************************************/
928
929
/* This function sets starting bits for a character list. It enumerates
930
all characters and character ranges in the character list, and sets
931
the starting bits accordingly.
932
933
Arguments:
934
  code           pointer to the code
935
  start_bitmap   pointer to the starting bitmap
936
937
Returns:         nothing
938
*/
939
static void
940
study_char_list(PCRE2_SPTR code, uint8_t *start_bitmap,
941
  const uint8_t *char_lists_end)
942
0
{
943
0
uint32_t type, list_ind;
944
0
uint32_t char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
945
0
uint32_t range_start = ~(uint32_t)0, range_end = 0;
946
0
const uint8_t *next_char;
947
0
PCRE2_UCHAR start_buffer[6], end_buffer[6];
948
0
PCRE2_UCHAR start, end;
949
950
/* Only needed in 8-bit mode at the moment. */
951
0
type = (uint32_t)(code[0] << 8) | code[1];
952
0
code += 2;
953
954
/* Align characters. */
955
0
next_char = char_lists_end - (GET(code, 0) << 1);
956
0
type &= XCL_TYPE_MASK;
957
0
list_ind = 0;
958
959
0
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
960
0
  range_start = XCL_CHAR_LIST_LOW_16_START;
961
962
0
while (type > 0)
963
0
  {
964
0
  uint32_t item_count = type & XCL_ITEM_COUNT_MASK;
965
966
0
  if (item_count == XCL_ITEM_COUNT_MASK)
967
0
    {
968
0
    if (list_ind <= 1)
969
0
      {
970
0
      item_count = *(const uint16_t*)next_char;
971
0
      next_char += 2;
972
0
      }
973
0
    else
974
0
      {
975
0
      item_count = *(const uint32_t*)next_char;
976
0
      next_char += 4;
977
0
      }
978
0
    }
979
980
0
  while (item_count > 0)
981
0
    {
982
0
    if (list_ind <= 1)
983
0
      {
984
0
      range_end = *(const uint16_t*)next_char;
985
0
      next_char += 2;
986
0
      }
987
0
    else
988
0
      {
989
0
      range_end = *(const uint32_t*)next_char;
990
0
      next_char += 4;
991
0
      }
992
993
0
    if ((range_end & XCL_CHAR_END) != 0)
994
0
      {
995
0
      range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);
996
997
0
      PRIV(ord2utf)(range_end, end_buffer);
998
0
      end = end_buffer[0];
999
1000
0
      if (range_start < range_end)
1001
0
        {
1002
0
        PRIV(ord2utf)(range_start, start_buffer);
1003
0
        for (start = start_buffer[0]; start <= end; start++)
1004
0
          start_bitmap[start / 8] |= (1u << (start & 7));
1005
0
        }
1006
0
      else
1007
0
        start_bitmap[end / 8] |= (1u << (end & 7));
1008
1009
0
      range_start = ~(uint32_t)0;
1010
0
      }
1011
0
    else
1012
0
      range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);
1013
1014
0
    item_count--;
1015
0
    }
1016
1017
0
  list_ind++;
1018
0
  type >>= XCL_TYPE_BIT_LEN;
1019
1020
0
  if (range_start == ~(uint32_t)0)
1021
0
    {
1022
0
    if ((type & XCL_BEGIN_WITH_RANGE) != 0)
1023
0
      {
1024
      /* In 8 bit mode XCL_CHAR_LIST_HIGH_32_START is not possible. */
1025
0
      if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;
1026
0
      else range_start = XCL_CHAR_LIST_LOW_32_START;
1027
0
      }
1028
0
    }
1029
0
  else if ((type & XCL_BEGIN_WITH_RANGE) == 0)
1030
0
    {
1031
0
    PRIV(ord2utf)(range_start, start_buffer);
1032
1033
    /* In 8 bit mode XCL_CHAR_LIST_LOW_32_END and
1034
    XCL_CHAR_LIST_HIGH_32_END are not possible. */
1035
0
    if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;
1036
0
    else range_end = XCL_CHAR_LIST_HIGH_16_END;
1037
1038
0
    PRIV(ord2utf)(range_end, end_buffer);
1039
0
    end = end_buffer[0];
1040
1041
0
    for (start = start_buffer[0]; start <= end; start++)
1042
0
      start_bitmap[start / 8] |= (1u << (start & 7));
1043
1044
0
    range_start = ~(uint32_t)0;
1045
0
    }
1046
1047
  /* In 8 bit mode XCL_CHAR_LIST_HIGH_32_ADD is not possible. */
1048
0
  if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
1049
0
  else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
1050
0
  }
1051
0
}
1052
#endif
1053
1054
1055
1056
/*************************************************
1057
*      Create bitmap of starting code units      *
1058
*************************************************/
1059
1060
/* This function scans a compiled unanchored expression recursively and
1061
attempts to build a bitmap of the set of possible starting code units whose
1062
values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
1063
the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
1064
we pass a value of 16 rather than 32 as the final argument. (See comments in
1065
those functions for the reason.)
1066
1067
The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
1068
(a*)b where the group provides some optional starting code units but scanning
1069
must continue at the outer level to find at least one mandatory code unit. At
1070
the outermost level, this function fails unless the result is SSB_DONE.
1071
1072
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
1073
issues.
1074
1075
Arguments:
1076
  re           points to the compiled regex block
1077
  code         points to an expression
1078
  utf          TRUE if in UTF mode
1079
  ucp          TRUE if in UCP mode
1080
  depthptr     pointer to recurse depth
1081
1082
Returns:       SSB_FAIL     => Failed to find any starting code units
1083
               SSB_DONE     => Found mandatory starting code units
1084
               SSB_CONTINUE => Found optional starting code units
1085
               SSB_UNKNOWN  => Hit an unrecognized opcode
1086
               SSB_TOODEEP  => Recursion is too deep
1087
*/
1088
1089
static int
1090
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
1091
  int *depthptr)
1092
2.42k
{
1093
2.42k
uint32_t c;
1094
2.42k
int yield = SSB_DONE;
1095
1096
2.42k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1097
2.42k
int table_limit = utf? 16:32;
1098
#else
1099
int table_limit = 32;
1100
#endif
1101
1102
2.42k
*depthptr += 1;
1103
2.42k
if (*depthptr > 1000) return SSB_TOODEEP;
1104
1105
2.42k
do
1106
15.1k
  {
1107
15.1k
  BOOL try_next = TRUE;
1108
15.1k
  PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
1109
1110
15.1k
  if (*code == OP_CBRA || *code == OP_SCBRA ||
1111
15.1k
      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
1112
1113
38.1k
  while (try_next)    /* Loop for items in this branch */
1114
24.3k
    {
1115
24.3k
    int rc;
1116
24.3k
    PCRE2_SPTR ncode;
1117
24.3k
    const uint8_t *classmap = NULL;
1118
24.3k
#ifdef SUPPORT_WIDE_CHARS
1119
24.3k
    PCRE2_UCHAR xclassflags;
1120
24.3k
#endif
1121
1122
24.3k
    switch(*tcode)
1123
24.3k
      {
1124
      /* If we reach something we don't understand, it means a new opcode has
1125
      been created that hasn't been added to this function. Hopefully this
1126
      problem will be discovered during testing. */
1127
1128
0
      default:
1129
0
      return SSB_UNKNOWN;
1130
1131
      /* Fail for a valid opcode that implies no starting bits. */
1132
1133
0
      case OP_ACCEPT:
1134
0
      case OP_ASSERT_ACCEPT:
1135
69
      case OP_ALLANY:
1136
127
      case OP_ANY:
1137
127
      case OP_ANYBYTE:
1138
130
      case OP_CIRCM:
1139
130
      case OP_CLOSE:
1140
130
      case OP_COMMIT:
1141
130
      case OP_COMMIT_ARG:
1142
130
      case OP_COND:
1143
130
      case OP_CREF:
1144
130
      case OP_FALSE:
1145
130
      case OP_TRUE:
1146
130
      case OP_DNCREF:
1147
130
      case OP_DNREF:
1148
130
      case OP_DNREFI:
1149
130
      case OP_DNRREF:
1150
158
      case OP_DOLL:
1151
160
      case OP_DOLLM:
1152
160
      case OP_END:
1153
161
      case OP_EOD:
1154
162
      case OP_EODN:
1155
182
      case OP_EXTUNI:
1156
182
      case OP_FAIL:
1157
184
      case OP_MARK:
1158
187
      case OP_NOT:
1159
187
      case OP_NOTEXACT:
1160
187
      case OP_NOTEXACTI:
1161
190
      case OP_NOTI:
1162
190
      case OP_NOTMINPLUS:
1163
190
      case OP_NOTMINPLUSI:
1164
190
      case OP_NOTMINQUERY:
1165
191
      case OP_NOTMINQUERYI:
1166
191
      case OP_NOTMINSTAR:
1167
191
      case OP_NOTMINSTARI:
1168
191
      case OP_NOTMINUPTO:
1169
191
      case OP_NOTMINUPTOI:
1170
192
      case OP_NOTPLUS:
1171
200
      case OP_NOTPLUSI:
1172
200
      case OP_NOTPOSPLUS:
1173
200
      case OP_NOTPOSPLUSI:
1174
200
      case OP_NOTPOSQUERY:
1175
200
      case OP_NOTPOSQUERYI:
1176
200
      case OP_NOTPOSSTAR:
1177
200
      case OP_NOTPOSSTARI:
1178
200
      case OP_NOTPOSUPTO:
1179
200
      case OP_NOTPOSUPTOI:
1180
263
      case OP_NOTPROP:
1181
268
      case OP_NOTQUERY:
1182
310
      case OP_NOTQUERYI:
1183
310
      case OP_NOTSTAR:
1184
310
      case OP_NOTSTARI:
1185
310
      case OP_NOTUPTO:
1186
310
      case OP_NOTUPTOI:
1187
345
      case OP_NOT_HSPACE:
1188
385
      case OP_NOT_VSPACE:
1189
385
      case OP_PRUNE:
1190
385
      case OP_PRUNE_ARG:
1191
400
      case OP_RECURSE:
1192
400
      case OP_REF:
1193
400
      case OP_REFI:
1194
402
      case OP_REVERSE:
1195
405
      case OP_VREVERSE:
1196
405
      case OP_RREF:
1197
405
      case OP_SCOND:
1198
405
      case OP_SET_SOM:
1199
405
      case OP_SKIP:
1200
405
      case OP_SKIP_ARG:
1201
408
      case OP_SOD:
1202
412
      case OP_SOM:
1203
412
      case OP_THEN:
1204
412
      case OP_THEN_ARG:
1205
412
      return SSB_FAIL;
1206
1207
      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1208
      uses OP_CIRCM). Skip over it. */
1209
1210
485
      case OP_CIRC:
1211
485
      tcode += PRIV(OP_lengths)[OP_CIRC];
1212
485
      break;
1213
1214
      /* A "real" property test implies no starting bits, but the fake property
1215
      PT_CLIST identifies a list of characters. These lists are short, as they
1216
      are used for characters with more than one "other case", so there is no
1217
      point in recognizing them for OP_NOTPROP. */
1218
1219
45
      case OP_PROP:
1220
45
      if (tcode[1] != PT_CLIST) return SSB_FAIL;
1221
3
        {
1222
3
        const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1223
12
        while ((c = *p++) < NOTACHAR)
1224
9
          {
1225
9
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1226
9
          if (utf)
1227
9
            {
1228
9
            PCRE2_UCHAR buff[6];
1229
9
            (void)PRIV(ord2utf)(c, buff);
1230
9
            c = buff[0];
1231
9
            }
1232
9
#endif
1233
9
          if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1234
9
          }
1235
3
        }
1236
3
      try_next = FALSE;
1237
3
      break;
1238
1239
      /* We can ignore word boundary tests. */
1240
1241
35
      case OP_WORD_BOUNDARY:
1242
91
      case OP_NOT_WORD_BOUNDARY:
1243
91
      case OP_UCP_WORD_BOUNDARY:
1244
130
      case OP_NOT_UCP_WORD_BOUNDARY:
1245
130
      tcode++;
1246
130
      break;
1247
1248
      /* For a positive lookahead assertion, inspect what immediately follows,
1249
      ignoring intermediate assertions and callouts. If the next item is one
1250
      that sets a mandatory character, skip this assertion. Otherwise, treat it
1251
      the same as other bracket groups. */
1252
1253
2
      case OP_ASSERT:
1254
2
      case OP_ASSERT_NA:
1255
2
      ncode = tcode + GET(tcode, 1);
1256
136
      while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1257
2
      ncode += 1 + LINK_SIZE;
1258
1259
      /* Skip irrelevant items */
1260
1261
4
      for (BOOL done = FALSE; !done;)
1262
2
        {
1263
2
        switch (*ncode)
1264
2
          {
1265
0
          case OP_ASSERT:
1266
0
          case OP_ASSERT_NOT:
1267
0
          case OP_ASSERTBACK:
1268
0
          case OP_ASSERTBACK_NOT:
1269
0
          case OP_ASSERT_NA:
1270
0
          case OP_ASSERTBACK_NA:
1271
0
          case OP_ASSERT_SCS:
1272
0
          ncode += GET(ncode, 1);
1273
0
          while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1274
0
          ncode += 1 + LINK_SIZE;
1275
0
          break;
1276
1277
0
          case OP_WORD_BOUNDARY:
1278
0
          case OP_NOT_WORD_BOUNDARY:
1279
0
          case OP_UCP_WORD_BOUNDARY:
1280
0
          case OP_NOT_UCP_WORD_BOUNDARY:
1281
0
          ncode++;
1282
0
          break;
1283
1284
0
          case OP_CALLOUT:
1285
0
          ncode += PRIV(OP_lengths)[OP_CALLOUT];
1286
0
          break;
1287
1288
0
          case OP_CALLOUT_STR:
1289
0
          ncode += GET(ncode, 1 + 2*LINK_SIZE);
1290
0
          break;
1291
1292
2
          default:
1293
2
          done = TRUE;
1294
2
          break;
1295
2
          }
1296
2
        }
1297
1298
      /* Now check the next significant item. */
1299
1300
2
      switch(*ncode)
1301
2
        {
1302
0
        default:
1303
0
        break;
1304
1305
0
        case OP_PROP:
1306
0
        if (ncode[1] != PT_CLIST) break;
1307
        /* Fall through */
1308
0
        case OP_ANYNL:
1309
2
        case OP_CHAR:
1310
2
        case OP_CHARI:
1311
2
        case OP_EXACT:
1312
2
        case OP_EXACTI:
1313
2
        case OP_HSPACE:
1314
2
        case OP_MINPLUS:
1315
2
        case OP_MINPLUSI:
1316
2
        case OP_PLUS:
1317
2
        case OP_PLUSI:
1318
2
        case OP_POSPLUS:
1319
2
        case OP_POSPLUSI:
1320
2
        case OP_VSPACE:
1321
        /* Note that these types will only be present in non-UCP mode. */
1322
2
        case OP_DIGIT:
1323
2
        case OP_NOT_DIGIT:
1324
2
        case OP_WORDCHAR:
1325
2
        case OP_NOT_WORDCHAR:
1326
2
        case OP_WHITESPACE:
1327
2
        case OP_NOT_WHITESPACE:
1328
2
        tcode = ncode;
1329
2
        continue;   /* With the following significant opcode */
1330
2
        }
1331
      /* Fall through */
1332
1333
      /* For a group bracket or a positive assertion without an immediately
1334
      following mandatory setting, recurse to set bits from within the
1335
      subpattern. If it can't find anything, we have to give up. If it finds
1336
      some mandatory character(s), we are done for this branch. Otherwise,
1337
      carry on scanning after the subpattern. */
1338
1339
52
      case OP_BRA:
1340
54
      case OP_SBRA:
1341
965
      case OP_CBRA:
1342
1.09k
      case OP_SCBRA:
1343
1.09k
      case OP_BRAPOS:
1344
1.09k
      case OP_SBRAPOS:
1345
1.16k
      case OP_CBRAPOS:
1346
1.31k
      case OP_SCBRAPOS:
1347
1.31k
      case OP_ONCE:
1348
1.31k
      case OP_SCRIPT_RUN:
1349
1.31k
      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1350
1.31k
      if (rc == SSB_DONE)
1351
177
        {
1352
177
        try_next = FALSE;
1353
177
        }
1354
1.13k
      else if (rc == SSB_CONTINUE)
1355
1.12k
        {
1356
3.05k
        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1357
1.12k
        tcode += 1 + LINK_SIZE;
1358
1.12k
        }
1359
15
      else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1360
1.29k
      break;
1361
1362
      /* If we hit ALT or KET, it means we haven't found anything mandatory in
1363
      this branch, though we might have found something optional. For ALT, we
1364
      continue with the next alternative, but we have to arrange that the final
1365
      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1366
      return SSB_CONTINUE: if this is the top level, that indicates failure,
1367
      but after a nested subpattern, it causes scanning to continue. */
1368
1369
3.21k
      case OP_ALT:
1370
3.21k
      yield = SSB_CONTINUE;
1371
3.21k
      try_next = FALSE;
1372
3.21k
      break;
1373
1374
566
      case OP_KET:
1375
697
      case OP_KETRMAX:
1376
698
      case OP_KETRMIN:
1377
849
      case OP_KETRPOS:
1378
849
      return SSB_CONTINUE;
1379
1380
      /* Skip over callout */
1381
1382
0
      case OP_CALLOUT:
1383
0
      tcode += PRIV(OP_lengths)[OP_CALLOUT];
1384
0
      break;
1385
1386
0
      case OP_CALLOUT_STR:
1387
0
      tcode += GET(tcode, 1 + 2*LINK_SIZE);
1388
0
      break;
1389
1390
      /* Skip over lookbehind, negative lookahead, and scan substring
1391
      assertions */
1392
1393
7
      case OP_ASSERT_NOT:
1394
31
      case OP_ASSERTBACK:
1395
31
      case OP_ASSERTBACK_NOT:
1396
60
      case OP_ASSERTBACK_NA:
1397
60
      case OP_ASSERT_SCS:
1398
326
      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1399
60
      tcode += 1 + LINK_SIZE;
1400
60
      break;
1401
1402
      /* BRAZERO does the bracket, but carries on. */
1403
1404
5
      case OP_BRAZERO:
1405
5
      case OP_BRAMINZERO:
1406
5
      case OP_BRAPOSZERO:
1407
5
      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1408
5
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1409
0
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1410
0
      tcode += 1 + LINK_SIZE;
1411
0
      break;
1412
1413
      /* SKIPZERO skips the bracket. */
1414
1415
0
      case OP_SKIPZERO:
1416
0
      tcode++;
1417
0
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1418
0
      tcode += 1 + LINK_SIZE;
1419
0
      break;
1420
1421
      /* Single-char * or ? sets the bit and tries the next item */
1422
1423
128
      case OP_STAR:
1424
199
      case OP_MINSTAR:
1425
391
      case OP_POSSTAR:
1426
838
      case OP_QUERY:
1427
1.14k
      case OP_MINQUERY:
1428
1.70k
      case OP_POSQUERY:
1429
1.70k
      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1430
1.70k
      break;
1431
1432
84
      case OP_STARI:
1433
136
      case OP_MINSTARI:
1434
529
      case OP_POSSTARI:
1435
938
      case OP_QUERYI:
1436
1.11k
      case OP_MINQUERYI:
1437
1.64k
      case OP_POSQUERYI:
1438
1.64k
      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1439
1.64k
      break;
1440
1441
      /* Single-char upto sets the bit and tries the next */
1442
1443
0
      case OP_UPTO:
1444
0
      case OP_MINUPTO:
1445
0
      case OP_POSUPTO:
1446
0
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1447
0
      break;
1448
1449
0
      case OP_UPTOI:
1450
0
      case OP_MINUPTOI:
1451
0
      case OP_POSUPTOI:
1452
0
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1453
0
      break;
1454
1455
      /* At least one single char sets the bit and stops */
1456
1457
0
      case OP_EXACT:
1458
0
      tcode += IMM2_SIZE;
1459
      /* Fall through */
1460
4.85k
      case OP_CHAR:
1461
4.85k
      case OP_PLUS:
1462
4.86k
      case OP_MINPLUS:
1463
5.31k
      case OP_POSPLUS:
1464
5.31k
      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1465
5.31k
      try_next = FALSE;
1466
5.31k
      break;
1467
1468
0
      case OP_EXACTI:
1469
0
      tcode += IMM2_SIZE;
1470
      /* Fall through */
1471
1.96k
      case OP_CHARI:
1472
1.98k
      case OP_PLUSI:
1473
1.98k
      case OP_MINPLUSI:
1474
2.00k
      case OP_POSPLUSI:
1475
2.00k
      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1476
2.00k
      try_next = FALSE;
1477
2.00k
      break;
1478
1479
      /* Special spacing and line-terminating items. These recognize specific
1480
      lists of characters. The difference between VSPACE and ANYNL is that the
1481
      latter can match the two-character CRLF sequence, but that is not
1482
      relevant for finding the first character, so their code here is
1483
      identical. */
1484
1485
26
      case OP_HSPACE:
1486
26
      SET_BIT(CHAR_HT);
1487
26
      SET_BIT(CHAR_SPACE);
1488
1489
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1490
      the bits for 0xA0 and for code units >= 255, independently of UTF. */
1491
1492
#if PCRE2_CODE_UNIT_WIDTH != 8
1493
      SET_BIT(0xA0);
1494
      SET_BIT(0xFF);
1495
#else
1496
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1497
      units of horizontal space characters. */
1498
1499
26
#ifdef SUPPORT_UNICODE
1500
26
      if (utf)
1501
0
        {
1502
0
        SET_BIT(0xC2);  /* For U+00A0 */
1503
0
        SET_BIT(0xE1);  /* For U+1680, U+180E */
1504
0
        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1505
0
        SET_BIT(0xE3);  /* For U+3000 */
1506
0
        }
1507
26
      else
1508
26
#endif
1509
      /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1510
      the code is EBCDIC. */
1511
26
        {
1512
26
#ifndef EBCDIC
1513
26
        SET_BIT(0xA0);
1514
26
#endif  /* Not EBCDIC */
1515
26
        }
1516
26
#endif  /* 8-bit support */
1517
1518
26
      try_next = FALSE;
1519
26
      break;
1520
1521
280
      case OP_ANYNL:
1522
399
      case OP_VSPACE:
1523
399
      SET_BIT(CHAR_LF);
1524
399
      SET_BIT(CHAR_VT);
1525
399
      SET_BIT(CHAR_FF);
1526
399
      SET_BIT(CHAR_CR);
1527
1528
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1529
      the bits for NEL and for code units >= 255, independently of UTF. */
1530
1531
#if PCRE2_CODE_UNIT_WIDTH != 8
1532
      SET_BIT(CHAR_NEL);
1533
      SET_BIT(0xFF);
1534
#else
1535
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1536
      units of vertical space characters. */
1537
1538
399
#ifdef SUPPORT_UNICODE
1539
399
      if (utf)
1540
95
        {
1541
95
        SET_BIT(0xC2);  /* For U+0085 (NEL) */
1542
95
        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1543
95
        }
1544
304
      else
1545
304
#endif
1546
      /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1547
304
        {
1548
304
        SET_BIT(CHAR_NEL);
1549
304
        }
1550
399
#endif  /* 8-bit support */
1551
1552
399
      try_next = FALSE;
1553
399
      break;
1554
1555
      /* Single character types set the bits and stop. Note that if PCRE2_UCP
1556
      is set, we do not see these opcodes because \d etc are converted to
1557
      properties. Therefore, these apply in the case when only characters less
1558
      than 256 are recognized to match the types. */
1559
1560
51
      case OP_NOT_DIGIT:
1561
51
      set_nottype_bits(re, cbit_digit, table_limit);
1562
51
      try_next = FALSE;
1563
51
      break;
1564
1565
61
      case OP_DIGIT:
1566
61
      set_type_bits(re, cbit_digit, table_limit);
1567
61
      try_next = FALSE;
1568
61
      break;
1569
1570
25
      case OP_NOT_WHITESPACE:
1571
25
      set_nottype_bits(re, cbit_space, table_limit);
1572
25
      try_next = FALSE;
1573
25
      break;
1574
1575
169
      case OP_WHITESPACE:
1576
169
      set_type_bits(re, cbit_space, table_limit);
1577
169
      try_next = FALSE;
1578
169
      break;
1579
1580
48
      case OP_NOT_WORDCHAR:
1581
48
      set_nottype_bits(re, cbit_word, table_limit);
1582
48
      try_next = FALSE;
1583
48
      break;
1584
1585
1.27k
      case OP_WORDCHAR:
1586
1.27k
      set_type_bits(re, cbit_word, table_limit);
1587
1.27k
      try_next = FALSE;
1588
1.27k
      break;
1589
1590
      /* One or more character type fudges the pointer and restarts, knowing
1591
      it will hit a single character type and stop there. */
1592
1593
194
      case OP_TYPEPLUS:
1594
326
      case OP_TYPEMINPLUS:
1595
528
      case OP_TYPEPOSPLUS:
1596
528
      tcode++;
1597
528
      break;
1598
1599
0
      case OP_TYPEEXACT:
1600
0
      tcode += 1 + IMM2_SIZE;
1601
0
      break;
1602
1603
      /* Zero or more repeats of character types set the bits and then
1604
      try again. */
1605
1606
0
      case OP_TYPEUPTO:
1607
0
      case OP_TYPEMINUPTO:
1608
0
      case OP_TYPEPOSUPTO:
1609
0
      tcode += IMM2_SIZE;  /* Fall through */
1610
1611
15
      case OP_TYPESTAR:
1612
20
      case OP_TYPEMINSTAR:
1613
51
      case OP_TYPEPOSSTAR:
1614
347
      case OP_TYPEQUERY:
1615
457
      case OP_TYPEMINQUERY:
1616
1.66k
      case OP_TYPEPOSQUERY:
1617
1.66k
      switch(tcode[1])
1618
1.66k
        {
1619
25
        default:
1620
25
        case OP_ANY:
1621
28
        case OP_ALLANY:
1622
28
        return SSB_FAIL;
1623
1624
0
        case OP_HSPACE:
1625
0
        SET_BIT(CHAR_HT);
1626
0
        SET_BIT(CHAR_SPACE);
1627
1628
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1629
        the bits for 0xA0 and for code units >= 255, independently of UTF. */
1630
1631
#if PCRE2_CODE_UNIT_WIDTH != 8
1632
        SET_BIT(0xA0);
1633
        SET_BIT(0xFF);
1634
#else
1635
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1636
        units of horizontal space characters. */
1637
1638
0
#ifdef SUPPORT_UNICODE
1639
0
        if (utf)
1640
0
          {
1641
0
          SET_BIT(0xC2);  /* For U+00A0 */
1642
0
          SET_BIT(0xE1);  /* For U+1680, U+180E */
1643
0
          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1644
0
          SET_BIT(0xE3);  /* For U+3000 */
1645
0
          }
1646
0
        else
1647
0
#endif
1648
        /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1649
        the code is EBCDIC. */
1650
0
          {
1651
0
#ifndef EBCDIC
1652
0
          SET_BIT(0xA0);
1653
0
#endif  /* Not EBCDIC */
1654
0
          }
1655
0
#endif  /* 8-bit support */
1656
0
        break;
1657
1658
121
        case OP_ANYNL:
1659
168
        case OP_VSPACE:
1660
168
        SET_BIT(CHAR_LF);
1661
168
        SET_BIT(CHAR_VT);
1662
168
        SET_BIT(CHAR_FF);
1663
168
        SET_BIT(CHAR_CR);
1664
1665
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1666
        the bits for NEL and for code units >= 255, independently of UTF. */
1667
1668
#if PCRE2_CODE_UNIT_WIDTH != 8
1669
        SET_BIT(CHAR_NEL);
1670
        SET_BIT(0xFF);
1671
#else
1672
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1673
        units of vertical space characters. */
1674
1675
168
#ifdef SUPPORT_UNICODE
1676
168
        if (utf)
1677
46
          {
1678
46
          SET_BIT(0xC2);  /* For U+0085 (NEL) */
1679
46
          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1680
46
          }
1681
122
        else
1682
122
#endif
1683
        /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1684
122
          {
1685
122
          SET_BIT(CHAR_NEL);
1686
122
          }
1687
168
#endif  /* 8-bit support */
1688
168
        break;
1689
1690
145
        case OP_NOT_DIGIT:
1691
145
        set_nottype_bits(re, cbit_digit, table_limit);
1692
145
        break;
1693
1694
175
        case OP_DIGIT:
1695
175
        set_type_bits(re, cbit_digit, table_limit);
1696
175
        break;
1697
1698
0
        case OP_NOT_WHITESPACE:
1699
0
        set_nottype_bits(re, cbit_space, table_limit);
1700
0
        break;
1701
1702
192
        case OP_WHITESPACE:
1703
192
        set_type_bits(re, cbit_space, table_limit);
1704
192
        break;
1705
1706
3
        case OP_NOT_WORDCHAR:
1707
3
        set_nottype_bits(re, cbit_word, table_limit);
1708
3
        break;
1709
1710
949
        case OP_WORDCHAR:
1711
949
        set_type_bits(re, cbit_word, table_limit);
1712
949
        break;
1713
1.66k
        }
1714
1715
1.63k
      tcode += 2;
1716
1.63k
      break;
1717
1718
      /* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. */
1719
1720
0
#ifdef SUPPORT_WIDE_CHARS
1721
0
      case OP_ECLASS:
1722
0
      return SSB_FAIL;
1723
0
#endif
1724
1725
      /* Extended class: if there are any property checks, or if this is a
1726
      negative XCLASS without a map, give up. If there are no property checks,
1727
      there must be wide characters on the XCLASS list, because otherwise an
1728
      XCLASS would not have been created. This means that code points >= 255
1729
      are potential starters. In the UTF-8 case we can scan them and set bits
1730
      for the relevant leading bytes. */
1731
1732
0
#ifdef SUPPORT_WIDE_CHARS
1733
159
      case OP_XCLASS:
1734
159
      xclassflags = tcode[1 + LINK_SIZE];
1735
159
      if ((xclassflags & XCL_HASPROP) != 0 ||
1736
159
          (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1737
22
        return SSB_FAIL;
1738
1739
      /* We have a positive XCLASS or a negative one without a map. Set up the
1740
      map pointer if there is one, and fall through. */
1741
1742
137
      classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1743
137
        (const uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1744
1745
      /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1746
      then jump to handle the map. */
1747
1748
137
#if PCRE2_CODE_UNIT_WIDTH == 8
1749
137
      if (utf && (xclassflags & XCL_NOT) == 0)
1750
32
        {
1751
32
        PCRE2_UCHAR b, e;
1752
32
        PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1753
32
        tcode += GET(tcode, 1);
1754
1755
32
        if (*p >= XCL_LIST)
1756
0
          {
1757
0
          study_char_list(p, re->start_bitmap,
1758
0
            ((const uint8_t *)re + re->code_start));
1759
0
          goto HANDLE_CLASSMAP;
1760
0
          }
1761
1762
73
        for (;;) switch (*p++)
1763
73
          {
1764
40
          case XCL_SINGLE:
1765
40
          b = *p++;
1766
101
          while ((*p & 0xc0) == 0x80) p++;
1767
40
          re->start_bitmap[b/8] |= (1u << (b&7));
1768
40
          break;
1769
1770
1
          case XCL_RANGE:
1771
1
          b = *p++;
1772
2
          while ((*p & 0xc0) == 0x80) p++;
1773
1
          e = *p++;
1774
2
          while ((*p & 0xc0) == 0x80) p++;
1775
2
          for (; b <= e; b++)
1776
1
            re->start_bitmap[b/8] |= (1u << (b&7));
1777
1
          break;
1778
1779
32
          case XCL_END:
1780
32
          goto HANDLE_CLASSMAP;
1781
1782
0
          default:
1783
0
          PCRE2_DEBUG_UNREACHABLE();
1784
0
          return SSB_UNKNOWN;   /* Internal error, should not occur */
1785
73
          }
1786
32
        }
1787
105
#endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1788
105
#endif  /* SUPPORT_WIDE_CHARS */
1789
1790
      /* It seems that the fall through comment must be outside the #ifdef if
1791
      it is to avoid the gcc compiler warning. */
1792
1793
      /* Fall through */
1794
1795
      /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1796
      in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1797
      because it starts a character with a value > 255. In 8-bit non-UTF mode,
1798
      there is no difference between CLASS and NCLASS. In all other wide
1799
      character modes, set the 0xFF bit to indicate code units >= 255. */
1800
1801
1.03k
      case OP_NCLASS:
1802
1.03k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1803
1.03k
      if (utf)
1804
441
        {
1805
441
        re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1806
441
        memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1807
441
        }
1808
#elif PCRE2_CODE_UNIT_WIDTH != 8
1809
      SET_BIT(0xFF);                             /* For characters >= 255 */
1810
#endif
1811
      /* Fall through */
1812
1813
      /* Enter here for a positive non-XCLASS. If we have fallen through from
1814
      an XCLASS, classmap will already be set; just advance the code pointer.
1815
      Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1816
1817
2.91k
      case OP_CLASS:
1818
2.91k
      if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1819
2.81k
        {
1820
2.81k
        classmap = (const uint8_t *)(++tcode);
1821
2.81k
        tcode += 32 / sizeof(PCRE2_UCHAR);
1822
2.81k
        }
1823
1824
      /* When wide characters are supported, classmap may be NULL. In UTF-8
1825
      (sic) mode, the bits in a class bit map correspond to character values,
1826
      not to byte values. However, the bit map we are constructing is for byte
1827
      values. So we have to do a conversion for characters whose code point is
1828
      greater than 127. In fact, there are only two possible starting bytes for
1829
      characters in the range 128 - 255. */
1830
1831
2.91k
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1832
2.94k
      HANDLE_CLASSMAP:
1833
2.94k
#endif
1834
2.94k
      if (classmap != NULL)
1835
2.94k
        {
1836
2.94k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1837
2.94k
        if (utf)
1838
605
          {
1839
10.2k
          for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1840
22.4k
          for (c = 128; c < 256; c++)
1841
21.8k
            {
1842
21.8k
            if ((classmap[c/8] & (1u << (c&7))) != 0)
1843
882
              {
1844
882
              int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1845
882
              re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1846
882
              c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1847
882
              }
1848
21.8k
            }
1849
605
          }
1850
2.34k
        else
1851
2.34k
#endif
1852
        /* In all modes except UTF-8, the two bit maps are compatible. */
1853
1854
2.34k
          {
1855
77.3k
          for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1856
2.34k
          }
1857
2.94k
        }
1858
1859
      /* Act on what follows the class. For a zero minimum repeat, continue;
1860
      otherwise stop processing. */
1861
1862
2.94k
      switch (*tcode)
1863
2.94k
        {
1864
353
        case OP_CRSTAR:
1865
659
        case OP_CRMINSTAR:
1866
891
        case OP_CRQUERY:
1867
1.20k
        case OP_CRMINQUERY:
1868
1.53k
        case OP_CRPOSSTAR:
1869
1.98k
        case OP_CRPOSQUERY:
1870
1.98k
        tcode++;
1871
1.98k
        break;
1872
1873
0
        case OP_CRRANGE:
1874
0
        case OP_CRMINRANGE:
1875
0
        case OP_CRPOSRANGE:
1876
0
        if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1877
0
          else try_next = FALSE;
1878
0
        break;
1879
1880
966
        default:
1881
966
        try_next = FALSE;
1882
966
        break;
1883
2.94k
        }
1884
2.94k
      break; /* End of class handling case */
1885
24.3k
      }      /* End of switch for opcodes */
1886
24.3k
    }        /* End of try_next loop */
1887
1888
13.7k
  code += GET(code, 1);   /* Advance to next branch */
1889
13.7k
  }
1890
13.7k
while (*code == OP_ALT);
1891
1892
1.04k
return yield;
1893
2.42k
}
1894
1895
1896
1897
/*************************************************
1898
*          Study a compiled expression           *
1899
*************************************************/
1900
1901
/* This function is handed a compiled expression that it must study to produce
1902
information that will speed up the matching.
1903
1904
Argument:
1905
  re       points to the compiled expression
1906
1907
Returns:   0 normally; non-zero should never normally occur
1908
           1 unknown opcode in set_start_bits
1909
           2 missing capturing bracket
1910
           3 unknown opcode in find_minlength
1911
*/
1912
1913
int
1914
PRIV(study)(pcre2_real_code *re)
1915
1.16k
{
1916
1.16k
int count = 0;
1917
1.16k
PCRE2_UCHAR *code;
1918
1.16k
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1919
1.16k
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1920
1921
/* Find start of compiled code */
1922
1923
1.16k
code = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
1924
1925
/* For a pattern that has a first code unit, or a multiline pattern that
1926
matches only at "line start", there is no point in seeking a list of starting
1927
code units. */
1928
1929
1.16k
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1930
1.10k
  {
1931
1.10k
  int depth = 0;
1932
1.10k
  int rc = set_start_bits(re, code, utf, ucp, &depth);
1933
1.10k
  if (rc == SSB_UNKNOWN)
1934
0
    {
1935
0
    PCRE2_DEBUG_UNREACHABLE();
1936
0
    return 1;
1937
0
    }
1938
1939
  /* If a list of starting code units was set up, scan the list to see if only
1940
  one or two were listed. Having only one listed is rare because usually a
1941
  single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1942
  If two are listed, see if they are caseless versions of the same character;
1943
  if so we can replace the list with a caseless first code unit. This gives
1944
  better performance and is plausibly worth doing for patterns such as [Ww]ord
1945
  or (word|WORD). */
1946
1947
1.10k
  if (rc == SSB_DONE)
1948
556
    {
1949
556
    int i;
1950
556
    int a = -1;
1951
556
    int b = -1;
1952
556
    uint8_t *p = re->start_bitmap;
1953
556
    uint32_t flags = PCRE2_FIRSTMAPSET;
1954
1955
2.47k
    for (i = 0; i < 256; p++, i += 8)
1956
2.47k
      {
1957
2.47k
      uint8_t x = *p;
1958
2.47k
      if (x != 0)
1959
803
        {
1960
803
        int c;
1961
803
        uint8_t y = x & (~x + 1);   /* Least significant bit */
1962
803
        if (y != x) goto DONE;      /* More than one bit set */
1963
1964
        /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1965
        all wide characters", so we cannot use it here. */
1966
1967
#if PCRE2_CODE_UNIT_WIDTH != 8
1968
        if (i == 248 && x == 0x80) goto DONE;
1969
#endif
1970
1971
        /* Compute the character value */
1972
1973
421
        c = i;
1974
421
        switch (x)
1975
421
          {
1976
173
          case 1:   break;
1977
69
          case 2:   c += 1; break;  case 4:  c += 2; break;
1978
36
          case 8:   c += 3; break;  case 16: c += 4; break;
1979
45
          case 32:  c += 5; break;  case 64: c += 6; break;
1980
33
          case 128: c += 7; break;
1981
421
          }
1982
1983
        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1984
        mode, only values < 128 can be used. In all the other cases, c is a
1985
        character value. */
1986
1987
421
#if PCRE2_CODE_UNIT_WIDTH == 8
1988
421
        if (utf && c > 127) goto DONE;
1989
421
#endif
1990
421
        if (a < 0) a = c;   /* First one found, save in a */
1991
173
        else if (b < 0)     /* Second one found */
1992
172
          {
1993
172
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1994
1995
172
#ifdef SUPPORT_UNICODE
1996
172
          if (utf || ucp)
1997
20
            {
1998
20
            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
1999
18
            if (c > 127) d = UCD_OTHERCASE(c);
2000
18
            }
2001
170
#endif  /* SUPPORT_UNICODE */
2002
2003
170
          if (d != a) goto DONE;   /* Not the other case of a */
2004
2
          b = c;                   /* Save second in b */
2005
2
          }
2006
1
        else goto DONE;   /* More than two characters found */
2007
421
        }
2008
2.47k
      }
2009
2010
    /* Replace the start code unit bits with a first code unit. If it is the
2011
    same as a required later code unit, then clear the required later code
2012
    unit. This is because a search for a required code unit starts after an
2013
    explicit first code unit, but at a code unit found from the bitmap.
2014
    Patterns such as /a*a/ don't work if both the start unit and required
2015
    unit are the same. */
2016
2017
3
    if (a >= 0) {
2018
3
      if ((re->flags & PCRE2_LASTSET) && (re->last_codeunit == (uint32_t)a || (b >= 0 && re->last_codeunit == (uint32_t)b))) {
2019
0
        re->flags &= ~(PCRE2_LASTSET | PCRE2_LASTCASELESS);
2020
0
        re->last_codeunit = 0;
2021
0
      }
2022
3
      re->first_codeunit = a;
2023
3
      flags = PCRE2_FIRSTSET;
2024
3
      if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
2025
3
    }
2026
2027
556
    DONE:
2028
556
    re->flags |= flags;
2029
556
    }
2030
1.10k
  }
2031
2032
/* Find the minimum length of subject string. If the pattern can match an empty
2033
string, the minimum length is already known. If the pattern contains (*ACCEPT)
2034
all bets are off, and we don't even try to find a minimum length. If there are
2035
more back references than the size of the vector we are going to cache them in,
2036
do nothing. A pattern that complicated will probably take a long time to
2037
analyze and may in any case turn out to be too complicated. Note that back
2038
reference minima are held as 16-bit numbers. */
2039
2040
1.16k
if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
2041
1.16k
     re->top_backref <= MAX_CACHE_BACKREF)
2042
1.10k
  {
2043
1.10k
  int min;
2044
1.10k
  int backref_cache[MAX_CACHE_BACKREF+1];
2045
1.10k
  backref_cache[0] = 0;    /* Highest one that is set */
2046
1.10k
  min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
2047
1.10k
  switch(min)
2048
1.10k
    {
2049
10
    case -1:  /* \C in UTF mode or over-complex regex */
2050
10
    break;    /* Leave minlength unchanged (will be zero) */
2051
2052
0
    case -2:
2053
0
    PCRE2_DEBUG_UNREACHABLE();
2054
0
    return 2; /* missing capturing bracket */
2055
2056
0
    case -3:
2057
0
    PCRE2_DEBUG_UNREACHABLE();
2058
0
    return 3; /* unrecognized opcode */
2059
2060
1.09k
    default:
2061
1.09k
    re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
2062
1.09k
    break;
2063
1.10k
    }
2064
1.10k
  }
2065
2066
1.16k
return 0;
2067
1.16k
}
2068
2069
/* End of pcre2_study.c */