Coverage Report

Created: 2025-08-26 07:08

/src/pcre2/src/pcre2_study.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains functions for scanning a compiled pattern and
43
collecting data (e.g. minimum matching length). */
44
45
46
#include "pcre2_internal.h"
47
48
49
50
/* The maximum remembered capturing brackets minimum. */
51
52
44.3k
#define MAX_CACHE_BACKREF 128
53
54
/* Set a bit in the starting code unit bit map. */
55
56
471k
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
57
58
/* Returns from set_start_bits() */
59
60
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
61
62
63
/*************************************************
64
*   Find the minimum subject length for a group  *
65
*************************************************/
66
67
/* Scan a parenthesized group and compute the minimum length of subject that
68
is needed to match it. This is a lower bound; it does not mean there is a
69
string of that length that matches. In UTF mode, the result is in characters
70
rather than code units. The field in a compiled pattern for storing the minimum
71
length is 16-bits long (on the grounds that anything longer than that is
72
pathological), so we give up when we reach that amount. This also means that
73
integer overflow for really crazy patterns cannot happen.
74
75
Backreference minimum lengths are cached to speed up multiple references. This
76
function is called only when the highest back reference in the pattern is less
77
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
78
caching vector. The zeroth element contains the number of the highest set
79
value.
80
81
Arguments:
82
  re              compiled pattern block
83
  code            pointer to start of group (the bracket)
84
  startcode       pointer to start of the whole pattern's code
85
  utf             UTF flag
86
  recurses        chain of recurse_check to catch mutual recursion
87
  countptr        pointer to call count (to catch over complexity)
88
  backref_cache   vector for caching back references.
89
90
This function is no longer called when the pattern contains (*ACCEPT); however,
91
the old code for returning -1 is retained, just in case.
92
93
Returns:   the minimum length
94
           -1 \C in UTF-8 mode
95
              or (*ACCEPT)
96
              or pattern too complicated
97
           -2 internal error (missing capturing bracket)
98
           -3 internal error (opcode not listed)
99
*/
100
101
static int
102
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
103
  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
104
  int *backref_cache)
105
406k
{
106
406k
int length = -1;
107
406k
int branchlength = 0;
108
406k
int prev_cap_recno = -1;
109
406k
int prev_cap_d = 0;
110
406k
int prev_recurse_recno = -1;
111
406k
int prev_recurse_d = 0;
112
406k
uint32_t once_fudge = 0;
113
406k
BOOL had_recurse = FALSE;
114
406k
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
115
406k
PCRE2_SPTR nextbranch = code + GET(code, 1);
116
406k
PCRE2_SPTR cc = code + 1 + LINK_SIZE;
117
406k
recurse_check this_recurse;
118
119
/* If this is a "could be empty" group, its minimum length is 0. */
120
121
406k
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
122
123
/* Skip over capturing bracket number */
124
125
399k
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
126
127
/* A large and/or complex regex can take too long to process. */
128
129
399k
if ((*countptr)++ > 1000) return -1;
130
131
/* Scan along the opcodes for this branch. If we get to the end of the branch,
132
check the length against that of the other branches. If the accumulated length
133
passes 16-bits, reset to that value and skip the rest of the branch. */
134
135
399k
for (;;)
136
10.1M
  {
137
10.1M
  int d, min, recno;
138
10.1M
  PCRE2_UCHAR op;
139
10.1M
  PCRE2_SPTR cs, ce;
140
141
10.1M
  if (branchlength >= UINT16_MAX)
142
2.41k
    {
143
2.41k
    branchlength = UINT16_MAX;
144
2.41k
    cc = nextbranch;
145
2.41k
    }
146
147
10.1M
  op = *cc;
148
10.1M
  switch (op)
149
10.1M
    {
150
15.1k
    case OP_COND:
151
15.6k
    case OP_SCOND:
152
153
    /* If there is only one branch in a condition, the implied branch has zero
154
    length, so we don't add anything. This covers the DEFINE "condition"
155
    automatically. If there are two branches we can treat it the same as any
156
    other non-capturing subpattern. */
157
158
15.6k
    cs = cc + GET(cc, 1);
159
15.6k
    if (*cs != OP_ALT)
160
10.1k
      {
161
10.1k
      cc = cs + 1 + LINK_SIZE;
162
10.1k
      break;
163
10.1k
      }
164
5.52k
    goto PROCESS_NON_CAPTURE;
165
166
164k
    case OP_BRA:
167
    /* There's a special case of OP_BRA, when it is wrapped round a repeated
168
    OP_RECURSE. We'd like to process the latter at this level so that
169
    remembering the value works for repeated cases. So we do nothing, but
170
    set a fudge value to skip over the OP_KET after the recurse. */
171
172
164k
    if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173
364
      {
174
364
      once_fudge = 1 + LINK_SIZE;
175
364
      cc += 1 + LINK_SIZE;
176
364
      break;
177
364
      }
178
    /* Fall through */
179
180
181k
    case OP_ONCE:
181
188k
    case OP_SCRIPT_RUN:
182
189k
    case OP_SBRA:
183
190k
    case OP_BRAPOS:
184
191k
    case OP_SBRAPOS:
185
197k
    PROCESS_NON_CAPTURE:
186
197k
    d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187
197k
      backref_cache);
188
197k
    if (d < 0) return d;
189
197k
    branchlength += d;
190
413k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
191
197k
    cc += 1 + LINK_SIZE;
192
197k
    break;
193
194
    /* To save time for repeated capturing subpatterns, we remember the
195
    length of the previous one. Unfortunately we can't do the same for
196
    the unnumbered ones above. Nor can we do this if (?| is present in the
197
    pattern because captures with the same number are not then identical. */
198
199
483k
    case OP_CBRA:
200
484k
    case OP_SCBRA:
201
485k
    case OP_CBRAPOS:
202
487k
    case OP_SCBRAPOS:
203
487k
    recno = (int)GET2(cc, 1+LINK_SIZE);
204
487k
    if (dupcapused || recno != prev_cap_recno)
205
131k
      {
206
131k
      prev_cap_recno = recno;
207
131k
      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208
131k
        backref_cache);
209
131k
      if (prev_cap_d < 0) return prev_cap_d;
210
131k
      }
211
486k
    branchlength += prev_cap_d;
212
872k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
213
486k
    cc += 1 + LINK_SIZE;
214
486k
    break;
215
216
    /* ACCEPT makes things far too complicated; we have to give up. In fact,
217
    from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218
    used. However, leave the code in place, just in case. */
219
220
0
    case OP_ACCEPT:
221
0
    case OP_ASSERT_ACCEPT:
222
0
    return -1;
223
224
    /* Reached end of a branch; if it's a ket it is the end of a nested
225
    call. If it's ALT it is an alternation in a nested call. If it is END it's
226
    the end of the outer call. All can be handled by the same code. If the
227
    length of any branch is zero, there is no need to scan any subsequent
228
    branches. */
229
230
265k
    case OP_ALT:
231
634k
    case OP_KET:
232
637k
    case OP_KETRMAX:
233
638k
    case OP_KETRMIN:
234
640k
    case OP_KETRPOS:
235
640k
    case OP_END:
236
640k
    if (length < 0 || (!had_recurse && branchlength < length))
237
457k
      length = branchlength;
238
640k
    if (op != OP_ALT || length == 0) return length;
239
242k
    nextbranch = cc + GET(cc, 1);
240
242k
    cc += 1 + LINK_SIZE;
241
242k
    branchlength = 0;
242
242k
    had_recurse = FALSE;
243
242k
    break;
244
245
    /* Skip over assertive subpatterns */
246
247
180k
    case OP_ASSERT:
248
309k
    case OP_ASSERT_NOT:
249
439k
    case OP_ASSERTBACK:
250
754k
    case OP_ASSERTBACK_NOT:
251
1.24M
    case OP_ASSERT_NA:
252
1.25M
    case OP_ASSERT_SCS:
253
1.28M
    case OP_ASSERTBACK_NA:
254
2.56M
    do cc += GET(cc, 1); while (*cc == OP_ALT);
255
    /* Fall through */
256
257
    /* Skip over things that don't match chars */
258
259
1.28M
    case OP_REVERSE:
260
1.28M
    case OP_VREVERSE:
261
1.28M
    case OP_CREF:
262
1.29M
    case OP_DNCREF:
263
1.29M
    case OP_RREF:
264
1.29M
    case OP_DNRREF:
265
1.29M
    case OP_FALSE:
266
1.29M
    case OP_TRUE:
267
2.37M
    case OP_CALLOUT:
268
2.37M
    case OP_SOD:
269
2.38M
    case OP_SOM:
270
2.38M
    case OP_EOD:
271
2.39M
    case OP_EODN:
272
2.42M
    case OP_CIRC:
273
2.54M
    case OP_CIRCM:
274
2.58M
    case OP_DOLL:
275
2.59M
    case OP_DOLLM:
276
2.60M
    case OP_NOT_WORD_BOUNDARY:
277
2.60M
    case OP_WORD_BOUNDARY:
278
2.60M
    case OP_NOT_UCP_WORD_BOUNDARY:
279
2.61M
    case OP_UCP_WORD_BOUNDARY:
280
2.61M
    cc += PRIV(OP_lengths)[*cc];
281
2.61M
    break;
282
283
2.36k
    case OP_CALLOUT_STR:
284
2.36k
    cc += GET(cc, 1 + 2*LINK_SIZE);
285
2.36k
    break;
286
287
    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
288
289
7.96k
    case OP_BRAZERO:
290
9.92k
    case OP_BRAMINZERO:
291
12.7k
    case OP_BRAPOSZERO:
292
13.4k
    case OP_SKIPZERO:
293
13.4k
    cc += PRIV(OP_lengths)[*cc];
294
16.9k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
295
13.4k
    cc += 1 + LINK_SIZE;
296
13.4k
    break;
297
298
    /* Handle literal characters and + repetitions */
299
300
3.33M
    case OP_CHAR:
301
5.11M
    case OP_CHARI:
302
5.12M
    case OP_NOT:
303
5.13M
    case OP_NOTI:
304
5.16M
    case OP_PLUS:
305
5.17M
    case OP_PLUSI:
306
5.18M
    case OP_MINPLUS:
307
5.18M
    case OP_MINPLUSI:
308
5.22M
    case OP_POSPLUS:
309
5.23M
    case OP_POSPLUSI:
310
5.23M
    case OP_NOTPLUS:
311
5.23M
    case OP_NOTPLUSI:
312
5.23M
    case OP_NOTMINPLUS:
313
5.24M
    case OP_NOTMINPLUSI:
314
5.24M
    case OP_NOTPOSPLUS:
315
5.24M
    case OP_NOTPOSPLUSI:
316
5.24M
    branchlength++;
317
5.24M
    cc += 2;
318
5.24M
#ifdef SUPPORT_UNICODE
319
5.24M
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
320
5.24M
#endif
321
5.24M
    break;
322
323
30.6k
    case OP_TYPEPLUS:
324
38.6k
    case OP_TYPEMINPLUS:
325
52.3k
    case OP_TYPEPOSPLUS:
326
52.3k
    branchlength++;
327
52.3k
    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
328
52.3k
    break;
329
330
    /* Handle exact repetitions. The count is already in characters, but we
331
    may need to skip over a multibyte character in UTF mode.  */
332
333
13.1k
    case OP_EXACT:
334
20.1k
    case OP_EXACTI:
335
23.0k
    case OP_NOTEXACT:
336
27.9k
    case OP_NOTEXACTI:
337
27.9k
    branchlength += GET2(cc,1);
338
27.9k
    cc += 2 + IMM2_SIZE;
339
27.9k
#ifdef SUPPORT_UNICODE
340
27.9k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
341
27.9k
#endif
342
27.9k
    break;
343
344
8.04k
    case OP_TYPEEXACT:
345
8.04k
    branchlength += GET2(cc,1);
346
8.04k
    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
347
8.04k
      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
348
8.04k
    break;
349
350
    /* Handle single-char non-literal matchers */
351
352
34.1k
    case OP_PROP:
353
48.1k
    case OP_NOTPROP:
354
48.1k
    cc += 2;
355
    /* Fall through */
356
357
57.2k
    case OP_NOT_DIGIT:
358
67.2k
    case OP_DIGIT:
359
85.1k
    case OP_NOT_WHITESPACE:
360
178k
    case OP_WHITESPACE:
361
187k
    case OP_NOT_WORDCHAR:
362
195k
    case OP_WORDCHAR:
363
225k
    case OP_ANY:
364
241k
    case OP_ALLANY:
365
245k
    case OP_EXTUNI:
366
259k
    case OP_HSPACE:
367
278k
    case OP_NOT_HSPACE:
368
284k
    case OP_VSPACE:
369
287k
    case OP_NOT_VSPACE:
370
287k
    branchlength++;
371
287k
    cc++;
372
287k
    break;
373
374
    /* "Any newline" might match two characters, but it also might match just
375
    one. */
376
377
9.43k
    case OP_ANYNL:
378
9.43k
    branchlength += 1;
379
9.43k
    cc++;
380
9.43k
    break;
381
382
    /* The single-byte matcher means we can't proceed in UTF mode. (In
383
    non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
384
    appear, but leave the code, just in case.) */
385
386
0
    case OP_ANYBYTE:
387
0
#ifdef SUPPORT_UNICODE
388
0
    if (utf) return -1;
389
0
#endif
390
0
    branchlength++;
391
0
    cc++;
392
0
    break;
393
394
    /* For repeated character types, we have to test for \p and \P, which have
395
    an extra two bytes of parameters. */
396
397
33.0k
    case OP_TYPESTAR:
398
37.9k
    case OP_TYPEMINSTAR:
399
49.7k
    case OP_TYPEQUERY:
400
59.0k
    case OP_TYPEMINQUERY:
401
68.0k
    case OP_TYPEPOSSTAR:
402
82.6k
    case OP_TYPEPOSQUERY:
403
82.6k
    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
404
82.6k
    cc += PRIV(OP_lengths)[op];
405
82.6k
    break;
406
407
2.32k
    case OP_TYPEUPTO:
408
4.50k
    case OP_TYPEMINUPTO:
409
5.77k
    case OP_TYPEPOSUPTO:
410
5.77k
    if (cc[1 + IMM2_SIZE] == OP_PROP
411
5.77k
      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
412
5.77k
    cc += PRIV(OP_lengths)[op];
413
5.77k
    break;
414
415
    /* Check a class for variable quantification */
416
417
90.4k
    case OP_CLASS:
418
125k
    case OP_NCLASS:
419
125k
#ifdef SUPPORT_WIDE_CHARS
420
142k
    case OP_XCLASS:
421
147k
    case OP_ECLASS:
422
    /* The original code caused an unsigned overflow in 64 bit systems,
423
    so now we use a conditional statement. */
424
147k
    if (op == OP_XCLASS || op == OP_ECLASS)
425
21.6k
      cc += GET(cc, 1);
426
125k
    else
427
125k
#endif
428
125k
      cc += PRIV(OP_lengths)[OP_CLASS];
429
430
147k
    switch (*cc)
431
147k
      {
432
8.46k
      case OP_CRPLUS:
433
14.3k
      case OP_CRMINPLUS:
434
18.1k
      case OP_CRPOSPLUS:
435
18.1k
      branchlength++;
436
      /* Fall through */
437
438
25.3k
      case OP_CRSTAR:
439
30.2k
      case OP_CRMINSTAR:
440
34.6k
      case OP_CRQUERY:
441
38.1k
      case OP_CRMINQUERY:
442
44.4k
      case OP_CRPOSSTAR:
443
47.0k
      case OP_CRPOSQUERY:
444
47.0k
      cc++;
445
47.0k
      break;
446
447
4.37k
      case OP_CRRANGE:
448
5.90k
      case OP_CRMINRANGE:
449
22.5k
      case OP_CRPOSRANGE:
450
22.5k
      branchlength += GET2(cc,1);
451
22.5k
      cc += 1 + 2 * IMM2_SIZE;
452
22.5k
      break;
453
454
77.4k
      default:
455
77.4k
      branchlength++;
456
77.4k
      break;
457
147k
      }
458
147k
    break;
459
460
    /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
461
    way: we find the minimum length for the subpattern. A recursion
462
    (backreference or subroutine) causes an a flag to be set that causes the
463
    length of this branch to be ignored. The logic is that a recursion can only
464
    make sense if there is another alternative that stops the recursing. That
465
    will provide the minimum length (when no recursion happens).
466
467
    If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
468
    matches an empty string (by default it causes a matching failure), so in
469
    that case we must set the minimum length to zero.
470
471
    For backreferenes, if duplicate numbers are present in the pattern we check
472
    for a reference to a duplicate. If it is, we don't know which version will
473
    be referenced, so we have to set the minimum length to zero. */
474
475
    /* Duplicate named pattern back reference. */
476
477
147k
    case OP_DNREF:
478
7.09k
    case OP_DNREFI:
479
7.09k
    if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
480
3.96k
      {
481
3.96k
      int count = GET2(cc, 1+IMM2_SIZE);
482
3.96k
      PCRE2_SPTR slot =
483
3.96k
        (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code)) +
484
3.96k
          GET2(cc, 1) * re->name_entry_size;
485
486
3.96k
      d = INT_MAX;
487
488
      /* Scan all groups with the same name; find the shortest. */
489
490
8.43k
      while (count-- > 0)
491
7.95k
        {
492
7.95k
        int dd, i;
493
7.95k
        recno = GET2(slot, 0);
494
495
7.95k
        if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
496
5.34k
          dd = backref_cache[recno];
497
2.60k
        else
498
2.60k
          {
499
2.60k
          ce = cs = PRIV(find_bracket)(startcode, utf, recno);
500
2.60k
          if (cs == NULL) return -2;
501
2.84k
          do ce += GET(ce, 1); while (*ce == OP_ALT);
502
503
2.60k
          dd = 0;
504
2.60k
          if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
505
2.60k
            {
506
2.60k
            if (cc > cs && cc < ce)    /* Simple recursion */
507
156
              {
508
156
              had_recurse = TRUE;
509
156
              }
510
2.45k
            else
511
2.45k
              {
512
2.45k
              recurse_check *r = recurses;
513
7.01k
              for (r = recurses; r != NULL; r = r->prev)
514
4.69k
                if (r->group == cs) break;
515
2.45k
              if (r != NULL)           /* Mutual recursion */
516
141
                {
517
141
                had_recurse = TRUE;
518
141
                }
519
2.31k
              else
520
2.31k
                {
521
2.31k
                this_recurse.prev = recurses;  /* No recursion */
522
2.31k
                this_recurse.group = cs;
523
2.31k
                dd = find_minlength(re, cs, startcode, utf, &this_recurse,
524
2.31k
                  countptr, backref_cache);
525
2.31k
                if (dd < 0) return dd;
526
2.31k
                }
527
2.45k
              }
528
2.60k
            }
529
530
2.60k
          backref_cache[recno] = dd;
531
10.6k
          for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
532
2.60k
          backref_cache[0] = recno;
533
2.60k
          }
534
535
7.95k
        if (dd < d) d = dd;
536
7.95k
        if (d <= 0) break;    /* No point looking at any more */
537
4.47k
        slot += re->name_entry_size;
538
4.47k
        }
539
3.96k
      }
540
3.13k
    else d = 0;
541
7.08k
    cc += PRIV(OP_lengths)[*cc];
542
7.08k
    goto REPEAT_BACK_REFERENCE;
543
544
    /* Single back reference by number. References by name are converted to by
545
    number when there is no duplication. */
546
547
23.2k
    case OP_REF:
548
30.6k
    case OP_REFI:
549
30.6k
    recno = GET2(cc, 1);
550
30.6k
    if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
551
18.8k
      d = backref_cache[recno];
552
11.7k
    else
553
11.7k
      {
554
11.7k
      int i;
555
11.7k
      d = 0;
556
557
11.7k
      if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
558
11.5k
        {
559
11.5k
        ce = cs = PRIV(find_bracket)(startcode, utf, recno);
560
11.5k
        if (cs == NULL) return -2;
561
12.7k
        do ce += GET(ce, 1); while (*ce == OP_ALT);
562
563
11.5k
        if (!dupcapused || PRIV(find_bracket)(ce, utf, recno) == NULL)
564
11.2k
          {
565
11.2k
          if (cc > cs && cc < ce)    /* Simple recursion */
566
619
            {
567
619
            had_recurse = TRUE;
568
619
            }
569
10.6k
          else
570
10.6k
            {
571
10.6k
            recurse_check *r = recurses;
572
52.1k
            for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
573
10.6k
            if (r != NULL)           /* Mutual recursion */
574
877
              {
575
877
              had_recurse = TRUE;
576
877
              }
577
9.75k
            else                     /* No recursion */
578
9.75k
              {
579
9.75k
              this_recurse.prev = recurses;
580
9.75k
              this_recurse.group = cs;
581
9.75k
              d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
582
9.75k
                backref_cache);
583
9.75k
              if (d < 0) return d;
584
9.75k
              }
585
10.6k
            }
586
11.2k
          }
587
11.5k
        }
588
589
11.6k
      backref_cache[recno] = d;
590
26.1k
      for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
591
11.6k
      backref_cache[0] = recno;
592
11.6k
      }
593
594
30.4k
    cc += PRIV(OP_lengths)[*cc];
595
596
    /* Handle repeated back references */
597
598
37.5k
    REPEAT_BACK_REFERENCE:
599
37.5k
    switch (*cc)
600
37.5k
      {
601
2.22k
      case OP_CRSTAR:
602
3.49k
      case OP_CRMINSTAR:
603
5.03k
      case OP_CRQUERY:
604
6.05k
      case OP_CRMINQUERY:
605
6.05k
      case OP_CRPOSSTAR:
606
6.05k
      case OP_CRPOSQUERY:
607
6.05k
      min = 0;
608
6.05k
      cc++;
609
6.05k
      break;
610
611
3.14k
      case OP_CRPLUS:
612
3.52k
      case OP_CRMINPLUS:
613
3.52k
      case OP_CRPOSPLUS:
614
3.52k
      min = 1;
615
3.52k
      cc++;
616
3.52k
      break;
617
618
2.62k
      case OP_CRRANGE:
619
3.61k
      case OP_CRMINRANGE:
620
3.61k
      case OP_CRPOSRANGE:
621
3.61k
      min = GET2(cc, 1);
622
3.61k
      cc += 1 + 2 * IMM2_SIZE;
623
3.61k
      break;
624
625
24.3k
      default:
626
24.3k
      min = 1;
627
24.3k
      break;
628
37.5k
      }
629
630
     /* Take care not to overflow: (1) min and d are ints, so check that their
631
     product is not greater than INT_MAX. (2) branchlength is limited to
632
     UINT16_MAX (checked at the top of the loop). */
633
634
37.5k
    if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
635
2.07k
      branchlength = UINT16_MAX;
636
35.5k
    else branchlength += min * d;
637
37.5k
    break;
638
639
    /* Recursion always refers to the first occurrence of a subpattern with a
640
    given number. Therefore, we can always make use of caching, even when the
641
    pattern contains multiple subpatterns with the same number. */
642
643
50.2k
    case OP_RECURSE:
644
50.2k
    cs = ce = startcode + GET(cc, 1);
645
50.2k
    recno = GET2(cs, 1+LINK_SIZE);
646
50.2k
    if (recno == prev_recurse_recno)
647
5.11k
      {
648
5.11k
      branchlength += prev_recurse_d;
649
5.11k
      }
650
45.1k
    else
651
45.1k
      {
652
98.8k
      do ce += GET(ce, 1); while (*ce == OP_ALT);
653
45.1k
      if (cc > cs && cc < ce)    /* Simple recursion */
654
23.8k
        had_recurse = TRUE;
655
21.2k
      else
656
21.2k
        {
657
21.2k
        recurse_check *r = recurses;
658
134k
        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
659
21.2k
        if (r != NULL)          /* Mutual recursion */
660
708
          had_recurse = TRUE;
661
20.5k
        else
662
20.5k
          {
663
20.5k
          this_recurse.prev = recurses;
664
20.5k
          this_recurse.group = cs;
665
20.5k
          prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
666
20.5k
            countptr, backref_cache);
667
20.5k
          if (prev_recurse_d < 0) return prev_recurse_d;
668
20.3k
          prev_recurse_recno = recno;
669
20.3k
          branchlength += prev_recurse_d;
670
20.3k
          }
671
21.2k
        }
672
45.1k
      }
673
50.0k
    cc += 1 + LINK_SIZE + once_fudge;
674
50.0k
    once_fudge = 0;
675
50.0k
    break;
676
677
    /* Anything else does not or need not match a character. We can get the
678
    item's length from the table, but for those that can match zero occurrences
679
    of a character, we must take special action for UTF-8 characters. As it
680
    happens, the "NOT" versions of these opcodes are used at present only for
681
    ASCII characters, so they could be omitted from this list. However, in
682
    future that may change, so we include them here so as not to leave a
683
    gotcha for a future maintainer. */
684
685
2.01k
    case OP_UPTO:
686
3.31k
    case OP_UPTOI:
687
3.98k
    case OP_NOTUPTO:
688
4.69k
    case OP_NOTUPTOI:
689
5.47k
    case OP_MINUPTO:
690
7.17k
    case OP_MINUPTOI:
691
8.06k
    case OP_NOTMINUPTO:
692
8.98k
    case OP_NOTMINUPTOI:
693
14.4k
    case OP_POSUPTO:
694
19.3k
    case OP_POSUPTOI:
695
21.6k
    case OP_NOTPOSUPTO:
696
23.8k
    case OP_NOTPOSUPTOI:
697
698
42.9k
    case OP_STAR:
699
56.7k
    case OP_STARI:
700
58.2k
    case OP_NOTSTAR:
701
59.1k
    case OP_NOTSTARI:
702
66.9k
    case OP_MINSTAR:
703
71.2k
    case OP_MINSTARI:
704
72.1k
    case OP_NOTMINSTAR:
705
73.2k
    case OP_NOTMINSTARI:
706
99.5k
    case OP_POSSTAR:
707
109k
    case OP_POSSTARI:
708
111k
    case OP_NOTPOSSTAR:
709
113k
    case OP_NOTPOSSTARI:
710
711
132k
    case OP_QUERY:
712
145k
    case OP_QUERYI:
713
147k
    case OP_NOTQUERY:
714
149k
    case OP_NOTQUERYI:
715
157k
    case OP_MINQUERY:
716
164k
    case OP_MINQUERYI:
717
168k
    case OP_NOTMINQUERY:
718
170k
    case OP_NOTMINQUERYI:
719
200k
    case OP_POSQUERY:
720
210k
    case OP_POSQUERYI:
721
211k
    case OP_NOTPOSQUERY:
722
212k
    case OP_NOTPOSQUERYI:
723
724
212k
    cc += PRIV(OP_lengths)[op];
725
212k
#ifdef SUPPORT_UNICODE
726
212k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
727
212k
#endif
728
212k
    break;
729
730
    /* Skip these, but we need to add in the name length. */
731
732
2.59k
    case OP_MARK:
733
3.11k
    case OP_COMMIT_ARG:
734
3.62k
    case OP_PRUNE_ARG:
735
6.06k
    case OP_SKIP_ARG:
736
7.73k
    case OP_THEN_ARG:
737
7.73k
    cc += PRIV(OP_lengths)[op] + cc[1];
738
7.73k
    break;
739
740
    /* The remaining opcodes are just skipped over. */
741
742
0
    case OP_CLOSE:
743
2.16k
    case OP_COMMIT:
744
5.02k
    case OP_FAIL:
745
8.26k
    case OP_PRUNE:
746
11.2k
    case OP_SET_SOM:
747
14.0k
    case OP_SKIP:
748
20.1k
    case OP_THEN:
749
20.1k
    cc += PRIV(OP_lengths)[op];
750
20.1k
    break;
751
752
    /* This should not occur: we list all opcodes explicitly so that when
753
    new ones get added they are properly considered. */
754
755
0
    default:
756
0
    PCRE2_DEBUG_UNREACHABLE();
757
0
    return -3;
758
10.1M
    }
759
10.1M
  }
760
761
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
762
0
return -3;                 /* Avoid compiler warnings */
763
399k
}
764
765
766
767
/*************************************************
768
*      Set a bit and maybe its alternate case    *
769
*************************************************/
770
771
/* Given a character, set its first code unit's bit in the table, and also the
772
corresponding bit for the other version of a letter if we are caseless.
773
774
Arguments:
775
  re            points to the regex block
776
  p             points to the first code unit of the character
777
  caseless      TRUE if caseless
778
  utf           TRUE for UTF mode
779
  ucp           TRUE for UCP mode
780
781
Returns:        pointer after the character
782
*/
783
784
static PCRE2_SPTR
785
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
786
  BOOL ucp)
787
297k
{
788
297k
uint32_t c = *p++;   /* First code unit */
789
790
297k
(void)utf;           /* Stop compiler warnings when UTF not supported */
791
297k
(void)ucp;
792
793
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
794
0xff. */
795
796
#if PCRE2_CODE_UNIT_WIDTH != 8
797
if (c > 0xff) SET_BIT(0xff); else
798
#endif
799
800
297k
SET_BIT(c);
801
802
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
803
the end of the character, even when caseless. */
804
805
297k
#ifdef SUPPORT_UNICODE
806
297k
if (utf)
807
24.4k
  {
808
24.4k
#if PCRE2_CODE_UNIT_WIDTH == 8
809
24.4k
  if (c >= 0xc0) GETUTF8INC(c, p);
810
#elif PCRE2_CODE_UNIT_WIDTH == 16
811
  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
812
#endif
813
24.4k
  }
814
297k
#endif  /* SUPPORT_UNICODE */
815
816
/* If caseless, handle the other case of the character. */
817
818
297k
if (caseless)
819
99.7k
  {
820
99.7k
#ifdef SUPPORT_UNICODE
821
99.7k
  if (utf || ucp)
822
41.2k
    {
823
41.2k
    c = UCD_OTHERCASE(c);
824
41.2k
#if PCRE2_CODE_UNIT_WIDTH == 8
825
41.2k
    if (utf)
826
14.1k
      {
827
14.1k
      PCRE2_UCHAR buff[6];
828
14.1k
      (void)PRIV(ord2utf)(c, buff);
829
14.1k
      SET_BIT(buff[0]);
830
14.1k
      }
831
27.1k
    else if (c < 256) SET_BIT(c);
832
#else  /* 16-bit or 32-bit mode */
833
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
834
#endif
835
41.2k
    }
836
837
58.4k
  else
838
58.4k
#endif  /* SUPPORT_UNICODE */
839
840
  /* Not UTF or UCP */
841
842
58.4k
  if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
843
99.7k
  }
844
845
297k
return p;
846
297k
}
847
848
849
850
/*************************************************
851
*     Set bits for a positive character type     *
852
*************************************************/
853
854
/* This function sets starting bits for a character type. In UTF-8 mode, we can
855
only do a direct setting for bytes less than 128, as otherwise there can be
856
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
857
environment, the tables will only recognize ASCII characters anyway, but in at
858
least one Windows environment, some higher bytes bits were set in the tables.
859
So we deal with that case by considering the UTF-8 encoding.
860
861
Arguments:
862
  re             the regex block
863
  cbit type      the type of character wanted
864
  table_limit    32 for non-UTF-8; 16 for UTF-8
865
866
Returns:         nothing
867
*/
868
869
static void
870
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
871
85.5k
{
872
85.5k
uint32_t c;
873
2.37M
for (c = 0; c < table_limit; c++)
874
2.28M
  re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
875
85.5k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
876
85.5k
if (table_limit == 32) return;
877
3.64M
for (c = 128; c < 256; c++)
878
3.61M
  {
879
3.61M
  if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
880
0
    {
881
0
    PCRE2_UCHAR buff[6];
882
0
    (void)PRIV(ord2utf)(c, buff);
883
0
    SET_BIT(buff[0]);
884
0
    }
885
3.61M
  }
886
28.2k
#endif  /* UTF-8 */
887
28.2k
}
888
889
890
/*************************************************
891
*     Set bits for a negative character type     *
892
*************************************************/
893
894
/* This function sets starting bits for a negative character type such as \D.
895
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
896
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
897
Unlike in the positive case, where we can set appropriate starting bits for
898
specific high-valued UTF-8 characters, in this case we have to set the bits for
899
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
900
0xc0 (192) for simplicity.
901
902
Arguments:
903
  re             the regex block
904
  cbit type      the type of character wanted
905
  table_limit    32 for non-UTF-8; 16 for UTF-8
906
907
Returns:         nothing
908
*/
909
910
static void
911
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
912
30.1k
{
913
30.1k
uint32_t c;
914
860k
for (c = 0; c < table_limit; c++)
915
830k
  re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
916
30.1k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
917
75.1k
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
918
30.1k
#endif
919
30.1k
}
920
921
922
923
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
924
/*************************************************
925
*     Set starting bits for a character list.    *
926
*************************************************/
927
928
/* This function sets starting bits for a character list. It enumerates
929
all characters and character ranges in the character list, and sets
930
the starting bits accordingly.
931
932
Arguments:
933
  code           pointer to the code
934
  start_bitmap   pointer to the starting bitmap
935
936
Returns:         nothing
937
*/
938
static void
939
study_char_list(PCRE2_SPTR code, uint8_t *start_bitmap,
940
  const uint8_t *char_lists_end)
941
1.16k
{
942
1.16k
uint32_t type, list_ind;
943
1.16k
uint32_t char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
944
1.16k
uint32_t range_start = ~(uint32_t)0, range_end = 0;
945
1.16k
const uint8_t *next_char;
946
1.16k
PCRE2_UCHAR start_buffer[6], end_buffer[6];
947
1.16k
PCRE2_UCHAR start, end;
948
949
/* Only needed in 8-bit mode at the moment. */
950
1.16k
type = (uint32_t)(code[0] << 8) | code[1];
951
1.16k
code += 2;
952
953
/* Align characters. */
954
1.16k
next_char = char_lists_end - (GET(code, 0) << 1);
955
1.16k
type &= XCL_TYPE_MASK;
956
1.16k
list_ind = 0;
957
958
1.16k
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
959
684
  range_start = XCL_CHAR_LIST_LOW_16_START;
960
961
4.15k
while (type > 0)
962
2.98k
  {
963
2.98k
  uint32_t item_count = type & XCL_ITEM_COUNT_MASK;
964
965
2.98k
  if (item_count == XCL_ITEM_COUNT_MASK)
966
1.54k
    {
967
1.54k
    if (list_ind <= 1)
968
1.26k
      {
969
1.26k
      item_count = *(const uint16_t*)next_char;
970
1.26k
      next_char += 2;
971
1.26k
      }
972
279
    else
973
279
      {
974
279
      item_count = *(const uint32_t*)next_char;
975
279
      next_char += 4;
976
279
      }
977
1.54k
    }
978
979
17.9k
  while (item_count > 0)
980
14.9k
    {
981
14.9k
    if (list_ind <= 1)
982
12.7k
      {
983
12.7k
      range_end = *(const uint16_t*)next_char;
984
12.7k
      next_char += 2;
985
12.7k
      }
986
2.19k
    else
987
2.19k
      {
988
2.19k
      range_end = *(const uint32_t*)next_char;
989
2.19k
      next_char += 4;
990
2.19k
      }
991
992
14.9k
    if ((range_end & XCL_CHAR_END) != 0)
993
10.1k
      {
994
10.1k
      range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);
995
996
10.1k
      PRIV(ord2utf)(range_end, end_buffer);
997
10.1k
      end = end_buffer[0];
998
999
10.1k
      if (range_start < range_end)
1000
5.49k
        {
1001
5.49k
        PRIV(ord2utf)(range_start, start_buffer);
1002
37.6k
        for (start = start_buffer[0]; start <= end; start++)
1003
32.1k
          start_bitmap[start / 8] |= (1u << (start & 7));
1004
5.49k
        }
1005
4.63k
      else
1006
4.63k
        start_bitmap[end / 8] |= (1u << (end & 7));
1007
1008
10.1k
      range_start = ~(uint32_t)0;
1009
10.1k
      }
1010
4.79k
    else
1011
4.79k
      range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);
1012
1013
14.9k
    item_count--;
1014
14.9k
    }
1015
1016
2.98k
  list_ind++;
1017
2.98k
  type >>= XCL_TYPE_BIT_LEN;
1018
1019
2.98k
  if (range_start == ~(uint32_t)0)
1020
1.92k
    {
1021
1.92k
    if ((type & XCL_BEGIN_WITH_RANGE) != 0)
1022
15
      {
1023
      /* In 8 bit mode XCL_CHAR_LIST_HIGH_32_START is not possible. */
1024
15
      if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;
1025
1
      else range_start = XCL_CHAR_LIST_LOW_32_START;
1026
15
      }
1027
1.92k
    }
1028
1.06k
  else if ((type & XCL_BEGIN_WITH_RANGE) == 0)
1029
0
    {
1030
0
    PRIV(ord2utf)(range_start, start_buffer);
1031
1032
    /* In 8 bit mode XCL_CHAR_LIST_LOW_32_END and
1033
    XCL_CHAR_LIST_HIGH_32_END are not possible. */
1034
0
    if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;
1035
0
    else range_end = XCL_CHAR_LIST_HIGH_16_END;
1036
1037
0
    PRIV(ord2utf)(range_end, end_buffer);
1038
0
    end = end_buffer[0];
1039
1040
0
    for (start = start_buffer[0]; start <= end; start++)
1041
0
      start_bitmap[start / 8] |= (1u << (start & 7));
1042
1043
0
    range_start = ~(uint32_t)0;
1044
0
    }
1045
1046
  /* In 8 bit mode XCL_CHAR_LIST_HIGH_32_ADD is not possible. */
1047
2.98k
  if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
1048
1.82k
  else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
1049
2.98k
  }
1050
1.16k
}
1051
#endif
1052
1053
1054
1055
/*************************************************
1056
*      Create bitmap of starting code units      *
1057
*************************************************/
1058
1059
/* This function scans a compiled unanchored expression recursively and
1060
attempts to build a bitmap of the set of possible starting code units whose
1061
values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
1062
the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
1063
we pass a value of 16 rather than 32 as the final argument. (See comments in
1064
those functions for the reason.)
1065
1066
The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
1067
(a*)b where the group provides some optional starting code units but scanning
1068
must continue at the outer level to find at least one mandatory code unit. At
1069
the outermost level, this function fails unless the result is SSB_DONE.
1070
1071
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
1072
issues.
1073
1074
Arguments:
1075
  re           points to the compiled regex block
1076
  code         points to an expression
1077
  utf          TRUE if in UTF mode
1078
  ucp          TRUE if in UCP mode
1079
  depthptr     pointer to recurse depth
1080
1081
Returns:       SSB_FAIL     => Failed to find any starting code units
1082
               SSB_DONE     => Found mandatory starting code units
1083
               SSB_CONTINUE => Found optional starting code units
1084
               SSB_UNKNOWN  => Hit an unrecognized opcode
1085
               SSB_TOODEEP  => Recursion is too deep
1086
*/
1087
1088
static int
1089
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
1090
  int *depthptr)
1091
208k
{
1092
208k
uint32_t c;
1093
208k
int yield = SSB_DONE;
1094
1095
208k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1096
208k
int table_limit = utf? 16:32;
1097
#else
1098
int table_limit = 32;
1099
#endif
1100
1101
208k
*depthptr += 1;
1102
208k
if (*depthptr > 1000) return SSB_TOODEEP;
1103
1104
208k
do
1105
825k
  {
1106
825k
  BOOL try_next = TRUE;
1107
825k
  PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
1108
1109
825k
  if (*code == OP_CBRA || *code == OP_SCBRA ||
1110
825k
      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
1111
1112
2.07M
  while (try_next)    /* Loop for items in this branch */
1113
1.35M
    {
1114
1.35M
    int rc;
1115
1.35M
    PCRE2_SPTR ncode;
1116
1.35M
    const uint8_t *classmap = NULL;
1117
1.35M
#ifdef SUPPORT_WIDE_CHARS
1118
1.35M
    PCRE2_UCHAR xclassflags;
1119
1.35M
#endif
1120
1121
1.35M
    switch(*tcode)
1122
1.35M
      {
1123
      /* If we reach something we don't understand, it means a new opcode has
1124
      been created that hasn't been added to this function. Hopefully this
1125
      problem will be discovered during testing. */
1126
1127
0
      default:
1128
0
      return SSB_UNKNOWN;
1129
1130
      /* Fail for a valid opcode that implies no starting bits. */
1131
1132
64
      case OP_ACCEPT:
1133
84
      case OP_ASSERT_ACCEPT:
1134
735
      case OP_ALLANY:
1135
4.32k
      case OP_ANY:
1136
4.32k
      case OP_ANYBYTE:
1137
4.45k
      case OP_CIRCM:
1138
4.49k
      case OP_CLOSE:
1139
4.51k
      case OP_COMMIT:
1140
4.52k
      case OP_COMMIT_ARG:
1141
4.89k
      case OP_COND:
1142
4.94k
      case OP_CREF:
1143
4.94k
      case OP_FALSE:
1144
4.95k
      case OP_TRUE:
1145
4.95k
      case OP_DNCREF:
1146
4.97k
      case OP_DNREF:
1147
4.97k
      case OP_DNREFI:
1148
4.97k
      case OP_DNRREF:
1149
6.80k
      case OP_DOLL:
1150
7.25k
      case OP_DOLLM:
1151
7.25k
      case OP_END:
1152
7.33k
      case OP_EOD:
1153
7.46k
      case OP_EODN:
1154
8.10k
      case OP_EXTUNI:
1155
8.12k
      case OP_FAIL:
1156
8.16k
      case OP_MARK:
1157
8.39k
      case OP_NOT:
1158
8.48k
      case OP_NOTEXACT:
1159
8.50k
      case OP_NOTEXACTI:
1160
8.55k
      case OP_NOTI:
1161
8.60k
      case OP_NOTMINPLUS:
1162
8.62k
      case OP_NOTMINPLUSI:
1163
8.64k
      case OP_NOTMINQUERY:
1164
8.66k
      case OP_NOTMINQUERYI:
1165
8.68k
      case OP_NOTMINSTAR:
1166
8.71k
      case OP_NOTMINSTARI:
1167
8.74k
      case OP_NOTMINUPTO:
1168
8.76k
      case OP_NOTMINUPTOI:
1169
8.90k
      case OP_NOTPLUS:
1170
8.95k
      case OP_NOTPLUSI:
1171
9.02k
      case OP_NOTPOSPLUS:
1172
9.03k
      case OP_NOTPOSPLUSI:
1173
9.08k
      case OP_NOTPOSQUERY:
1174
9.09k
      case OP_NOTPOSQUERYI:
1175
9.12k
      case OP_NOTPOSSTAR:
1176
9.14k
      case OP_NOTPOSSTARI:
1177
9.15k
      case OP_NOTPOSUPTO:
1178
9.16k
      case OP_NOTPOSUPTOI:
1179
10.2k
      case OP_NOTPROP:
1180
10.3k
      case OP_NOTQUERY:
1181
10.3k
      case OP_NOTQUERYI:
1182
10.4k
      case OP_NOTSTAR:
1183
10.4k
      case OP_NOTSTARI:
1184
10.5k
      case OP_NOTUPTO:
1185
10.5k
      case OP_NOTUPTOI:
1186
12.0k
      case OP_NOT_HSPACE:
1187
12.7k
      case OP_NOT_VSPACE:
1188
12.7k
      case OP_PRUNE:
1189
12.7k
      case OP_PRUNE_ARG:
1190
13.2k
      case OP_RECURSE:
1191
13.3k
      case OP_REF:
1192
13.4k
      case OP_REFI:
1193
13.4k
      case OP_REVERSE:
1194
13.5k
      case OP_VREVERSE:
1195
13.5k
      case OP_RREF:
1196
13.5k
      case OP_SCOND:
1197
13.6k
      case OP_SET_SOM:
1198
13.6k
      case OP_SKIP:
1199
13.7k
      case OP_SKIP_ARG:
1200
13.8k
      case OP_SOD:
1201
13.8k
      case OP_SOM:
1202
14.0k
      case OP_THEN:
1203
14.0k
      case OP_THEN_ARG:
1204
14.0k
      return SSB_FAIL;
1205
1206
      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1207
      uses OP_CIRCM). Skip over it. */
1208
1209
7.06k
      case OP_CIRC:
1210
7.06k
      tcode += PRIV(OP_lengths)[OP_CIRC];
1211
7.06k
      break;
1212
1213
      /* A "real" property test implies no starting bits, but the fake property
1214
      PT_CLIST identifies a list of characters. These lists are short, as they
1215
      are used for characters with more than one "other case", so there is no
1216
      point in recognizing them for OP_NOTPROP. */
1217
1218
4.48k
      case OP_PROP:
1219
4.48k
      if (tcode[1] != PT_CLIST) return SSB_FAIL;
1220
3.93k
        {
1221
3.93k
        const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1222
15.7k
        while ((c = *p++) < NOTACHAR)
1223
11.8k
          {
1224
11.8k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1225
11.8k
          if (utf)
1226
2.30k
            {
1227
2.30k
            PCRE2_UCHAR buff[6];
1228
2.30k
            (void)PRIV(ord2utf)(c, buff);
1229
2.30k
            c = buff[0];
1230
2.30k
            }
1231
11.8k
#endif
1232
11.8k
          if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1233
11.8k
          }
1234
3.93k
        }
1235
3.93k
      try_next = FALSE;
1236
3.93k
      break;
1237
1238
      /* We can ignore word boundary tests. */
1239
1240
2.34k
      case OP_WORD_BOUNDARY:
1241
3.43k
      case OP_NOT_WORD_BOUNDARY:
1242
5.44k
      case OP_UCP_WORD_BOUNDARY:
1243
6.04k
      case OP_NOT_UCP_WORD_BOUNDARY:
1244
6.04k
      tcode++;
1245
6.04k
      break;
1246
1247
      /* For a positive lookahead assertion, inspect what immediately follows,
1248
      ignoring intermediate assertions and callouts. If the next item is one
1249
      that sets a mandatory character, skip this assertion. Otherwise, treat it
1250
      the same as other bracket groups. */
1251
1252
8.26k
      case OP_ASSERT:
1253
48.2k
      case OP_ASSERT_NA:
1254
48.2k
      ncode = tcode + GET(tcode, 1);
1255
159k
      while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1256
48.2k
      ncode += 1 + LINK_SIZE;
1257
1258
      /* Skip irrelevant items */
1259
1260
12.9M
      for (BOOL done = FALSE; !done;)
1261
12.9M
        {
1262
12.9M
        switch (*ncode)
1263
12.9M
          {
1264
10.4M
          case OP_ASSERT:
1265
10.4M
          case OP_ASSERT_NOT:
1266
10.4M
          case OP_ASSERTBACK:
1267
10.4M
          case OP_ASSERTBACK_NOT:
1268
12.8M
          case OP_ASSERT_NA:
1269
12.8M
          case OP_ASSERTBACK_NA:
1270
12.8M
          case OP_ASSERT_SCS:
1271
12.8M
          ncode += GET(ncode, 1);
1272
35.6M
          while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1273
12.8M
          ncode += 1 + LINK_SIZE;
1274
12.8M
          break;
1275
1276
2.19k
          case OP_WORD_BOUNDARY:
1277
3.30k
          case OP_NOT_WORD_BOUNDARY:
1278
4.86k
          case OP_UCP_WORD_BOUNDARY:
1279
5.95k
          case OP_NOT_UCP_WORD_BOUNDARY:
1280
5.95k
          ncode++;
1281
5.95k
          break;
1282
1283
8.02k
          case OP_CALLOUT:
1284
8.02k
          ncode += PRIV(OP_lengths)[OP_CALLOUT];
1285
8.02k
          break;
1286
1287
1.34k
          case OP_CALLOUT_STR:
1288
1.34k
          ncode += GET(ncode, 1 + 2*LINK_SIZE);
1289
1.34k
          break;
1290
1291
48.2k
          default:
1292
48.2k
          done = TRUE;
1293
48.2k
          break;
1294
12.9M
          }
1295
12.9M
        }
1296
1297
      /* Now check the next significant item. */
1298
1299
48.2k
      switch(*ncode)
1300
48.2k
        {
1301
23.6k
        default:
1302
23.6k
        break;
1303
1304
23.6k
        case OP_PROP:
1305
538
        if (ncode[1] != PT_CLIST) break;
1306
        /* Fall through */
1307
1.10k
        case OP_ANYNL:
1308
7.81k
        case OP_CHAR:
1309
10.0k
        case OP_CHARI:
1310
10.6k
        case OP_EXACT:
1311
11.5k
        case OP_EXACTI:
1312
12.8k
        case OP_HSPACE:
1313
13.2k
        case OP_MINPLUS:
1314
13.9k
        case OP_MINPLUSI:
1315
14.9k
        case OP_PLUS:
1316
16.0k
        case OP_PLUSI:
1317
17.7k
        case OP_POSPLUS:
1318
18.0k
        case OP_POSPLUSI:
1319
18.8k
        case OP_VSPACE:
1320
        /* Note that these types will only be present in non-UCP mode. */
1321
19.2k
        case OP_DIGIT:
1322
19.4k
        case OP_NOT_DIGIT:
1323
19.7k
        case OP_WORDCHAR:
1324
21.6k
        case OP_NOT_WORDCHAR:
1325
23.3k
        case OP_WHITESPACE:
1326
24.3k
        case OP_NOT_WHITESPACE:
1327
24.3k
        tcode = ncode;
1328
24.3k
        continue;   /* With the following significant opcode */
1329
48.2k
        }
1330
      /* Fall through */
1331
1332
      /* For a group bracket or a positive assertion without an immediately
1333
      following mandatory setting, recurse to set bits from within the
1334
      subpattern. If it can't find anything, we have to give up. If it finds
1335
      some mandatory character(s), we are done for this branch. Otherwise,
1336
      carry on scanning after the subpattern. */
1337
1338
60.6k
      case OP_BRA:
1339
60.9k
      case OP_SBRA:
1340
115k
      case OP_CBRA:
1341
116k
      case OP_SCBRA:
1342
116k
      case OP_BRAPOS:
1343
117k
      case OP_SBRAPOS:
1344
118k
      case OP_CBRAPOS:
1345
120k
      case OP_SCBRAPOS:
1346
124k
      case OP_ONCE:
1347
125k
      case OP_SCRIPT_RUN:
1348
125k
      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1349
125k
      if (rc == SSB_DONE)
1350
14.6k
        {
1351
14.6k
        try_next = FALSE;
1352
14.6k
        }
1353
110k
      else if (rc == SSB_CONTINUE)
1354
108k
        {
1355
444k
        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1356
108k
        tcode += 1 + LINK_SIZE;
1357
108k
        }
1358
2.50k
      else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1359
122k
      break;
1360
1361
      /* If we hit ALT or KET, it means we haven't found anything mandatory in
1362
      this branch, though we might have found something optional. For ALT, we
1363
      continue with the next alternative, but we have to arrange that the final
1364
      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1365
      return SSB_CONTINUE: if this is the top level, that indicates failure,
1366
      but after a nested subpattern, it causes scanning to continue. */
1367
1368
322k
      case OP_ALT:
1369
322k
      yield = SSB_CONTINUE;
1370
322k
      try_next = FALSE;
1371
322k
      break;
1372
1373
64.7k
      case OP_KET:
1374
65.1k
      case OP_KETRMAX:
1375
65.7k
      case OP_KETRMIN:
1376
67.9k
      case OP_KETRPOS:
1377
67.9k
      return SSB_CONTINUE;
1378
1379
      /* Skip over callout */
1380
1381
172k
      case OP_CALLOUT:
1382
172k
      tcode += PRIV(OP_lengths)[OP_CALLOUT];
1383
172k
      break;
1384
1385
2.60k
      case OP_CALLOUT_STR:
1386
2.60k
      tcode += GET(tcode, 1 + 2*LINK_SIZE);
1387
2.60k
      break;
1388
1389
      /* Skip over lookbehind, negative lookahead, and scan substring
1390
      assertions */
1391
1392
7.85k
      case OP_ASSERT_NOT:
1393
37.4k
      case OP_ASSERTBACK:
1394
61.5k
      case OP_ASSERTBACK_NOT:
1395
63.1k
      case OP_ASSERTBACK_NA:
1396
79.0k
      case OP_ASSERT_SCS:
1397
154k
      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1398
79.0k
      tcode += 1 + LINK_SIZE;
1399
79.0k
      break;
1400
1401
      /* BRAZERO does the bracket, but carries on. */
1402
1403
35.3k
      case OP_BRAZERO:
1404
46.5k
      case OP_BRAMINZERO:
1405
47.9k
      case OP_BRAPOSZERO:
1406
47.9k
      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1407
47.9k
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1408
42.7k
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1409
24.9k
      tcode += 1 + LINK_SIZE;
1410
24.9k
      break;
1411
1412
      /* SKIPZERO skips the bracket. */
1413
1414
410
      case OP_SKIPZERO:
1415
410
      tcode++;
1416
961
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1417
410
      tcode += 1 + LINK_SIZE;
1418
410
      break;
1419
1420
      /* Single-char * or ? sets the bit and tries the next item */
1421
1422
4.88k
      case OP_STAR:
1423
6.30k
      case OP_MINSTAR:
1424
15.9k
      case OP_POSSTAR:
1425
20.0k
      case OP_QUERY:
1426
21.2k
      case OP_MINQUERY:
1427
25.7k
      case OP_POSQUERY:
1428
25.7k
      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1429
25.7k
      break;
1430
1431
3.85k
      case OP_STARI:
1432
6.52k
      case OP_MINSTARI:
1433
9.02k
      case OP_POSSTARI:
1434
11.7k
      case OP_QUERYI:
1435
14.4k
      case OP_MINQUERYI:
1436
17.4k
      case OP_POSQUERYI:
1437
17.4k
      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1438
17.4k
      break;
1439
1440
      /* Single-char upto sets the bit and tries the next */
1441
1442
936
      case OP_UPTO:
1443
1.59k
      case OP_MINUPTO:
1444
3.53k
      case OP_POSUPTO:
1445
3.53k
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1446
3.53k
      break;
1447
1448
270
      case OP_UPTOI:
1449
930
      case OP_MINUPTOI:
1450
1.47k
      case OP_POSUPTOI:
1451
1.47k
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1452
1.47k
      break;
1453
1454
      /* At least one single char sets the bit and stops */
1455
1456
1.09k
      case OP_EXACT:
1457
1.09k
      tcode += IMM2_SIZE;
1458
      /* Fall through */
1459
159k
      case OP_CHAR:
1460
163k
      case OP_PLUS:
1461
164k
      case OP_MINPLUS:
1462
168k
      case OP_POSPLUS:
1463
168k
      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1464
168k
      try_next = FALSE;
1465
168k
      break;
1466
1467
1.52k
      case OP_EXACTI:
1468
1.52k
      tcode += IMM2_SIZE;
1469
      /* Fall through */
1470
75.5k
      case OP_CHARI:
1471
78.3k
      case OP_PLUSI:
1472
79.7k
      case OP_MINPLUSI:
1473
80.8k
      case OP_POSPLUSI:
1474
80.8k
      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1475
80.8k
      try_next = FALSE;
1476
80.8k
      break;
1477
1478
      /* Special spacing and line-terminating items. These recognize specific
1479
      lists of characters. The difference between VSPACE and ANYNL is that the
1480
      latter can match the two-character CRLF sequence, but that is not
1481
      relevant for finding the first character, so their code here is
1482
      identical. */
1483
1484
2.71k
      case OP_HSPACE:
1485
2.71k
      SET_BIT(CHAR_HT);
1486
2.71k
      SET_BIT(CHAR_SPACE);
1487
1488
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1489
      the bits for NBSP and for code units >= 255, independently of UTF. */
1490
1491
#if PCRE2_CODE_UNIT_WIDTH != 8
1492
      SET_BIT(CHAR_NBSP);
1493
      SET_BIT(0xFF);
1494
#else
1495
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1496
      units of horizontal space characters. */
1497
1498
2.71k
#ifdef SUPPORT_UNICODE
1499
2.71k
      if (utf)
1500
476
        {
1501
476
        SET_BIT(0xC2);  /* For U+00A0 */
1502
476
        SET_BIT(0xE1);  /* For U+1680, U+180E */
1503
476
        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1504
476
        SET_BIT(0xE3);  /* For U+3000 */
1505
476
        }
1506
2.23k
      else
1507
2.23k
#endif
1508
      /* For the 8-bit library not in UTF-8 mode, set the bit for NBSP. */
1509
2.23k
        {
1510
2.23k
        SET_BIT(CHAR_NBSP);
1511
2.23k
        }
1512
2.71k
#endif  /* 8-bit support */
1513
1514
2.71k
      try_next = FALSE;
1515
2.71k
      break;
1516
1517
4.61k
      case OP_ANYNL:
1518
5.93k
      case OP_VSPACE:
1519
5.93k
      SET_BIT(CHAR_LF);
1520
5.93k
      SET_BIT(CHAR_VT);
1521
5.93k
      SET_BIT(CHAR_FF);
1522
5.93k
      SET_BIT(CHAR_CR);
1523
1524
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1525
      the bits for NEL and for code units >= 255, independently of UTF. */
1526
1527
#if PCRE2_CODE_UNIT_WIDTH != 8
1528
      SET_BIT(CHAR_NEL);
1529
      SET_BIT(0xFF);
1530
#else
1531
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1532
      units of vertical space characters. */
1533
1534
5.93k
#ifdef SUPPORT_UNICODE
1535
5.93k
      if (utf)
1536
1.05k
        {
1537
1.05k
        SET_BIT(0xC2);  /* For U+0085 (NEL) */
1538
1.05k
        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1539
1.05k
        }
1540
4.87k
      else
1541
4.87k
#endif
1542
      /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1543
4.87k
        {
1544
4.87k
        SET_BIT(CHAR_NEL);
1545
4.87k
        }
1546
5.93k
#endif  /* 8-bit support */
1547
1548
5.93k
      try_next = FALSE;
1549
5.93k
      break;
1550
1551
      /* Single character types set the bits and stop. Note that if PCRE2_UCP
1552
      is set, we do not see these opcodes because \d etc are converted to
1553
      properties. Therefore, these apply in the case when only characters less
1554
      than 256 are recognized to match the types. */
1555
1556
2.21k
      case OP_NOT_DIGIT:
1557
2.21k
      set_nottype_bits(re, cbit_digit, table_limit);
1558
2.21k
      try_next = FALSE;
1559
2.21k
      break;
1560
1561
8.79k
      case OP_DIGIT:
1562
8.79k
      set_type_bits(re, cbit_digit, table_limit);
1563
8.79k
      try_next = FALSE;
1564
8.79k
      break;
1565
1566
11.2k
      case OP_NOT_WHITESPACE:
1567
11.2k
      set_nottype_bits(re, cbit_space, table_limit);
1568
11.2k
      try_next = FALSE;
1569
11.2k
      break;
1570
1571
11.6k
      case OP_WHITESPACE:
1572
11.6k
      set_type_bits(re, cbit_space, table_limit);
1573
11.6k
      try_next = FALSE;
1574
11.6k
      break;
1575
1576
8.29k
      case OP_NOT_WORDCHAR:
1577
8.29k
      set_nottype_bits(re, cbit_word, table_limit);
1578
8.29k
      try_next = FALSE;
1579
8.29k
      break;
1580
1581
50.5k
      case OP_WORDCHAR:
1582
50.5k
      set_type_bits(re, cbit_word, table_limit);
1583
50.5k
      try_next = FALSE;
1584
50.5k
      break;
1585
1586
      /* One or more character type fudges the pointer and restarts, knowing
1587
      it will hit a single character type and stop there. */
1588
1589
4.27k
      case OP_TYPEPLUS:
1590
6.07k
      case OP_TYPEMINPLUS:
1591
7.91k
      case OP_TYPEPOSPLUS:
1592
7.91k
      tcode++;
1593
7.91k
      break;
1594
1595
4.21k
      case OP_TYPEEXACT:
1596
4.21k
      tcode += 1 + IMM2_SIZE;
1597
4.21k
      break;
1598
1599
      /* Zero or more repeats of character types set the bits and then
1600
      try again. */
1601
1602
1.35k
      case OP_TYPEUPTO:
1603
3.04k
      case OP_TYPEMINUPTO:
1604
3.60k
      case OP_TYPEPOSUPTO:
1605
3.60k
      tcode += IMM2_SIZE;  /* Fall through */
1606
1607
11.3k
      case OP_TYPESTAR:
1608
13.3k
      case OP_TYPEMINSTAR:
1609
15.2k
      case OP_TYPEPOSSTAR:
1610
22.4k
      case OP_TYPEQUERY:
1611
26.6k
      case OP_TYPEMINQUERY:
1612
31.6k
      case OP_TYPEPOSQUERY:
1613
31.6k
      switch(tcode[1])
1614
31.6k
        {
1615
1.98k
        default:
1616
3.27k
        case OP_ANY:
1617
3.64k
        case OP_ALLANY:
1618
3.64k
        return SSB_FAIL;
1619
1620
1.20k
        case OP_HSPACE:
1621
1.20k
        SET_BIT(CHAR_HT);
1622
1.20k
        SET_BIT(CHAR_SPACE);
1623
1624
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1625
        the bits for NBSP and for code units >= 255, independently of UTF. */
1626
1627
#if PCRE2_CODE_UNIT_WIDTH != 8
1628
        SET_BIT(CHAR_NBSP);
1629
        SET_BIT(0xFF);
1630
#else
1631
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1632
        units of horizontal space characters. */
1633
1634
1.20k
#ifdef SUPPORT_UNICODE
1635
1.20k
        if (utf)
1636
351
          {
1637
351
          SET_BIT(0xC2);  /* For U+00A0 */
1638
351
          SET_BIT(0xE1);  /* For U+1680, U+180E */
1639
351
          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1640
351
          SET_BIT(0xE3);  /* For U+3000 */
1641
351
          }
1642
850
        else
1643
850
#endif
1644
        /* For the 8-bit library not in UTF-8 mode, set the bit for NBSP. */
1645
850
          {
1646
850
          SET_BIT(CHAR_NBSP);
1647
850
          }
1648
1.20k
#endif  /* 8-bit support */
1649
1.20k
        break;
1650
1651
2.92k
        case OP_ANYNL:
1652
3.73k
        case OP_VSPACE:
1653
3.73k
        SET_BIT(CHAR_LF);
1654
3.73k
        SET_BIT(CHAR_VT);
1655
3.73k
        SET_BIT(CHAR_FF);
1656
3.73k
        SET_BIT(CHAR_CR);
1657
1658
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1659
        the bits for NEL and for code units >= 255, independently of UTF. */
1660
1661
#if PCRE2_CODE_UNIT_WIDTH != 8
1662
        SET_BIT(CHAR_NEL);
1663
        SET_BIT(0xFF);
1664
#else
1665
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1666
        units of vertical space characters. */
1667
1668
3.73k
#ifdef SUPPORT_UNICODE
1669
3.73k
        if (utf)
1670
358
          {
1671
358
          SET_BIT(0xC2);  /* For U+0085 (NEL) */
1672
358
          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1673
358
          }
1674
3.37k
        else
1675
3.37k
#endif
1676
        /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1677
3.37k
          {
1678
3.37k
          SET_BIT(CHAR_NEL);
1679
3.37k
          }
1680
3.73k
#endif  /* 8-bit support */
1681
3.73k
        break;
1682
1683
1.33k
        case OP_NOT_DIGIT:
1684
1.33k
        set_nottype_bits(re, cbit_digit, table_limit);
1685
1.33k
        break;
1686
1687
4.25k
        case OP_DIGIT:
1688
4.25k
        set_type_bits(re, cbit_digit, table_limit);
1689
4.25k
        break;
1690
1691
5.18k
        case OP_NOT_WHITESPACE:
1692
5.18k
        set_nottype_bits(re, cbit_space, table_limit);
1693
5.18k
        break;
1694
1695
6.59k
        case OP_WHITESPACE:
1696
6.59k
        set_type_bits(re, cbit_space, table_limit);
1697
6.59k
        break;
1698
1699
1.87k
        case OP_NOT_WORDCHAR:
1700
1.87k
        set_nottype_bits(re, cbit_word, table_limit);
1701
1.87k
        break;
1702
1703
3.82k
        case OP_WORDCHAR:
1704
3.82k
        set_type_bits(re, cbit_word, table_limit);
1705
3.82k
        break;
1706
31.6k
        }
1707
1708
28.0k
      tcode += 2;
1709
28.0k
      break;
1710
1711
      /* Set-based ECLASS: treat it the same as a "complex" XCLASS; give up. */
1712
1713
0
#ifdef SUPPORT_WIDE_CHARS
1714
121
      case OP_ECLASS:
1715
121
      return SSB_FAIL;
1716
0
#endif
1717
1718
      /* Extended class: if there are any property checks, or if this is a
1719
      negative XCLASS without a map, give up. If there are no property checks,
1720
      there must be wide characters on the XCLASS list, because otherwise an
1721
      XCLASS would not have been created. This means that code points >= 255
1722
      are potential starters. In the UTF-8 case we can scan them and set bits
1723
      for the relevant leading bytes. */
1724
1725
0
#ifdef SUPPORT_WIDE_CHARS
1726
4.12k
      case OP_XCLASS:
1727
4.12k
      xclassflags = tcode[1 + LINK_SIZE];
1728
4.12k
      if ((xclassflags & XCL_HASPROP) != 0 ||
1729
4.12k
          (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1730
327
        return SSB_FAIL;
1731
1732
      /* We have a positive XCLASS or a negative one without a map. Set up the
1733
      map pointer if there is one, and fall through. */
1734
1735
3.79k
      classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1736
3.79k
        (const uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1737
1738
      /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1739
      then jump to handle the map. */
1740
1741
3.79k
#if PCRE2_CODE_UNIT_WIDTH == 8
1742
3.79k
      if (utf && (xclassflags & XCL_NOT) == 0)
1743
3.08k
        {
1744
3.08k
        PCRE2_UCHAR b, e;
1745
3.08k
        PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1746
3.08k
        tcode += GET(tcode, 1);
1747
1748
3.08k
        if (*p >= XCL_LIST)
1749
1.16k
          {
1750
1.16k
          study_char_list(p, re->start_bitmap,
1751
1.16k
            ((const uint8_t *)re + re->code_start));
1752
1.16k
          goto HANDLE_CLASSMAP;
1753
1.16k
          }
1754
1755
4.89k
        for (;;) switch (*p++)
1756
4.89k
          {
1757
2.09k
          case XCL_SINGLE:
1758
2.09k
          b = *p++;
1759
6.40k
          while ((*p & 0xc0) == 0x80) p++;
1760
2.09k
          re->start_bitmap[b/8] |= (1u << (b&7));
1761
2.09k
          break;
1762
1763
888
          case XCL_RANGE:
1764
888
          b = *p++;
1765
2.13k
          while ((*p & 0xc0) == 0x80) p++;
1766
888
          e = *p++;
1767
2.67k
          while ((*p & 0xc0) == 0x80) p++;
1768
15.0k
          for (; b <= e; b++)
1769
14.1k
            re->start_bitmap[b/8] |= (1u << (b&7));
1770
888
          break;
1771
1772
1.91k
          case XCL_END:
1773
1.91k
          goto HANDLE_CLASSMAP;
1774
1775
0
          default:
1776
0
          PCRE2_DEBUG_UNREACHABLE();
1777
0
          return SSB_UNKNOWN;   /* Internal error, should not occur */
1778
4.89k
          }
1779
1.91k
        }
1780
714
#endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1781
714
#endif  /* SUPPORT_WIDE_CHARS */
1782
1783
      /* It seems that the fall through comment must be outside the #ifdef if
1784
      it is to avoid the gcc compiler warning. */
1785
1786
      /* Fall through */
1787
1788
      /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1789
      in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1790
      because it starts a character with a value > 255. In 8-bit non-UTF mode,
1791
      there is no difference between CLASS and NCLASS. In all other wide
1792
      character modes, set the 0xFF bit to indicate code units >= 255. */
1793
1794
12.9k
      case OP_NCLASS:
1795
12.9k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1796
12.9k
      if (utf)
1797
5.03k
        {
1798
5.03k
        re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1799
5.03k
        memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1800
5.03k
        }
1801
#elif PCRE2_CODE_UNIT_WIDTH != 8
1802
      SET_BIT(0xFF);                             /* For characters >= 255 */
1803
#endif
1804
      /* Fall through */
1805
1806
      /* Enter here for a positive non-XCLASS. If we have fallen through from
1807
      an XCLASS, classmap will already be set; just advance the code pointer.
1808
      Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1809
1810
36.4k
      case OP_CLASS:
1811
36.4k
      if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1812
35.7k
        {
1813
35.7k
        classmap = (const uint8_t *)(++tcode);
1814
35.7k
        tcode += 32 / sizeof(PCRE2_UCHAR);
1815
35.7k
        }
1816
1817
      /* When wide characters are supported, classmap may be NULL. In UTF-8
1818
      (sic) mode, the bits in a class bit map correspond to character values,
1819
      not to byte values. However, the bit map we are constructing is for byte
1820
      values. So we have to do a conversion for characters whose code point is
1821
      greater than 127. In fact, there are only two possible starting bytes for
1822
      characters in the range 128 - 255. */
1823
1824
36.4k
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1825
39.5k
      HANDLE_CLASSMAP:
1826
39.5k
#endif
1827
39.5k
      if (classmap != NULL)
1828
39.0k
        {
1829
39.0k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1830
39.0k
        if (utf)
1831
13.0k
          {
1832
221k
          for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1833
983k
          for (c = 128; c < 256; c++)
1834
970k
            {
1835
970k
            if ((classmap[c/8] & (1u << (c&7))) != 0)
1836
11.1k
              {
1837
11.1k
              int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1838
11.1k
              re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1839
11.1k
              c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1840
11.1k
              }
1841
970k
            }
1842
13.0k
          }
1843
26.0k
        else
1844
26.0k
#endif
1845
        /* In all modes except UTF-8, the two bit maps are compatible. */
1846
1847
26.0k
          {
1848
860k
          for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1849
26.0k
          }
1850
39.0k
        }
1851
1852
      /* Act on what follows the class. For a zero minimum repeat, continue;
1853
      otherwise stop processing. */
1854
1855
39.5k
      switch (*tcode)
1856
39.5k
        {
1857
4.76k
        case OP_CRSTAR:
1858
6.50k
        case OP_CRMINSTAR:
1859
8.49k
        case OP_CRQUERY:
1860
9.95k
        case OP_CRMINQUERY:
1861
11.7k
        case OP_CRPOSSTAR:
1862
12.7k
        case OP_CRPOSQUERY:
1863
12.7k
        tcode++;
1864
12.7k
        break;
1865
1866
1.30k
        case OP_CRRANGE:
1867
1.64k
        case OP_CRMINRANGE:
1868
6.57k
        case OP_CRPOSRANGE:
1869
6.57k
        if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1870
1.96k
          else try_next = FALSE;
1871
6.57k
        break;
1872
1873
20.1k
        default:
1874
20.1k
        try_next = FALSE;
1875
20.1k
        break;
1876
39.5k
        }
1877
39.5k
      break; /* End of class handling case */
1878
1.35M
      }      /* End of switch for opcodes */
1879
1.35M
    }        /* End of try_next loop */
1880
1881
713k
  code += GET(code, 1);   /* Advance to next branch */
1882
713k
  }
1883
713k
while (*code == OP_ALT);
1884
1885
96.1k
return yield;
1886
208k
}
1887
1888
1889
1890
/*************************************************
1891
*          Study a compiled expression           *
1892
*************************************************/
1893
1894
/* This function is handed a compiled expression that it must study to produce
1895
information that will speed up the matching.
1896
1897
Argument:
1898
  re       points to the compiled expression
1899
1900
Returns:   0 normally; non-zero should never normally occur
1901
           1 unknown opcode in set_start_bits
1902
           2 missing capturing bracket
1903
           3 unknown opcode in find_minlength
1904
*/
1905
1906
int
1907
PRIV(study)(pcre2_real_code *re)
1908
54.5k
{
1909
54.5k
int count = 0;
1910
54.5k
PCRE2_UCHAR *code;
1911
54.5k
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1912
54.5k
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1913
1914
/* Find start of compiled code */
1915
1916
54.5k
code = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
1917
1918
/* For a pattern that has a first code unit, or a multiline pattern that
1919
matches only at "line start", there is no point in seeking a list of starting
1920
code units. */
1921
1922
54.5k
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1923
34.9k
  {
1924
34.9k
  int depth = 0;
1925
34.9k
  int rc = set_start_bits(re, code, utf, ucp, &depth);
1926
34.9k
  if (rc == SSB_UNKNOWN)
1927
0
    {
1928
0
    PCRE2_DEBUG_UNREACHABLE();
1929
0
    return 1;
1930
0
    }
1931
1932
  /* If a list of starting code units was set up, scan the list to see if only
1933
  one or two were listed. Having only one listed is rare because usually a
1934
  single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1935
  If two are listed, see if they are caseless versions of the same character;
1936
  if so we can replace the list with a caseless first code unit. This gives
1937
  better performance and is plausibly worth doing for patterns such as [Ww]ord
1938
  or (word|WORD). */
1939
1940
34.9k
  if (rc == SSB_DONE)
1941
11.7k
    {
1942
11.7k
    int i;
1943
11.7k
    int a = -1;
1944
11.7k
    int b = -1;
1945
11.7k
    uint8_t *p = re->start_bitmap;
1946
11.7k
    uint32_t flags = PCRE2_FIRSTMAPSET;
1947
1948
92.5k
    for (i = 0; i < 256; p++, i += 8)
1949
92.2k
      {
1950
92.2k
      uint8_t x = *p;
1951
92.2k
      if (x != 0)
1952
17.3k
        {
1953
17.3k
        int c;
1954
17.3k
        uint8_t y = x & (~x + 1);   /* Least significant bit */
1955
17.3k
        if (y != x) goto DONE;      /* More than one bit set */
1956
1957
        /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1958
        all wide characters", so we cannot use it here. */
1959
1960
#if PCRE2_CODE_UNIT_WIDTH != 8
1961
        if (i == 248 && x == 0x80) goto DONE;
1962
#endif
1963
1964
        /* Compute the character value */
1965
1966
10.7k
        c = i;
1967
10.7k
        switch (x)
1968
10.7k
          {
1969
3.07k
          case 1:   break;
1970
1.72k
          case 2:   c += 1; break;  case 4:  c += 2; break;
1971
1.28k
          case 8:   c += 3; break;  case 16: c += 4; break;
1972
1.09k
          case 32:  c += 5; break;  case 64: c += 6; break;
1973
798
          case 128: c += 7; break;
1974
10.7k
          }
1975
1976
        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1977
        mode, only values < 128 can be used. In all the other cases, c is a
1978
        character value. */
1979
1980
10.7k
#if PCRE2_CODE_UNIT_WIDTH == 8
1981
10.7k
        if (utf && c > 127) goto DONE;
1982
10.4k
#endif
1983
10.4k
        if (a < 0) a = c;   /* First one found, save in a */
1984
4.87k
        else if (b < 0)     /* Second one found */
1985
4.78k
          {
1986
4.78k
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1987
1988
4.78k
#ifdef SUPPORT_UNICODE
1989
4.78k
          if (utf || ucp)
1990
1.35k
            {
1991
1.35k
            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
1992
1.16k
            if (c > 127) d = UCD_OTHERCASE(c);
1993
1.16k
            }
1994
4.59k
#endif  /* SUPPORT_UNICODE */
1995
1996
4.59k
          if (d != a) goto DONE;   /* Not the other case of a */
1997
221
          b = c;                   /* Save second in b */
1998
1999
#ifdef EBCDIC
2000
          /* To match ASCII (which puts the uppercase one in a), swap a & b
2001
          if needed. This doesn't really matter, but neatens the tests. */
2002
          if (TABLE_GET((unsigned int)a, re->tables + lcc_offset, a) == a)
2003
            {
2004
            b = a;
2005
            a = c;
2006
            }
2007
#endif
2008
221
          }
2009
92
        else goto DONE;   /* More than two characters found */
2010
10.4k
        }
2011
92.2k
      }
2012
2013
    /* Replace the start code unit bits with a first code unit. If it is the
2014
    same as a required later code unit, then clear the required later code
2015
    unit. This is because a search for a required code unit starts after an
2016
    explicit first code unit, but at a code unit found from the bitmap.
2017
    Patterns such as /a*a/ don't work if both the start unit and required
2018
    unit are the same. */
2019
2020
286
    if (a >= 0) {
2021
268
      if ((re->flags & PCRE2_LASTSET) && (re->last_codeunit == (uint32_t)a || (b >= 0 && re->last_codeunit == (uint32_t)b))) {
2022
100
        re->flags &= ~(PCRE2_LASTSET | PCRE2_LASTCASELESS);
2023
100
        re->last_codeunit = 0;
2024
100
      }
2025
268
      re->first_codeunit = a;
2026
268
      flags = PCRE2_FIRSTSET;
2027
268
      if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
2028
268
    }
2029
2030
11.7k
    DONE:
2031
11.7k
    re->flags |= flags;
2032
11.7k
    }
2033
34.9k
  }
2034
2035
/* Find the minimum length of subject string. If the pattern can match an empty
2036
string, the minimum length is already known. If the pattern contains (*ACCEPT)
2037
all bets are off, and we don't even try to find a minimum length. If there are
2038
more back references than the size of the vector we are going to cache them in,
2039
do nothing. A pattern that complicated will probably take a long time to
2040
analyze and may in any case turn out to be too complicated. Note that back
2041
reference minima are held as 16-bit numbers. */
2042
2043
54.5k
if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
2044
54.5k
     re->top_backref <= MAX_CACHE_BACKREF)
2045
44.2k
  {
2046
44.2k
  int min;
2047
44.2k
  int backref_cache[MAX_CACHE_BACKREF+1];
2048
44.2k
  backref_cache[0] = 0;    /* Highest one that is set */
2049
44.2k
  min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
2050
44.2k
  switch(min)
2051
44.2k
    {
2052
131
    case -1:  /* \C in UTF mode or over-complex regex */
2053
131
    break;    /* Leave minlength unchanged (will be zero) */
2054
2055
0
    case -2:
2056
0
    PCRE2_DEBUG_UNREACHABLE();
2057
0
    return 2; /* missing capturing bracket */
2058
2059
0
    case -3:
2060
0
    PCRE2_DEBUG_UNREACHABLE();
2061
0
    return 3; /* unrecognized opcode */
2062
2063
44.1k
    default:
2064
44.1k
    re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
2065
44.1k
    break;
2066
44.2k
    }
2067
44.2k
  }
2068
2069
54.5k
return 0;
2070
54.5k
}
2071
2072
/* End of pcre2_study.c */