Coverage Report

Created: 2026-03-31 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/pcre2-10.39/src/pcre2_study.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2020 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains functions for scanning a compiled pattern and
42
collecting data (e.g. minimum matching length). */
43
44
45
#ifdef HAVE_CONFIG_H
46
#include "config.h"
47
#endif
48
49
#include "pcre2_internal.h"
50
51
/* The maximum remembered capturing brackets minimum. */
52
53
921k
#define MAX_CACHE_BACKREF 128
54
55
/* Set a bit in the starting code unit bit map. */
56
57
1.98M
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58
59
/* Returns from set_start_bits() */
60
61
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62
63
64
/*************************************************
65
*   Find the minimum subject length for a group  *
66
*************************************************/
67
68
/* Scan a parenthesized group and compute the minimum length of subject that
69
is needed to match it. This is a lower bound; it does not mean there is a
70
string of that length that matches. In UTF mode, the result is in characters
71
rather than code units. The field in a compiled pattern for storing the minimum
72
length is 16-bits long (on the grounds that anything longer than that is
73
pathological), so we give up when we reach that amount. This also means that
74
integer overflow for really crazy patterns cannot happen.
75
76
Backreference minimum lengths are cached to speed up multiple references. This
77
function is called only when the highest back reference in the pattern is less
78
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79
caching vector. The zeroth element contains the number of the highest set
80
value.
81
82
Arguments:
83
  re              compiled pattern block
84
  code            pointer to start of group (the bracket)
85
  startcode       pointer to start of the whole pattern's code
86
  utf             UTF flag
87
  recurses        chain of recurse_check to catch mutual recursion
88
  countptr        pointer to call count (to catch over complexity)
89
  backref_cache   vector for caching back references.
90
91
This function is no longer called when the pattern contains (*ACCEPT); however,
92
the old code for returning -1 is retained, just in case.
93
94
Returns:   the minimum length
95
           -1 \C in UTF-8 mode
96
              or (*ACCEPT)
97
              or pattern too complicated
98
           -2 internal error (missing capturing bracket)
99
           -3 internal error (opcode not listed)
100
*/
101
102
static int
103
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104
  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105
  int *backref_cache)
106
8.89M
{
107
8.89M
int length = -1;
108
8.89M
int branchlength = 0;
109
8.89M
int prev_cap_recno = -1;
110
8.89M
int prev_cap_d = 0;
111
8.89M
int prev_recurse_recno = -1;
112
8.89M
int prev_recurse_d = 0;
113
8.89M
uint32_t once_fudge = 0;
114
8.89M
BOOL had_recurse = FALSE;
115
8.89M
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116
8.89M
PCRE2_SPTR nextbranch = code + GET(code, 1);
117
8.89M
PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
118
8.89M
recurse_check this_recurse;
119
120
/* If this is a "could be empty" group, its minimum length is 0. */
121
122
8.89M
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123
124
/* Skip over capturing bracket number */
125
126
8.84M
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127
128
/* A large and/or complex regex can take too long to process. */
129
130
8.84M
if ((*countptr)++ > 1000) return -1;
131
132
/* Scan along the opcodes for this branch. If we get to the end of the branch,
133
check the length against that of the other branches. If the accumulated length
134
passes 16-bits, reset to that value and skip the rest of the branch. */
135
136
8.84M
for (;;)
137
73.7M
  {
138
73.7M
  int d, min, recno;
139
73.7M
  PCRE2_UCHAR op, *cs, *ce;
140
141
73.7M
  if (branchlength >= UINT16_MAX)
142
3.81k
    {
143
3.81k
    branchlength = UINT16_MAX;
144
3.81k
    cc = (PCRE2_UCHAR *)nextbranch;
145
3.81k
    }
146
147
73.7M
  op = *cc;
148
73.7M
  switch (op)
149
73.7M
    {
150
24.4k
    case OP_COND:
151
28.2k
    case OP_SCOND:
152
153
    /* If there is only one branch in a condition, the implied branch has zero
154
    length, so we don't add anything. This covers the DEFINE "condition"
155
    automatically. If there are two branches we can treat it the same as any
156
    other non-capturing subpattern. */
157
158
28.2k
    cs = cc + GET(cc, 1);
159
28.2k
    if (*cs != OP_ALT)
160
15.4k
      {
161
15.4k
      cc = cs + 1 + LINK_SIZE;
162
15.4k
      break;
163
15.4k
      }
164
12.7k
    goto PROCESS_NON_CAPTURE;
165
166
5.79M
    case OP_BRA:
167
    /* There's a special case of OP_BRA, when it is wrapped round a repeated
168
    OP_RECURSE. We'd like to process the latter at this level so that
169
    remembering the value works for repeated cases. So we do nothing, but
170
    set a fudge value to skip over the OP_KET after the recurse. */
171
172
5.79M
    if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173
4.90k
      {
174
4.90k
      once_fudge = 1 + LINK_SIZE;
175
4.90k
      cc += 1 + LINK_SIZE;
176
4.90k
      break;
177
4.90k
      }
178
    /* Fall through */
179
180
5.86M
    case OP_ONCE:
181
5.87M
    case OP_SCRIPT_RUN:
182
5.88M
    case OP_SBRA:
183
5.89M
    case OP_BRAPOS:
184
5.90M
    case OP_SBRAPOS:
185
5.91M
    PROCESS_NON_CAPTURE:
186
5.91M
    d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187
5.91M
      backref_cache);
188
5.91M
    if (d < 0) return d;
189
5.91M
    branchlength += d;
190
8.11M
    do cc += GET(cc, 1); while (*cc == OP_ALT);
191
5.91M
    cc += 1 + LINK_SIZE;
192
5.91M
    break;
193
194
    /* To save time for repeated capturing subpatterns, we remember the
195
    length of the previous one. Unfortunately we can't do the same for
196
    the unnumbered ones above. Nor can we do this if (?| is present in the
197
    pattern because captures with the same number are not then identical. */
198
199
2.06M
    case OP_CBRA:
200
2.08M
    case OP_SCBRA:
201
2.09M
    case OP_CBRAPOS:
202
2.09M
    case OP_SCBRAPOS:
203
2.09M
    recno = (int)GET2(cc, 1+LINK_SIZE);
204
2.09M
    if (dupcapused || recno != prev_cap_recno)
205
1.81M
      {
206
1.81M
      prev_cap_recno = recno;
207
1.81M
      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208
1.81M
        backref_cache);
209
1.81M
      if (prev_cap_d < 0) return prev_cap_d;
210
1.81M
      }
211
2.09M
    branchlength += prev_cap_d;
212
2.63M
    do cc += GET(cc, 1); while (*cc == OP_ALT);
213
2.09M
    cc += 1 + LINK_SIZE;
214
2.09M
    break;
215
216
    /* ACCEPT makes things far too complicated; we have to give up. In fact,
217
    from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218
    used. However, leave the code in place, just in case. */
219
220
0
    case OP_ACCEPT:
221
0
    case OP_ASSERT_ACCEPT:
222
0
    return -1;
223
224
    /* Reached end of a branch; if it's a ket it is the end of a nested
225
    call. If it's ALT it is an alternation in a nested call. If it is END it's
226
    the end of the outer call. All can be handled by the same code. If the
227
    length of any branch is zero, there is no need to scan any subsequent
228
    branches. */
229
230
1.59M
    case OP_ALT:
231
9.83M
    case OP_KET:
232
9.86M
    case OP_KETRMAX:
233
9.87M
    case OP_KETRMIN:
234
9.88M
    case OP_KETRPOS:
235
9.88M
    case OP_END:
236
9.88M
    if (length < 0 || (!had_recurse && branchlength < length))
237
9.64M
      length = branchlength;
238
9.88M
    if (op != OP_ALT || length == 0) return length;
239
1.05M
    nextbranch = cc + GET(cc, 1);
240
1.05M
    cc += 1 + LINK_SIZE;
241
1.05M
    branchlength = 0;
242
1.05M
    had_recurse = FALSE;
243
1.05M
    break;
244
245
    /* Skip over assertive subpatterns */
246
247
200k
    case OP_ASSERT:
248
212k
    case OP_ASSERT_NOT:
249
222k
    case OP_ASSERTBACK:
250
271k
    case OP_ASSERTBACK_NOT:
251
411k
    case OP_ASSERT_NA:
252
418k
    case OP_ASSERTBACK_NA:
253
1.80M
    do cc += GET(cc, 1); while (*cc == OP_ALT);
254
    /* Fall through */
255
256
    /* Skip over things that don't match chars */
257
258
418k
    case OP_REVERSE:
259
422k
    case OP_CREF:
260
427k
    case OP_DNCREF:
261
429k
    case OP_RREF:
262
430k
    case OP_DNRREF:
263
430k
    case OP_FALSE:
264
430k
    case OP_TRUE:
265
435k
    case OP_CALLOUT:
266
456k
    case OP_SOD:
267
461k
    case OP_SOM:
268
481k
    case OP_EOD:
269
488k
    case OP_EODN:
270
1.60M
    case OP_CIRC:
271
1.63M
    case OP_CIRCM:
272
2.06M
    case OP_DOLL:
273
2.07M
    case OP_DOLLM:
274
2.07M
    case OP_NOT_WORD_BOUNDARY:
275
2.08M
    case OP_WORD_BOUNDARY:
276
2.08M
    cc += PRIV(OP_lengths)[*cc];
277
2.08M
    break;
278
279
10.1k
    case OP_CALLOUT_STR:
280
10.1k
    cc += GET(cc, 1 + 2*LINK_SIZE);
281
10.1k
    break;
282
283
    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
284
285
107k
    case OP_BRAZERO:
286
119k
    case OP_BRAMINZERO:
287
125k
    case OP_BRAPOSZERO:
288
131k
    case OP_SKIPZERO:
289
131k
    cc += PRIV(OP_lengths)[*cc];
290
227k
    do cc += GET(cc, 1); while (*cc == OP_ALT);
291
131k
    cc += 1 + LINK_SIZE;
292
131k
    break;
293
294
    /* Handle literal characters and + repetitions */
295
296
31.3M
    case OP_CHAR:
297
40.7M
    case OP_CHARI:
298
40.7M
    case OP_NOT:
299
40.7M
    case OP_NOTI:
300
41.1M
    case OP_PLUS:
301
41.2M
    case OP_PLUSI:
302
41.2M
    case OP_MINPLUS:
303
41.2M
    case OP_MINPLUSI:
304
41.7M
    case OP_POSPLUS:
305
41.8M
    case OP_POSPLUSI:
306
41.8M
    case OP_NOTPLUS:
307
41.8M
    case OP_NOTPLUSI:
308
41.8M
    case OP_NOTMINPLUS:
309
41.8M
    case OP_NOTMINPLUSI:
310
41.8M
    case OP_NOTPOSPLUS:
311
41.8M
    case OP_NOTPOSPLUSI:
312
41.8M
    branchlength++;
313
41.8M
    cc += 2;
314
41.8M
#ifdef SUPPORT_UNICODE
315
41.8M
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
316
41.8M
#endif
317
41.8M
    break;
318
319
569k
    case OP_TYPEPLUS:
320
580k
    case OP_TYPEMINPLUS:
321
1.18M
    case OP_TYPEPOSPLUS:
322
1.18M
    branchlength++;
323
1.18M
    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
324
1.18M
    break;
325
326
    /* Handle exact repetitions. The count is already in characters, but we
327
    may need to skip over a multibyte character in UTF mode.  */
328
329
228k
    case OP_EXACT:
330
238k
    case OP_EXACTI:
331
244k
    case OP_NOTEXACT:
332
250k
    case OP_NOTEXACTI:
333
250k
    branchlength += GET2(cc,1);
334
250k
    cc += 2 + IMM2_SIZE;
335
250k
#ifdef SUPPORT_UNICODE
336
250k
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
337
250k
#endif
338
250k
    break;
339
340
41.9k
    case OP_TYPEEXACT:
341
41.9k
    branchlength += GET2(cc,1);
342
41.9k
    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
343
35.9k
      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
344
41.9k
    break;
345
346
    /* Handle single-char non-literal matchers */
347
348
206k
    case OP_PROP:
349
481k
    case OP_NOTPROP:
350
481k
    cc += 2;
351
    /* Fall through */
352
353
492k
    case OP_NOT_DIGIT:
354
524k
    case OP_DIGIT:
355
552k
    case OP_NOT_WHITESPACE:
356
1.10M
    case OP_WHITESPACE:
357
1.12M
    case OP_NOT_WORDCHAR:
358
1.15M
    case OP_WORDCHAR:
359
1.59M
    case OP_ANY:
360
1.67M
    case OP_ALLANY:
361
1.69M
    case OP_EXTUNI:
362
1.72M
    case OP_HSPACE:
363
1.75M
    case OP_NOT_HSPACE:
364
1.78M
    case OP_VSPACE:
365
1.80M
    case OP_NOT_VSPACE:
366
1.80M
    branchlength++;
367
1.80M
    cc++;
368
1.80M
    break;
369
370
    /* "Any newline" might match two characters, but it also might match just
371
    one. */
372
373
15.0k
    case OP_ANYNL:
374
15.0k
    branchlength += 1;
375
15.0k
    cc++;
376
15.0k
    break;
377
378
    /* The single-byte matcher means we can't proceed in UTF mode. (In
379
    non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
380
    appear, but leave the code, just in case.) */
381
382
1.99k
    case OP_ANYBYTE:
383
1.99k
#ifdef SUPPORT_UNICODE
384
1.99k
    if (utf) return -1;
385
0
#endif
386
0
    branchlength++;
387
0
    cc++;
388
0
    break;
389
390
    /* For repeated character types, we have to test for \p and \P, which have
391
    an extra two bytes of parameters. */
392
393
590k
    case OP_TYPESTAR:
394
610k
    case OP_TYPEMINSTAR:
395
668k
    case OP_TYPEQUERY:
396
675k
    case OP_TYPEMINQUERY:
397
2.18M
    case OP_TYPEPOSSTAR:
398
2.20M
    case OP_TYPEPOSQUERY:
399
2.20M
    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
400
2.20M
    cc += PRIV(OP_lengths)[op];
401
2.20M
    break;
402
403
10.8k
    case OP_TYPEUPTO:
404
16.1k
    case OP_TYPEMINUPTO:
405
26.6k
    case OP_TYPEPOSUPTO:
406
26.6k
    if (cc[1 + IMM2_SIZE] == OP_PROP
407
19.5k
      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
408
26.6k
    cc += PRIV(OP_lengths)[op];
409
26.6k
    break;
410
411
    /* Check a class for variable quantification */
412
413
4.12M
    case OP_CLASS:
414
4.18M
    case OP_NCLASS:
415
4.18M
#ifdef SUPPORT_WIDE_CHARS
416
4.47M
    case OP_XCLASS:
417
    /* The original code caused an unsigned overflow in 64 bit systems,
418
    so now we use a conditional statement. */
419
4.47M
    if (op == OP_XCLASS)
420
292k
      cc += GET(cc, 1);
421
4.18M
    else
422
4.18M
      cc += PRIV(OP_lengths)[OP_CLASS];
423
#else
424
    cc += PRIV(OP_lengths)[OP_CLASS];
425
#endif
426
427
4.47M
    switch (*cc)
428
4.47M
      {
429
169k
      case OP_CRPLUS:
430
298k
      case OP_CRMINPLUS:
431
557k
      case OP_CRPOSPLUS:
432
557k
      branchlength++;
433
      /* Fall through */
434
435
649k
      case OP_CRSTAR:
436
661k
      case OP_CRMINSTAR:
437
688k
      case OP_CRQUERY:
438
696k
      case OP_CRMINQUERY:
439
1.03M
      case OP_CRPOSSTAR:
440
1.04M
      case OP_CRPOSQUERY:
441
1.04M
      cc++;
442
1.04M
      break;
443
444
372k
      case OP_CRRANGE:
445
378k
      case OP_CRMINRANGE:
446
456k
      case OP_CRPOSRANGE:
447
456k
      branchlength += GET2(cc,1);
448
456k
      cc += 1 + 2 * IMM2_SIZE;
449
456k
      break;
450
451
2.97M
      default:
452
2.97M
      branchlength++;
453
2.97M
      break;
454
4.47M
      }
455
4.47M
    break;
456
457
    /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
458
    way: we find the minimum length for the subpattern. A recursion
459
    (backreference or subroutine) causes an a flag to be set that causes the
460
    length of this branch to be ignored. The logic is that a recursion can only
461
    make sense if there is another alternative that stops the recursing. That
462
    will provide the minimum length (when no recursion happens).
463
464
    If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
465
    matches an empty string (by default it causes a matching failure), so in
466
    that case we must set the minimum length to zero.
467
468
    For backreferenes, if duplicate numbers are present in the pattern we check
469
    for a reference to a duplicate. If it is, we don't know which version will
470
    be referenced, so we have to set the minimum length to zero. */
471
472
    /* Duplicate named pattern back reference. */
473
474
4.47M
    case OP_DNREF:
475
16.0k
    case OP_DNREFI:
476
16.0k
    if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
477
14.9k
      {
478
14.9k
      int count = GET2(cc, 1+IMM2_SIZE);
479
14.9k
      PCRE2_UCHAR *slot =
480
14.9k
        (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
481
14.9k
          GET2(cc, 1) * re->name_entry_size;
482
483
14.9k
      d = INT_MAX;
484
485
      /* Scan all groups with the same name; find the shortest. */
486
487
79.6k
      while (count-- > 0)
488
72.8k
        {
489
72.8k
        int dd, i;
490
72.8k
        recno = GET2(slot, 0);
491
492
72.8k
        if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
493
52.9k
          dd = backref_cache[recno];
494
19.8k
        else
495
19.8k
          {
496
19.8k
          ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
497
19.8k
          if (cs == NULL) return -2;
498
21.7k
          do ce += GET(ce, 1); while (*ce == OP_ALT);
499
500
19.8k
          dd = 0;
501
19.8k
          if (!dupcapused ||
502
0
              (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
503
19.8k
            {
504
19.8k
            if (cc > cs && cc < ce)    /* Simple recursion */
505
2.54k
              {
506
2.54k
              had_recurse = TRUE;
507
2.54k
              }
508
17.3k
            else
509
17.3k
              {
510
17.3k
              recurse_check *r = recurses;
511
38.4k
              for (r = recurses; r != NULL; r = r->prev)
512
22.2k
                if (r->group == cs) break;
513
17.3k
              if (r != NULL)           /* Mutual recursion */
514
1.13k
                {
515
1.13k
                had_recurse = TRUE;
516
1.13k
                }
517
16.1k
              else
518
16.1k
                {
519
16.1k
                this_recurse.prev = recurses;  /* No recursion */
520
16.1k
                this_recurse.group = cs;
521
16.1k
                dd = find_minlength(re, cs, startcode, utf, &this_recurse,
522
16.1k
                  countptr, backref_cache);
523
16.1k
                if (dd < 0) return dd;
524
16.1k
                }
525
17.3k
              }
526
19.8k
            }
527
528
19.8k
          backref_cache[recno] = dd;
529
50.9k
          for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
530
19.8k
          backref_cache[0] = recno;
531
19.8k
          }
532
533
72.8k
        if (dd < d) d = dd;
534
72.8k
        if (d <= 0) break;    /* No point looking at any more */
535
64.7k
        slot += re->name_entry_size;
536
64.7k
        }
537
14.9k
      }
538
1.15k
    else d = 0;
539
16.0k
    cc += 1 + 2*IMM2_SIZE;
540
16.0k
    goto REPEAT_BACK_REFERENCE;
541
542
    /* Single back reference by number. References by name are converted to by
543
    number when there is no duplication. */
544
545
226k
    case OP_REF:
546
313k
    case OP_REFI:
547
313k
    recno = GET2(cc, 1);
548
313k
    if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
549
126k
      d = backref_cache[recno];
550
187k
    else
551
187k
      {
552
187k
      int i;
553
187k
      d = 0;
554
555
187k
      if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
556
187k
        {
557
187k
        ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
558
187k
        if (cs == NULL) return -2;
559
240k
        do ce += GET(ce, 1); while (*ce == OP_ALT);
560
561
187k
        if (!dupcapused ||
562
14.9k
            (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
563
185k
          {
564
185k
          if (cc > cs && cc < ce)    /* Simple recursion */
565
8.19k
            {
566
8.19k
            had_recurse = TRUE;
567
8.19k
            }
568
177k
          else
569
177k
            {
570
177k
            recurse_check *r = recurses;
571
201k
            for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
572
177k
            if (r != NULL)           /* Mutual recursion */
573
2.14k
              {
574
2.14k
              had_recurse = TRUE;
575
2.14k
              }
576
175k
            else                     /* No recursion */
577
175k
              {
578
175k
              this_recurse.prev = recurses;
579
175k
              this_recurse.group = cs;
580
175k
              d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
581
175k
                backref_cache);
582
175k
              if (d < 0) return d;
583
175k
              }
584
177k
            }
585
185k
          }
586
187k
        }
587
588
186k
      backref_cache[recno] = d;
589
370k
      for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
590
186k
      backref_cache[0] = recno;
591
186k
      }
592
593
313k
    cc += 1 + IMM2_SIZE;
594
595
    /* Handle repeated back references */
596
597
329k
    REPEAT_BACK_REFERENCE:
598
329k
    switch (*cc)
599
329k
      {
600
5.21k
      case OP_CRSTAR:
601
7.39k
      case OP_CRMINSTAR:
602
10.7k
      case OP_CRQUERY:
603
13.1k
      case OP_CRMINQUERY:
604
13.1k
      case OP_CRPOSSTAR:
605
13.1k
      case OP_CRPOSQUERY:
606
13.1k
      min = 0;
607
13.1k
      cc++;
608
13.1k
      break;
609
610
24.4k
      case OP_CRPLUS:
611
28.7k
      case OP_CRMINPLUS:
612
28.7k
      case OP_CRPOSPLUS:
613
28.7k
      min = 1;
614
28.7k
      cc++;
615
28.7k
      break;
616
617
1.93k
      case OP_CRRANGE:
618
6.57k
      case OP_CRMINRANGE:
619
6.57k
      case OP_CRPOSRANGE:
620
6.57k
      min = GET2(cc, 1);
621
6.57k
      cc += 1 + 2 * IMM2_SIZE;
622
6.57k
      break;
623
624
280k
      default:
625
280k
      min = 1;
626
280k
      break;
627
329k
      }
628
629
     /* Take care not to overflow: (1) min and d are ints, so check that their
630
     product is not greater than INT_MAX. (2) branchlength is limited to
631
     UINT16_MAX (checked at the top of the loop). */
632
633
329k
    if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
634
1.46k
      branchlength = UINT16_MAX;
635
327k
    else branchlength += min * d;
636
329k
    break;
637
638
    /* Recursion always refers to the first occurrence of a subpattern with a
639
    given number. Therefore, we can always make use of caching, even when the
640
    pattern contains multiple subpatterns with the same number. */
641
642
250k
    case OP_RECURSE:
643
250k
    cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
644
250k
    recno = GET2(cs, 1+LINK_SIZE);
645
250k
    if (recno == prev_recurse_recno)
646
43.8k
      {
647
43.8k
      branchlength += prev_recurse_d;
648
43.8k
      }
649
206k
    else
650
206k
      {
651
229k
      do ce += GET(ce, 1); while (*ce == OP_ALT);
652
206k
      if (cc > cs && cc < ce)    /* Simple recursion */
653
133k
        had_recurse = TRUE;
654
72.8k
      else
655
72.8k
        {
656
72.8k
        recurse_check *r = recurses;
657
501k
        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
658
72.8k
        if (r != NULL)          /* Mutual recursion */
659
22.3k
          had_recurse = TRUE;
660
50.5k
        else
661
50.5k
          {
662
50.5k
          this_recurse.prev = recurses;
663
50.5k
          this_recurse.group = cs;
664
50.5k
          prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
665
50.5k
            countptr, backref_cache);
666
50.5k
          if (prev_recurse_d < 0) return prev_recurse_d;
667
49.9k
          prev_recurse_recno = recno;
668
49.9k
          branchlength += prev_recurse_d;
669
49.9k
          }
670
72.8k
        }
671
206k
      }
672
249k
    cc += 1 + LINK_SIZE + once_fudge;
673
249k
    once_fudge = 0;
674
249k
    break;
675
676
    /* Anything else does not or need not match a character. We can get the
677
    item's length from the table, but for those that can match zero occurrences
678
    of a character, we must take special action for UTF-8 characters. As it
679
    happens, the "NOT" versions of these opcodes are used at present only for
680
    ASCII characters, so they could be omitted from this list. However, in
681
    future that may change, so we include them here so as not to leave a
682
    gotcha for a future maintainer. */
683
684
5.71k
    case OP_UPTO:
685
12.5k
    case OP_UPTOI:
686
15.8k
    case OP_NOTUPTO:
687
19.1k
    case OP_NOTUPTOI:
688
23.9k
    case OP_MINUPTO:
689
25.6k
    case OP_MINUPTOI:
690
31.1k
    case OP_NOTMINUPTO:
691
32.3k
    case OP_NOTMINUPTOI:
692
45.0k
    case OP_POSUPTO:
693
51.0k
    case OP_POSUPTOI:
694
55.3k
    case OP_NOTPOSUPTO:
695
58.0k
    case OP_NOTPOSUPTOI:
696
697
227k
    case OP_STAR:
698
278k
    case OP_STARI:
699
281k
    case OP_NOTSTAR:
700
288k
    case OP_NOTSTARI:
701
317k
    case OP_MINSTAR:
702
330k
    case OP_MINSTARI:
703
332k
    case OP_NOTMINSTAR:
704
335k
    case OP_NOTMINSTARI:
705
727k
    case OP_POSSTAR:
706
829k
    case OP_POSSTARI:
707
832k
    case OP_NOTPOSSTAR:
708
834k
    case OP_NOTPOSSTARI:
709
710
871k
    case OP_QUERY:
711
902k
    case OP_QUERYI:
712
906k
    case OP_NOTQUERY:
713
911k
    case OP_NOTQUERYI:
714
920k
    case OP_MINQUERY:
715
926k
    case OP_MINQUERYI:
716
928k
    case OP_NOTMINQUERY:
717
930k
    case OP_NOTMINQUERYI:
718
1.03M
    case OP_POSQUERY:
719
1.08M
    case OP_POSQUERYI:
720
1.08M
    case OP_NOTPOSQUERY:
721
1.09M
    case OP_NOTPOSQUERYI:
722
723
1.09M
    cc += PRIV(OP_lengths)[op];
724
1.09M
#ifdef SUPPORT_UNICODE
725
1.09M
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
726
1.09M
#endif
727
1.09M
    break;
728
729
    /* Skip these, but we need to add in the name length. */
730
731
7.93k
    case OP_MARK:
732
7.93k
    case OP_COMMIT_ARG:
733
7.93k
    case OP_PRUNE_ARG:
734
7.93k
    case OP_SKIP_ARG:
735
7.93k
    case OP_THEN_ARG:
736
7.93k
    cc += PRIV(OP_lengths)[op] + cc[1];
737
7.93k
    break;
738
739
    /* The remaining opcodes are just skipped over. */
740
741
0
    case OP_CLOSE:
742
0
    case OP_COMMIT:
743
6.90k
    case OP_FAIL:
744
6.90k
    case OP_PRUNE:
745
11.8k
    case OP_SET_SOM:
746
11.8k
    case OP_SKIP:
747
11.8k
    case OP_THEN:
748
11.8k
    cc += PRIV(OP_lengths)[op];
749
11.8k
    break;
750
751
    /* This should not occur: we list all opcodes explicitly so that when
752
    new ones get added they are properly considered. */
753
754
0
    default:
755
0
    return -3;
756
73.7M
    }
757
73.7M
  }
758
/* Control never gets here */
759
8.84M
}
760
761
762
763
/*************************************************
764
*      Set a bit and maybe its alternate case    *
765
*************************************************/
766
767
/* Given a character, set its first code unit's bit in the table, and also the
768
corresponding bit for the other version of a letter if we are caseless.
769
770
Arguments:
771
  re            points to the regex block
772
  p             points to the first code unit of the character
773
  caseless      TRUE if caseless
774
  utf           TRUE for UTF mode
775
  ucp           TRUE for UCP mode
776
777
Returns:        pointer after the character
778
*/
779
780
static PCRE2_SPTR
781
set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
782
  BOOL ucp)
783
1.39M
{
784
1.39M
uint32_t c = *p++;   /* First code unit */
785
786
1.39M
(void)utf;           /* Stop compiler warnings when UTF not supported */
787
1.39M
(void)ucp;
788
789
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
790
0xff. */
791
792
#if PCRE2_CODE_UNIT_WIDTH != 8
793
if (c > 0xff) SET_BIT(0xff); else
794
#endif
795
796
1.39M
SET_BIT(c);
797
798
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
799
the end of the character, even when caseless. */
800
801
1.39M
#ifdef SUPPORT_UNICODE
802
1.39M
if (utf)
803
201k
  {
804
201k
#if PCRE2_CODE_UNIT_WIDTH == 8
805
201k
  if (c >= 0xc0) GETUTF8INC(c, p);
806
#elif PCRE2_CODE_UNIT_WIDTH == 16
807
  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
808
#endif
809
201k
  }
810
1.39M
#endif  /* SUPPORT_UNICODE */
811
812
/* If caseless, handle the other case of the character. */
813
814
1.39M
if (caseless)
815
275k
  {
816
275k
#ifdef SUPPORT_UNICODE
817
275k
  if (utf || ucp)
818
54.7k
    {
819
54.7k
    c = UCD_OTHERCASE(c);
820
54.7k
#if PCRE2_CODE_UNIT_WIDTH == 8
821
54.7k
    if (utf)
822
50.7k
      {
823
50.7k
      PCRE2_UCHAR buff[6];
824
50.7k
      (void)PRIV(ord2utf)(c, buff);
825
50.7k
      SET_BIT(buff[0]);
826
50.7k
      }
827
3.97k
    else if (c < 256) SET_BIT(c);
828
#else  /* 16-bit or 32-bit mode */
829
    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
830
#endif
831
54.7k
    }
832
833
220k
  else
834
220k
#endif  /* SUPPORT_UNICODE */
835
836
  /* Not UTF or UCP */
837
838
220k
  if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
839
275k
  }
840
841
1.39M
return p;
842
1.39M
}
843
844
845
846
/*************************************************
847
*     Set bits for a positive character type     *
848
*************************************************/
849
850
/* This function sets starting bits for a character type. In UTF-8 mode, we can
851
only do a direct setting for bytes less than 128, as otherwise there can be
852
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
853
environment, the tables will only recognize ASCII characters anyway, but in at
854
least one Windows environment, some higher bytes bits were set in the tables.
855
So we deal with that case by considering the UTF-8 encoding.
856
857
Arguments:
858
  re             the regex block
859
  cbit type      the type of character wanted
860
  table_limit    32 for non-UTF-8; 16 for UTF-8
861
862
Returns:         nothing
863
*/
864
865
static void
866
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
867
456k
{
868
456k
uint32_t c;
869
13.5M
for (c = 0; c < table_limit; c++)
870
13.1M
  re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
871
456k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
872
456k
if (table_limit == 32) return;
873
12.1M
for (c = 128; c < 256; c++)
874
12.0M
  {
875
12.0M
  if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
876
0
    {
877
0
    PCRE2_UCHAR buff[6];
878
0
    (void)PRIV(ord2utf)(c, buff);
879
0
    SET_BIT(buff[0]);
880
0
    }
881
12.0M
  }
882
94.2k
#endif  /* UTF-8 */
883
94.2k
}
884
885
886
/*************************************************
887
*     Set bits for a negative character type     *
888
*************************************************/
889
890
/* This function sets starting bits for a negative character type such as \D.
891
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
892
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
893
Unlike in the positive case, where we can set appropriate starting bits for
894
specific high-valued UTF-8 characters, in this case we have to set the bits for
895
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
896
0xc0 (192) for simplicity.
897
898
Arguments:
899
  re             the regex block
900
  cbit type      the type of character wanted
901
  table_limit    32 for non-UTF-8; 16 for UTF-8
902
903
Returns:         nothing
904
*/
905
906
static void
907
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
908
119k
{
909
119k
uint32_t c;
910
3.38M
for (c = 0; c < table_limit; c++)
911
3.26M
  re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
912
119k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
913
307k
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
914
119k
#endif
915
119k
}
916
917
918
919
/*************************************************
920
*      Create bitmap of starting code units      *
921
*************************************************/
922
923
/* This function scans a compiled unanchored expression recursively and
924
attempts to build a bitmap of the set of possible starting code units whose
925
values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
926
the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
927
we pass a value of 16 rather than 32 as the final argument. (See comments in
928
those functions for the reason.)
929
930
The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
931
(a*)b where the group provides some optional starting code units but scanning
932
must continue at the outer level to find at least one mandatory code unit. At
933
the outermost level, this function fails unless the result is SSB_DONE.
934
935
We restrict recursion (for nested groups) to 1000 to avoid stack overflow
936
issues.
937
938
Arguments:
939
  re           points to the compiled regex block
940
  code         points to an expression
941
  utf          TRUE if in UTF mode
942
  ucp          TRUE if in UCP mode
943
  depthptr     pointer to recurse depth
944
945
Returns:       SSB_FAIL     => Failed to find any starting code units
946
               SSB_DONE     => Found mandatory starting code units
947
               SSB_CONTINUE => Found optional starting code units
948
               SSB_UNKNOWN  => Hit an unrecognized opcode
949
               SSB_TOODEEP  => Recursion is too deep
950
*/
951
952
static int
953
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
954
  int *depthptr)
955
2.44M
{
956
2.44M
uint32_t c;
957
2.44M
int yield = SSB_DONE;
958
959
2.44M
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
960
2.44M
int table_limit = utf? 16:32;
961
#else
962
int table_limit = 32;
963
#endif
964
965
2.44M
*depthptr += 1;
966
2.44M
if (*depthptr > 1000) return SSB_TOODEEP;
967
968
2.44M
do
969
3.70M
  {
970
3.70M
  BOOL try_next = TRUE;
971
3.70M
  PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
972
973
3.70M
  if (*code == OP_CBRA || *code == OP_SCBRA ||
974
3.60M
      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
975
976
8.78M
  while (try_next)    /* Loop for items in this branch */
977
6.69M
    {
978
6.69M
    int rc;
979
6.69M
    uint8_t *classmap = NULL;
980
6.69M
#ifdef SUPPORT_WIDE_CHARS
981
6.69M
    PCRE2_UCHAR xclassflags;
982
6.69M
#endif
983
984
6.69M
    switch(*tcode)
985
6.69M
      {
986
      /* If we reach something we don't understand, it means a new opcode has
987
      been created that hasn't been added to this function. Hopefully this
988
      problem will be discovered during testing. */
989
990
0
      default:
991
0
      return SSB_UNKNOWN;
992
993
      /* Fail for a valid opcode that implies no starting bits. */
994
995
0
      case OP_ACCEPT:
996
0
      case OP_ASSERT_ACCEPT:
997
1.96k
      case OP_ALLANY:
998
18.9k
      case OP_ANY:
999
19.6k
      case OP_ANYBYTE:
1000
27.6k
      case OP_CIRCM:
1001
27.6k
      case OP_CLOSE:
1002
27.6k
      case OP_COMMIT:
1003
27.6k
      case OP_COMMIT_ARG:
1004
31.9k
      case OP_COND:
1005
32.3k
      case OP_CREF:
1006
32.3k
      case OP_FALSE:
1007
32.3k
      case OP_TRUE:
1008
32.5k
      case OP_DNCREF:
1009
32.7k
      case OP_DNREF:
1010
32.8k
      case OP_DNREFI:
1011
32.8k
      case OP_DNRREF:
1012
34.7k
      case OP_DOLL:
1013
35.9k
      case OP_DOLLM:
1014
35.9k
      case OP_END:
1015
40.2k
      case OP_EOD:
1016
40.8k
      case OP_EODN:
1017
48.1k
      case OP_EXTUNI:
1018
49.0k
      case OP_FAIL:
1019
50.5k
      case OP_MARK:
1020
52.9k
      case OP_NOT:
1021
53.6k
      case OP_NOTEXACT:
1022
54.5k
      case OP_NOTEXACTI:
1023
57.0k
      case OP_NOTI:
1024
57.8k
      case OP_NOTMINPLUS:
1025
58.6k
      case OP_NOTMINPLUSI:
1026
59.1k
      case OP_NOTMINQUERY:
1027
59.9k
      case OP_NOTMINQUERYI:
1028
60.7k
      case OP_NOTMINSTAR:
1029
61.3k
      case OP_NOTMINSTARI:
1030
62.1k
      case OP_NOTMINUPTO:
1031
62.6k
      case OP_NOTMINUPTOI:
1032
63.4k
      case OP_NOTPLUS:
1033
64.4k
      case OP_NOTPLUSI:
1034
65.2k
      case OP_NOTPOSPLUS:
1035
65.9k
      case OP_NOTPOSPLUSI:
1036
66.8k
      case OP_NOTPOSQUERY:
1037
67.4k
      case OP_NOTPOSQUERYI:
1038
68.4k
      case OP_NOTPOSSTAR:
1039
68.9k
      case OP_NOTPOSSTARI:
1040
70.0k
      case OP_NOTPOSUPTO:
1041
71.0k
      case OP_NOTPOSUPTOI:
1042
194k
      case OP_NOTPROP:
1043
195k
      case OP_NOTQUERY:
1044
197k
      case OP_NOTQUERYI:
1045
198k
      case OP_NOTSTAR:
1046
199k
      case OP_NOTSTARI:
1047
200k
      case OP_NOTUPTO:
1048
201k
      case OP_NOTUPTOI:
1049
203k
      case OP_NOT_HSPACE:
1050
204k
      case OP_NOT_VSPACE:
1051
204k
      case OP_PRUNE:
1052
204k
      case OP_PRUNE_ARG:
1053
206k
      case OP_RECURSE:
1054
210k
      case OP_REF:
1055
220k
      case OP_REFI:
1056
221k
      case OP_REVERSE:
1057
222k
      case OP_RREF:
1058
223k
      case OP_SCOND:
1059
224k
      case OP_SET_SOM:
1060
224k
      case OP_SKIP:
1061
224k
      case OP_SKIP_ARG:
1062
225k
      case OP_SOD:
1063
227k
      case OP_SOM:
1064
227k
      case OP_THEN:
1065
227k
      case OP_THEN_ARG:
1066
227k
      return SSB_FAIL;
1067
1068
      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1069
      uses OP_CIRCM). Skip over it. */
1070
1071
730k
      case OP_CIRC:
1072
730k
      tcode += PRIV(OP_lengths)[OP_CIRC];
1073
730k
      break;
1074
1075
      /* A "real" property test implies no starting bits, but the fake property
1076
      PT_CLIST identifies a list of characters. These lists are short, as they
1077
      are used for characters with more than one "other case", so there is no
1078
      point in recognizing them for OP_NOTPROP. */
1079
1080
23.3k
      case OP_PROP:
1081
23.3k
      if (tcode[1] != PT_CLIST) return SSB_FAIL;
1082
21.2k
        {
1083
21.2k
        const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1084
92.0k
        while ((c = *p++) < NOTACHAR)
1085
70.8k
          {
1086
70.8k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1087
70.8k
          if (utf)
1088
61.5k
            {
1089
61.5k
            PCRE2_UCHAR buff[6];
1090
61.5k
            (void)PRIV(ord2utf)(c, buff);
1091
61.5k
            c = buff[0];
1092
61.5k
            }
1093
70.8k
#endif
1094
70.8k
          if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1095
70.8k
          }
1096
21.2k
        }
1097
21.2k
      try_next = FALSE;
1098
21.2k
      break;
1099
1100
      /* We can ignore word boundary tests. */
1101
1102
4.75k
      case OP_WORD_BOUNDARY:
1103
10.7k
      case OP_NOT_WORD_BOUNDARY:
1104
10.7k
      tcode++;
1105
10.7k
      break;
1106
1107
      /* If we hit a bracket or a positive lookahead assertion, recurse to set
1108
      bits from within the subpattern. If it can't find anything, we have to
1109
      give up. If it finds some mandatory character(s), we are done for this
1110
      branch. Otherwise, carry on scanning after the subpattern. */
1111
1112
1.17M
      case OP_BRA:
1113
1.18M
      case OP_SBRA:
1114
1.26M
      case OP_CBRA:
1115
1.27M
      case OP_SCBRA:
1116
1.27M
      case OP_BRAPOS:
1117
1.28M
      case OP_SBRAPOS:
1118
1.28M
      case OP_CBRAPOS:
1119
1.28M
      case OP_SCBRAPOS:
1120
1.29M
      case OP_ONCE:
1121
1.30M
      case OP_SCRIPT_RUN:
1122
1.33M
      case OP_ASSERT:
1123
1.33M
      case OP_ASSERT_NA:
1124
1.33M
      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1125
1.33M
      if (rc == SSB_DONE)
1126
117k
        {
1127
117k
        try_next = FALSE;
1128
117k
        }
1129
1.21M
      else if (rc == SSB_CONTINUE)
1130
1.19M
        {
1131
2.11M
        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1132
1.19M
        tcode += 1 + LINK_SIZE;
1133
1.19M
        }
1134
26.1k
      else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1135
1.31M
      break;
1136
1137
      /* If we hit ALT or KET, it means we haven't found anything mandatory in
1138
      this branch, though we might have found something optional. For ALT, we
1139
      continue with the next alternative, but we have to arrange that the final
1140
      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1141
      return SSB_CONTINUE: if this is the top level, that indicates failure,
1142
      but after a nested subpattern, it causes scanning to continue. */
1143
1144
1.31M
      case OP_ALT:
1145
403k
      yield = SSB_CONTINUE;
1146
403k
      try_next = FALSE;
1147
403k
      break;
1148
1149
1.12M
      case OP_KET:
1150
1.14M
      case OP_KETRMAX:
1151
1.15M
      case OP_KETRMIN:
1152
1.16M
      case OP_KETRPOS:
1153
1.16M
      return SSB_CONTINUE;
1154
1155
      /* Skip over callout */
1156
1157
2.10k
      case OP_CALLOUT:
1158
2.10k
      tcode += PRIV(OP_lengths)[OP_CALLOUT];
1159
2.10k
      break;
1160
1161
5.30k
      case OP_CALLOUT_STR:
1162
5.30k
      tcode += GET(tcode, 1 + 2*LINK_SIZE);
1163
5.30k
      break;
1164
1165
      /* Skip over lookbehind and negative lookahead assertions */
1166
1167
9.72k
      case OP_ASSERT_NOT:
1168
40.0k
      case OP_ASSERTBACK:
1169
217k
      case OP_ASSERTBACK_NOT:
1170
221k
      case OP_ASSERTBACK_NA:
1171
250k
      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1172
221k
      tcode += 1 + LINK_SIZE;
1173
221k
      break;
1174
1175
      /* BRAZERO does the bracket, but carries on. */
1176
1177
233k
      case OP_BRAZERO:
1178
245k
      case OP_BRAMINZERO:
1179
249k
      case OP_BRAPOSZERO:
1180
249k
      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1181
249k
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1182
136k
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1183
69.4k
      tcode += 1 + LINK_SIZE;
1184
69.4k
      break;
1185
1186
      /* SKIPZERO skips the bracket. */
1187
1188
2.76k
      case OP_SKIPZERO:
1189
2.76k
      tcode++;
1190
3.44k
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1191
2.76k
      tcode += 1 + LINK_SIZE;
1192
2.76k
      break;
1193
1194
      /* Single-char * or ? sets the bit and tries the next item */
1195
1196
57.5k
      case OP_STAR:
1197
61.5k
      case OP_MINSTAR:
1198
83.4k
      case OP_POSSTAR:
1199
106k
      case OP_QUERY:
1200
112k
      case OP_MINQUERY:
1201
134k
      case OP_POSQUERY:
1202
134k
      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1203
134k
      break;
1204
1205
5.07k
      case OP_STARI:
1206
12.1k
      case OP_MINSTARI:
1207
22.8k
      case OP_POSSTARI:
1208
31.8k
      case OP_QUERYI:
1209
33.1k
      case OP_MINQUERYI:
1210
43.5k
      case OP_POSQUERYI:
1211
43.5k
      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1212
43.5k
      break;
1213
1214
      /* Single-char upto sets the bit and tries the next */
1215
1216
5.86k
      case OP_UPTO:
1217
6.69k
      case OP_MINUPTO:
1218
10.6k
      case OP_POSUPTO:
1219
10.6k
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1220
10.6k
      break;
1221
1222
5.66k
      case OP_UPTOI:
1223
6.82k
      case OP_MINUPTOI:
1224
10.5k
      case OP_POSUPTOI:
1225
10.5k
      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1226
10.5k
      break;
1227
1228
      /* At least one single char sets the bit and stops */
1229
1230
37.7k
      case OP_EXACT:
1231
37.7k
      tcode += IMM2_SIZE;
1232
      /* Fall through */
1233
666k
      case OP_CHAR:
1234
954k
      case OP_PLUS:
1235
961k
      case OP_MINPLUS:
1236
973k
      case OP_POSPLUS:
1237
973k
      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1238
973k
      try_next = FALSE;
1239
973k
      break;
1240
1241
2.82k
      case OP_EXACTI:
1242
2.82k
      tcode += IMM2_SIZE;
1243
      /* Fall through */
1244
204k
      case OP_CHARI:
1245
209k
      case OP_PLUSI:
1246
210k
      case OP_MINPLUSI:
1247
221k
      case OP_POSPLUSI:
1248
221k
      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1249
221k
      try_next = FALSE;
1250
221k
      break;
1251
1252
      /* Special spacing and line-terminating items. These recognize specific
1253
      lists of characters. The difference between VSPACE and ANYNL is that the
1254
      latter can match the two-character CRLF sequence, but that is not
1255
      relevant for finding the first character, so their code here is
1256
      identical. */
1257
1258
7.35k
      case OP_HSPACE:
1259
7.35k
      SET_BIT(CHAR_HT);
1260
7.35k
      SET_BIT(CHAR_SPACE);
1261
1262
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1263
      the bits for 0xA0 and for code units >= 255, independently of UTF. */
1264
1265
#if PCRE2_CODE_UNIT_WIDTH != 8
1266
      SET_BIT(0xA0);
1267
      SET_BIT(0xFF);
1268
#else
1269
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1270
      units of horizontal space characters. */
1271
1272
7.35k
#ifdef SUPPORT_UNICODE
1273
7.35k
      if (utf)
1274
2.32k
        {
1275
2.32k
        SET_BIT(0xC2);  /* For U+00A0 */
1276
2.32k
        SET_BIT(0xE1);  /* For U+1680, U+180E */
1277
2.32k
        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1278
2.32k
        SET_BIT(0xE3);  /* For U+3000 */
1279
2.32k
        }
1280
5.02k
      else
1281
5.02k
#endif
1282
      /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1283
      the code is EBCDIC. */
1284
5.02k
        {
1285
5.02k
#ifndef EBCDIC
1286
5.02k
        SET_BIT(0xA0);
1287
5.02k
#endif  /* Not EBCDIC */
1288
5.02k
        }
1289
7.35k
#endif  /* 8-bit support */
1290
1291
7.35k
      try_next = FALSE;
1292
7.35k
      break;
1293
1294
13.3k
      case OP_ANYNL:
1295
19.2k
      case OP_VSPACE:
1296
19.2k
      SET_BIT(CHAR_LF);
1297
19.2k
      SET_BIT(CHAR_VT);
1298
19.2k
      SET_BIT(CHAR_FF);
1299
19.2k
      SET_BIT(CHAR_CR);
1300
1301
      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1302
      the bits for NEL and for code units >= 255, independently of UTF. */
1303
1304
#if PCRE2_CODE_UNIT_WIDTH != 8
1305
      SET_BIT(CHAR_NEL);
1306
      SET_BIT(0xFF);
1307
#else
1308
      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1309
      units of vertical space characters. */
1310
1311
19.2k
#ifdef SUPPORT_UNICODE
1312
19.2k
      if (utf)
1313
11.5k
        {
1314
11.5k
        SET_BIT(0xC2);  /* For U+0085 (NEL) */
1315
11.5k
        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1316
11.5k
        }
1317
7.75k
      else
1318
7.75k
#endif
1319
      /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1320
7.75k
        {
1321
7.75k
        SET_BIT(CHAR_NEL);
1322
7.75k
        }
1323
19.2k
#endif  /* 8-bit support */
1324
1325
19.2k
      try_next = FALSE;
1326
19.2k
      break;
1327
1328
      /* Single character types set the bits and stop. Note that if PCRE2_UCP
1329
      is set, we do not see these opcodes because \d etc are converted to
1330
      properties. Therefore, these apply in the case when only characters less
1331
      than 256 are recognized to match the types. */
1332
1333
31.4k
      case OP_NOT_DIGIT:
1334
31.4k
      set_nottype_bits(re, cbit_digit, table_limit);
1335
31.4k
      try_next = FALSE;
1336
31.4k
      break;
1337
1338
22.6k
      case OP_DIGIT:
1339
22.6k
      set_type_bits(re, cbit_digit, table_limit);
1340
22.6k
      try_next = FALSE;
1341
22.6k
      break;
1342
1343
15.8k
      case OP_NOT_WHITESPACE:
1344
15.8k
      set_nottype_bits(re, cbit_space, table_limit);
1345
15.8k
      try_next = FALSE;
1346
15.8k
      break;
1347
1348
34.9k
      case OP_WHITESPACE:
1349
34.9k
      set_type_bits(re, cbit_space, table_limit);
1350
34.9k
      try_next = FALSE;
1351
34.9k
      break;
1352
1353
19.9k
      case OP_NOT_WORDCHAR:
1354
19.9k
      set_nottype_bits(re, cbit_word, table_limit);
1355
19.9k
      try_next = FALSE;
1356
19.9k
      break;
1357
1358
24.5k
      case OP_WORDCHAR:
1359
24.5k
      set_type_bits(re, cbit_word, table_limit);
1360
24.5k
      try_next = FALSE;
1361
24.5k
      break;
1362
1363
      /* One or more character type fudges the pointer and restarts, knowing
1364
      it will hit a single character type and stop there. */
1365
1366
15.8k
      case OP_TYPEPLUS:
1367
16.7k
      case OP_TYPEMINPLUS:
1368
25.6k
      case OP_TYPEPOSPLUS:
1369
25.6k
      tcode++;
1370
25.6k
      break;
1371
1372
25.0k
      case OP_TYPEEXACT:
1373
25.0k
      tcode += 1 + IMM2_SIZE;
1374
25.0k
      break;
1375
1376
      /* Zero or more repeats of character types set the bits and then
1377
      try again. */
1378
1379
2.53k
      case OP_TYPEUPTO:
1380
3.87k
      case OP_TYPEMINUPTO:
1381
6.39k
      case OP_TYPEPOSUPTO:
1382
6.39k
      tcode += IMM2_SIZE;  /* Fall through */
1383
1384
78.4k
      case OP_TYPESTAR:
1385
84.8k
      case OP_TYPEMINSTAR:
1386
400k
      case OP_TYPEPOSSTAR:
1387
447k
      case OP_TYPEQUERY:
1388
453k
      case OP_TYPEMINQUERY:
1389
463k
      case OP_TYPEPOSQUERY:
1390
463k
      switch(tcode[1])
1391
463k
        {
1392
2.34k
        default:
1393
6.87k
        case OP_ANY:
1394
16.9k
        case OP_ALLANY:
1395
16.9k
        return SSB_FAIL;
1396
1397
10.5k
        case OP_HSPACE:
1398
10.5k
        SET_BIT(CHAR_HT);
1399
10.5k
        SET_BIT(CHAR_SPACE);
1400
1401
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1402
        the bits for 0xA0 and for code units >= 255, independently of UTF. */
1403
1404
#if PCRE2_CODE_UNIT_WIDTH != 8
1405
        SET_BIT(0xA0);
1406
        SET_BIT(0xFF);
1407
#else
1408
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1409
        units of horizontal space characters. */
1410
1411
10.5k
#ifdef SUPPORT_UNICODE
1412
10.5k
        if (utf)
1413
8.48k
          {
1414
8.48k
          SET_BIT(0xC2);  /* For U+00A0 */
1415
8.48k
          SET_BIT(0xE1);  /* For U+1680, U+180E */
1416
8.48k
          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1417
8.48k
          SET_BIT(0xE3);  /* For U+3000 */
1418
8.48k
          }
1419
2.03k
        else
1420
2.03k
#endif
1421
        /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1422
        the code is EBCDIC. */
1423
2.03k
          {
1424
2.03k
#ifndef EBCDIC
1425
2.03k
          SET_BIT(0xA0);
1426
2.03k
#endif  /* Not EBCDIC */
1427
2.03k
          }
1428
10.5k
#endif  /* 8-bit support */
1429
10.5k
        break;
1430
1431
2.44k
        case OP_ANYNL:
1432
9.84k
        case OP_VSPACE:
1433
9.84k
        SET_BIT(CHAR_LF);
1434
9.84k
        SET_BIT(CHAR_VT);
1435
9.84k
        SET_BIT(CHAR_FF);
1436
9.84k
        SET_BIT(CHAR_CR);
1437
1438
        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1439
        the bits for NEL and for code units >= 255, independently of UTF. */
1440
1441
#if PCRE2_CODE_UNIT_WIDTH != 8
1442
        SET_BIT(CHAR_NEL);
1443
        SET_BIT(0xFF);
1444
#else
1445
        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1446
        units of vertical space characters. */
1447
1448
9.84k
#ifdef SUPPORT_UNICODE
1449
9.84k
        if (utf)
1450
4.50k
          {
1451
4.50k
          SET_BIT(0xC2);  /* For U+0085 (NEL) */
1452
4.50k
          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1453
4.50k
          }
1454
5.34k
        else
1455
5.34k
#endif
1456
        /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1457
5.34k
          {
1458
5.34k
          SET_BIT(CHAR_NEL);
1459
5.34k
          }
1460
9.84k
#endif  /* 8-bit support */
1461
9.84k
        break;
1462
1463
14.2k
        case OP_NOT_DIGIT:
1464
14.2k
        set_nottype_bits(re, cbit_digit, table_limit);
1465
14.2k
        break;
1466
1467
26.1k
        case OP_DIGIT:
1468
26.1k
        set_type_bits(re, cbit_digit, table_limit);
1469
26.1k
        break;
1470
1471
21.2k
        case OP_NOT_WHITESPACE:
1472
21.2k
        set_nottype_bits(re, cbit_space, table_limit);
1473
21.2k
        break;
1474
1475
326k
        case OP_WHITESPACE:
1476
326k
        set_type_bits(re, cbit_space, table_limit);
1477
326k
        break;
1478
1479
16.2k
        case OP_NOT_WORDCHAR:
1480
16.2k
        set_nottype_bits(re, cbit_word, table_limit);
1481
16.2k
        break;
1482
1483
21.8k
        case OP_WORDCHAR:
1484
21.8k
        set_type_bits(re, cbit_word, table_limit);
1485
21.8k
        break;
1486
463k
        }
1487
1488
446k
      tcode += 2;
1489
446k
      break;
1490
1491
      /* Extended class: if there are any property checks, or if this is a
1492
      negative XCLASS without a map, give up. If there are no property checks,
1493
      there must be wide characters on the XCLASS list, because otherwise an
1494
      XCLASS would not have been created. This means that code points >= 255
1495
      are potential starters. In the UTF-8 case we can scan them and set bits
1496
      for the relevant leading bytes. */
1497
1498
0
#ifdef SUPPORT_WIDE_CHARS
1499
36.8k
      case OP_XCLASS:
1500
36.8k
      xclassflags = tcode[1 + LINK_SIZE];
1501
36.8k
      if ((xclassflags & XCL_HASPROP) != 0 ||
1502
35.5k
          (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1503
1.62k
        return SSB_FAIL;
1504
1505
      /* We have a positive XCLASS or a negative one without a map. Set up the
1506
      map pointer if there is one, and fall through. */
1507
1508
35.2k
      classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1509
35.2k
        (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1510
1511
      /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1512
      then jump to handle the map. */
1513
1514
35.2k
#if PCRE2_CODE_UNIT_WIDTH == 8
1515
35.2k
      if (utf && (xclassflags & XCL_NOT) == 0)
1516
24.3k
        {
1517
24.3k
        PCRE2_UCHAR b, e;
1518
24.3k
        PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1519
24.3k
        tcode += GET(tcode, 1);
1520
1521
164k
        for (;;) switch (*p++)
1522
164k
          {
1523
60.7k
          case XCL_SINGLE:
1524
60.7k
          b = *p++;
1525
135k
          while ((*p & 0xc0) == 0x80) p++;
1526
60.7k
          re->start_bitmap[b/8] |= (1u << (b&7));
1527
60.7k
          break;
1528
1529
79.6k
          case XCL_RANGE:
1530
79.6k
          b = *p++;
1531
221k
          while ((*p & 0xc0) == 0x80) p++;
1532
79.6k
          e = *p++;
1533
248k
          while ((*p & 0xc0) == 0x80) p++;
1534
777k
          for (; b <= e; b++)
1535
697k
            re->start_bitmap[b/8] |= (1u << (b&7));
1536
79.6k
          break;
1537
1538
24.3k
          case XCL_END:
1539
24.3k
          goto HANDLE_CLASSMAP;
1540
1541
0
          default:
1542
0
          return SSB_UNKNOWN;   /* Internal error, should not occur */
1543
164k
          }
1544
24.3k
        }
1545
10.9k
#endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1546
10.9k
#endif  /* SUPPORT_WIDE_CHARS */
1547
1548
      /* It seems that the fall through comment must be outside the #ifdef if
1549
      it is to avoid the gcc compiler warning. */
1550
1551
      /* Fall through */
1552
1553
      /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1554
      in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1555
      because it starts a character with a value > 255. In 8-bit non-UTF mode,
1556
      there is no difference between CLASS and NCLASS. In all other wide
1557
      character modes, set the 0xFF bit to indicate code units >= 255. */
1558
1559
27.9k
      case OP_NCLASS:
1560
27.9k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1561
27.9k
      if (utf)
1562
12.0k
        {
1563
12.0k
        re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1564
12.0k
        memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1565
12.0k
        }
1566
#elif PCRE2_CODE_UNIT_WIDTH != 8
1567
      SET_BIT(0xFF);                             /* For characters >= 255 */
1568
#endif
1569
      /* Fall through */
1570
1571
      /* Enter here for a positive non-XCLASS. If we have fallen through from
1572
      an XCLASS, classmap will already be set; just advance the code pointer.
1573
      Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1574
1575
211k
      case OP_CLASS:
1576
211k
      if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1577
201k
        {
1578
201k
        classmap = (uint8_t *)(++tcode);
1579
201k
        tcode += 32 / sizeof(PCRE2_UCHAR);
1580
201k
        }
1581
1582
      /* When wide characters are supported, classmap may be NULL. In UTF-8
1583
      (sic) mode, the bits in a class bit map correspond to character values,
1584
      not to byte values. However, the bit map we are constructing is for byte
1585
      values. So we have to do a conversion for characters whose code point is
1586
      greater than 127. In fact, there are only two possible starting bytes for
1587
      characters in the range 128 - 255. */
1588
1589
211k
#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1590
236k
      HANDLE_CLASSMAP:
1591
236k
#endif
1592
236k
      if (classmap != NULL)
1593
234k
        {
1594
234k
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1595
234k
        if (utf)
1596
47.9k
          {
1597
814k
          for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1598
3.18M
          for (c = 128; c < 256; c++)
1599
3.13M
            {
1600
3.13M
            if ((classmap[c/8] & (1u << (c&7))) != 0)
1601
48.5k
              {
1602
48.5k
              int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1603
48.5k
              re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1604
48.5k
              c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1605
48.5k
              }
1606
3.13M
            }
1607
47.9k
          }
1608
186k
        else
1609
186k
#endif
1610
        /* In all modes except UTF-8, the two bit maps are compatible. */
1611
1612
186k
          {
1613
6.14M
          for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1614
186k
          }
1615
234k
        }
1616
1617
      /* Act on what follows the class. For a zero minimum repeat, continue;
1618
      otherwise stop processing. */
1619
1620
236k
      switch (*tcode)
1621
236k
        {
1622
17.7k
        case OP_CRSTAR:
1623
20.6k
        case OP_CRMINSTAR:
1624
31.2k
        case OP_CRQUERY:
1625
36.8k
        case OP_CRMINQUERY:
1626
48.8k
        case OP_CRPOSSTAR:
1627
52.6k
        case OP_CRPOSQUERY:
1628
52.6k
        tcode++;
1629
52.6k
        break;
1630
1631
10.6k
        case OP_CRRANGE:
1632
13.3k
        case OP_CRMINRANGE:
1633
18.0k
        case OP_CRPOSRANGE:
1634
18.0k
        if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1635
7.10k
          else try_next = FALSE;
1636
18.0k
        break;
1637
1638
165k
        default:
1639
165k
        try_next = FALSE;
1640
165k
        break;
1641
236k
        }
1642
236k
      break; /* End of class handling case */
1643
6.69M
      }      /* End of switch for opcodes */
1644
6.69M
    }        /* End of try_next loop */
1645
1646
2.08M
  code += GET(code, 1);   /* Advance to next branch */
1647
2.08M
  }
1648
2.44M
while (*code == OP_ALT);
1649
1650
824k
return yield;
1651
2.44M
}
1652
1653
1654
1655
/*************************************************
1656
*          Study a compiled expression           *
1657
*************************************************/
1658
1659
/* This function is handed a compiled expression that it must study to produce
1660
information that will speed up the matching.
1661
1662
Argument:
1663
  re       points to the compiled expression
1664
1665
Returns:   0 normally; non-zero should never normally occur
1666
           1 unknown opcode in set_start_bits
1667
           2 missing capturing bracket
1668
           3 unknown opcode in find_minlength
1669
*/
1670
1671
int
1672
PRIV(study)(pcre2_real_code *re)
1673
1.03M
{
1674
1.03M
int count = 0;
1675
1.03M
PCRE2_UCHAR *code;
1676
1.03M
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1677
1.03M
BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1678
1679
/* Find start of compiled code */
1680
1681
1.03M
code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1682
1.03M
  re->name_entry_size * re->name_count;
1683
1684
/* For a pattern that has a first code unit, or a multiline pattern that
1685
matches only at "line start", there is no point in seeking a list of starting
1686
code units. */
1687
1688
1.03M
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1689
856k
  {
1690
856k
  int depth = 0;
1691
856k
  int rc = set_start_bits(re, code, utf, ucp, &depth);
1692
856k
  if (rc == SSB_UNKNOWN) return 1;
1693
1694
  /* If a list of starting code units was set up, scan the list to see if only
1695
  one or two were listed. Having only one listed is rare because usually a
1696
  single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1697
  If two are listed, see if they are caseless versions of the same character;
1698
  if so we can replace the list with a caseless first code unit. This gives
1699
  better performance and is plausibly worth doing for patterns such as [Ww]ord
1700
  or (word|WORD). */
1701
1702
856k
  if (rc == SSB_DONE)
1703
515k
    {
1704
515k
    int i;
1705
515k
    int a = -1;
1706
515k
    int b = -1;
1707
515k
    uint8_t *p = re->start_bitmap;
1708
515k
    uint32_t flags = PCRE2_FIRSTMAPSET;
1709
1710
2.46M
    for (i = 0; i < 256; p++, i += 8)
1711
2.46M
      {
1712
2.46M
      uint8_t x = *p;
1713
2.46M
      if (x != 0)
1714
565k
        {
1715
565k
        int c;
1716
565k
        uint8_t y = x & (~x + 1);   /* Least significant bit */
1717
565k
        if (y != x) goto DONE;      /* More than one bit set */
1718
1719
        /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1720
        all wide characters", so we cannot use it here. */
1721
1722
#if PCRE2_CODE_UNIT_WIDTH != 8
1723
        if (i == 248 && x == 0x80) goto DONE;
1724
#endif
1725
1726
        /* Compute the character value */
1727
1728
97.0k
        c = i;
1729
97.0k
        switch (x)
1730
97.0k
          {
1731
10.9k
          case 1:   break;
1732
11.8k
          case 2:   c += 1; break;  case 4:  c += 2; break;
1733
17.5k
          case 8:   c += 3; break;  case 16: c += 4; break;
1734
14.3k
          case 32:  c += 5; break;  case 64: c += 6; break;
1735
8.97k
          case 128: c += 7; break;
1736
97.0k
          }
1737
1738
        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1739
        mode, only values < 128 can be used. In all the other cases, c is a
1740
        character value. */
1741
1742
97.0k
#if PCRE2_CODE_UNIT_WIDTH == 8
1743
97.0k
        if (utf && c > 127) goto DONE;
1744
94.3k
#endif
1745
94.3k
        if (a < 0) a = c;   /* First one found, save in a */
1746
42.5k
        else if (b < 0)     /* Second one found */
1747
40.1k
          {
1748
40.1k
          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1749
1750
40.1k
#ifdef SUPPORT_UNICODE
1751
40.1k
          if (utf || ucp)
1752
5.26k
            {
1753
5.26k
            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
1754
2.51k
            if (c > 127) d = UCD_OTHERCASE(c);
1755
2.51k
            }
1756
37.4k
#endif  /* SUPPORT_UNICODE */
1757
1758
37.4k
          if (d != a) goto DONE;   /* Not the other case of a */
1759
5.91k
          b = c;                   /* Save second in b */
1760
5.91k
          }
1761
2.39k
        else goto DONE;   /* More than two characters found */
1762
94.3k
        }
1763
2.46M
      }
1764
1765
    /* Replace the start code unit bits with a first code unit, but only if it
1766
    is not the same as a required later code unit. This is because a search for
1767
    a required code unit starts after an explicit first code unit, but at a
1768
    code unit found from the bitmap. Patterns such as /a*a/ don't work
1769
    if both the start unit and required unit are the same. */
1770
1771
7.16k
    if (a >= 0 &&
1772
6.88k
        (
1773
6.88k
        (re->flags & PCRE2_LASTSET) == 0 ||
1774
5.55k
          (
1775
5.55k
          re->last_codeunit != (uint32_t)a &&
1776
4.10k
          (b < 0 || re->last_codeunit != (uint32_t)b)
1777
5.55k
          )
1778
6.88k
        ))
1779
4.54k
      {
1780
4.54k
      re->first_codeunit = a;
1781
4.54k
      flags = PCRE2_FIRSTSET;
1782
4.54k
      if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
1783
4.54k
      }
1784
1785
515k
    DONE:
1786
515k
    re->flags |= flags;
1787
515k
    }
1788
856k
  }
1789
1790
/* Find the minimum length of subject string. If the pattern can match an empty
1791
string, the minimum length is already known. If the pattern contains (*ACCEPT)
1792
all bets are off, and we don't even try to find a minimum length. If there are
1793
more back references than the size of the vector we are going to cache them in,
1794
do nothing. A pattern that complicated will probably take a long time to
1795
analyze and may in any case turn out to be too complicated. Note that back
1796
reference minima are held as 16-bit numbers. */
1797
1798
1.03M
if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
1799
921k
     re->top_backref <= MAX_CACHE_BACKREF)
1800
921k
  {
1801
921k
  int min;
1802
921k
  int backref_cache[MAX_CACHE_BACKREF+1];
1803
921k
  backref_cache[0] = 0;    /* Highest one that is set */
1804
921k
  min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1805
921k
  switch(min)
1806
921k
    {
1807
3.16k
    case -1:  /* \C in UTF mode or over-complex regex */
1808
3.16k
    break;    /* Leave minlength unchanged (will be zero) */
1809
1810
0
    case -2:
1811
0
    return 2; /* missing capturing bracket */
1812
1813
0
    case -3:
1814
0
    return 3; /* unrecognized opcode */
1815
1816
918k
    default:
1817
918k
    re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
1818
918k
    break;
1819
921k
    }
1820
921k
  }
1821
1822
1.03M
return 0;
1823
1.03M
}
1824
1825
/* End of pcre2_study.c */