Coverage Report

Created: 2025-07-12 06:33

/src/glib/glib/pcre/pcre_study.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
           Copyright (c) 1997-2012 University of Cambridge
10
11
-----------------------------------------------------------------------------
12
Redistribution and use in source and binary forms, with or without
13
modification, are permitted provided that the following conditions are met:
14
15
    * Redistributions of source code must retain the above copyright notice,
16
      this list of conditions and the following disclaimer.
17
18
    * Redistributions in binary form must reproduce the above copyright
19
      notice, this list of conditions and the following disclaimer in the
20
      documentation and/or other materials provided with the distribution.
21
22
    * Neither the name of the University of Cambridge nor the names of its
23
      contributors may be used to endorse or promote products derived from
24
      this software without specific prior written permission.
25
26
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36
POSSIBILITY OF SUCH DAMAGE.
37
-----------------------------------------------------------------------------
38
*/
39
40
41
/* This module contains the external function pcre_study(), along with local
42
supporting functions. */
43
44
45
#include "config.h"
46
47
#include "pcre_internal.h"
48
49
0
#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
50
51
/* Returns from set_start_bits() */
52
53
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
54
55
56
57
/*************************************************
58
*   Find the minimum subject length for a group  *
59
*************************************************/
60
61
/* Scan a parenthesized group and compute the minimum length of subject that
62
is needed to match it. This is a lower bound; it does not mean there is a
63
string of that length that matches. In UTF8 mode, the result is in characters
64
rather than bytes.
65
66
Arguments:
67
  code            pointer to start of group (the bracket)
68
  startcode       pointer to start of the whole pattern
69
  options         the compiling options
70
  int             RECURSE depth
71
72
Returns:   the minimum length
73
           -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
74
           -2 internal error (missing capturing bracket)
75
           -3 internal error (opcode not listed)
76
*/
77
78
static int
79
find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
80
  int recurse_depth)
81
0
{
82
0
int length = -1;
83
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
84
0
BOOL utf = (options & PCRE_UTF8) != 0;
85
0
BOOL had_recurse = FALSE;
86
0
int branchlength = 0;
87
0
pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
88
89
0
if (*code == OP_CBRA || *code == OP_SCBRA ||
90
0
    *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
91
92
/* Scan along the opcodes for this branch. If we get to the end of the
93
branch, check the length against that of the other branches. */
94
95
0
for (;;)
96
0
  {
97
0
  int d, min;
98
0
  pcre_uchar *cs, *ce;
99
0
  int op = *cc;
100
101
0
  switch (op)
102
0
    {
103
0
    case OP_COND:
104
0
    case OP_SCOND:
105
106
    /* If there is only one branch in a condition, the implied branch has zero
107
    length, so we don't add anything. This covers the DEFINE "condition"
108
    automatically. */
109
110
0
    cs = cc + GET(cc, 1);
111
0
    if (*cs != OP_ALT)
112
0
      {
113
0
      cc = cs + 1 + LINK_SIZE;
114
0
      break;
115
0
      }
116
117
    /* Otherwise we can fall through and treat it the same as any other
118
    subpattern. */
119
120
0
    case OP_CBRA:
121
0
    case OP_SCBRA:
122
0
    case OP_BRA:
123
0
    case OP_SBRA:
124
0
    case OP_CBRAPOS:
125
0
    case OP_SCBRAPOS:
126
0
    case OP_BRAPOS:
127
0
    case OP_SBRAPOS:
128
0
    case OP_ONCE:
129
0
    case OP_ONCE_NC:
130
0
    d = find_minlength(cc, startcode, options, recurse_depth);
131
0
    if (d < 0) return d;
132
0
    branchlength += d;
133
0
    do cc += GET(cc, 1); while (*cc == OP_ALT);
134
0
    cc += 1 + LINK_SIZE;
135
0
    break;
136
137
    /* ACCEPT makes things far too complicated; we have to give up. */
138
139
0
    case OP_ACCEPT:
140
0
    case OP_ASSERT_ACCEPT:
141
0
    return -1;
142
143
    /* Reached end of a branch; if it's a ket it is the end of a nested
144
    call. If it's ALT it is an alternation in a nested call. If it is END it's
145
    the end of the outer call. All can be handled by the same code. If an
146
    ACCEPT was previously encountered, use the length that was in force at that
147
    time, and pass back the shortest ACCEPT length. */
148
149
0
    case OP_ALT:
150
0
    case OP_KET:
151
0
    case OP_KETRMAX:
152
0
    case OP_KETRMIN:
153
0
    case OP_KETRPOS:
154
0
    case OP_END:
155
0
    if (length < 0 || (!had_recurse && branchlength < length))
156
0
      length = branchlength;
157
0
    if (op != OP_ALT) return length;
158
0
    cc += 1 + LINK_SIZE;
159
0
    branchlength = 0;
160
0
    had_recurse = FALSE;
161
0
    break;
162
163
    /* Skip over assertive subpatterns */
164
165
0
    case OP_ASSERT:
166
0
    case OP_ASSERT_NOT:
167
0
    case OP_ASSERTBACK:
168
0
    case OP_ASSERTBACK_NOT:
169
0
    do cc += GET(cc, 1); while (*cc == OP_ALT);
170
    /* Fall through */
171
172
    /* Skip over things that don't match chars */
173
174
0
    case OP_REVERSE:
175
0
    case OP_CREF:
176
0
    case OP_NCREF:
177
0
    case OP_RREF:
178
0
    case OP_NRREF:
179
0
    case OP_DEF:
180
0
    case OP_CALLOUT:
181
0
    case OP_SOD:
182
0
    case OP_SOM:
183
0
    case OP_EOD:
184
0
    case OP_EODN:
185
0
    case OP_CIRC:
186
0
    case OP_CIRCM:
187
0
    case OP_DOLL:
188
0
    case OP_DOLLM:
189
0
    case OP_NOT_WORD_BOUNDARY:
190
0
    case OP_WORD_BOUNDARY:
191
0
    cc += PRIV(OP_lengths)[*cc];
192
0
    break;
193
194
    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
195
196
0
    case OP_BRAZERO:
197
0
    case OP_BRAMINZERO:
198
0
    case OP_BRAPOSZERO:
199
0
    case OP_SKIPZERO:
200
0
    cc += PRIV(OP_lengths)[*cc];
201
0
    do cc += GET(cc, 1); while (*cc == OP_ALT);
202
0
    cc += 1 + LINK_SIZE;
203
0
    break;
204
205
    /* Handle literal characters and + repetitions */
206
207
0
    case OP_CHAR:
208
0
    case OP_CHARI:
209
0
    case OP_NOT:
210
0
    case OP_NOTI:
211
0
    case OP_PLUS:
212
0
    case OP_PLUSI:
213
0
    case OP_MINPLUS:
214
0
    case OP_MINPLUSI:
215
0
    case OP_POSPLUS:
216
0
    case OP_POSPLUSI:
217
0
    case OP_NOTPLUS:
218
0
    case OP_NOTPLUSI:
219
0
    case OP_NOTMINPLUS:
220
0
    case OP_NOTMINPLUSI:
221
0
    case OP_NOTPOSPLUS:
222
0
    case OP_NOTPOSPLUSI:
223
0
    branchlength++;
224
0
    cc += 2;
225
0
#ifdef SUPPORT_UTF
226
0
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
227
0
#endif
228
0
    break;
229
230
0
    case OP_TYPEPLUS:
231
0
    case OP_TYPEMINPLUS:
232
0
    case OP_TYPEPOSPLUS:
233
0
    branchlength++;
234
0
    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
235
0
    break;
236
237
    /* Handle exact repetitions. The count is already in characters, but we
238
    need to skip over a multibyte character in UTF8 mode.  */
239
240
0
    case OP_EXACT:
241
0
    case OP_EXACTI:
242
0
    case OP_NOTEXACT:
243
0
    case OP_NOTEXACTI:
244
0
    branchlength += GET2(cc,1);
245
0
    cc += 2 + IMM2_SIZE;
246
0
#ifdef SUPPORT_UTF
247
0
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
248
0
#endif
249
0
    break;
250
251
0
    case OP_TYPEEXACT:
252
0
    branchlength += GET2(cc,1);
253
0
    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
254
0
      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
255
0
    break;
256
257
    /* Handle single-char non-literal matchers */
258
259
0
    case OP_PROP:
260
0
    case OP_NOTPROP:
261
0
    cc += 2;
262
    /* Fall through */
263
264
0
    case OP_NOT_DIGIT:
265
0
    case OP_DIGIT:
266
0
    case OP_NOT_WHITESPACE:
267
0
    case OP_WHITESPACE:
268
0
    case OP_NOT_WORDCHAR:
269
0
    case OP_WORDCHAR:
270
0
    case OP_ANY:
271
0
    case OP_ALLANY:
272
0
    case OP_EXTUNI:
273
0
    case OP_HSPACE:
274
0
    case OP_NOT_HSPACE:
275
0
    case OP_VSPACE:
276
0
    case OP_NOT_VSPACE:
277
0
    branchlength++;
278
0
    cc++;
279
0
    break;
280
281
    /* "Any newline" might match two characters, but it also might match just
282
    one. */
283
284
0
    case OP_ANYNL:
285
0
    branchlength += 1;
286
0
    cc++;
287
0
    break;
288
289
    /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
290
    non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
291
    appear, but leave the code, just in case.) */
292
293
0
    case OP_ANYBYTE:
294
0
#ifdef SUPPORT_UTF
295
0
    if (utf) return -1;
296
0
#endif
297
0
    branchlength++;
298
0
    cc++;
299
0
    break;
300
301
    /* For repeated character types, we have to test for \p and \P, which have
302
    an extra two bytes of parameters. */
303
304
0
    case OP_TYPESTAR:
305
0
    case OP_TYPEMINSTAR:
306
0
    case OP_TYPEQUERY:
307
0
    case OP_TYPEMINQUERY:
308
0
    case OP_TYPEPOSSTAR:
309
0
    case OP_TYPEPOSQUERY:
310
0
    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
311
0
    cc += PRIV(OP_lengths)[op];
312
0
    break;
313
314
0
    case OP_TYPEUPTO:
315
0
    case OP_TYPEMINUPTO:
316
0
    case OP_TYPEPOSUPTO:
317
0
    if (cc[1 + IMM2_SIZE] == OP_PROP
318
0
      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
319
0
    cc += PRIV(OP_lengths)[op];
320
0
    break;
321
322
    /* Check a class for variable quantification */
323
324
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
325
0
    case OP_XCLASS:
326
0
    cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
327
    /* Fall through */
328
0
#endif
329
330
0
    case OP_CLASS:
331
0
    case OP_NCLASS:
332
0
    cc += PRIV(OP_lengths)[OP_CLASS];
333
334
0
    switch (*cc)
335
0
      {
336
0
      case OP_CRPLUS:
337
0
      case OP_CRMINPLUS:
338
0
      branchlength++;
339
      /* Fall through */
340
341
0
      case OP_CRSTAR:
342
0
      case OP_CRMINSTAR:
343
0
      case OP_CRQUERY:
344
0
      case OP_CRMINQUERY:
345
0
      cc++;
346
0
      break;
347
348
0
      case OP_CRRANGE:
349
0
      case OP_CRMINRANGE:
350
0
      branchlength += GET2(cc,1);
351
0
      cc += 1 + 2 * IMM2_SIZE;
352
0
      break;
353
354
0
      default:
355
0
      branchlength++;
356
0
      break;
357
0
      }
358
0
    break;
359
360
    /* Backreferences and subroutine calls are treated in the same way: we find
361
    the minimum length for the subpattern. A recursion, however, causes an
362
    a flag to be set that causes the length of this branch to be ignored. The
363
    logic is that a recursion can only make sense if there is another
364
    alternation that stops the recursing. That will provide the minimum length
365
    (when no recursion happens). A backreference within the group that it is
366
    referencing behaves in the same way.
367
368
    If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
369
    matches an empty string (by default it causes a matching failure), so in
370
    that case we must set the minimum length to zero. */
371
372
0
    case OP_REF:
373
0
    case OP_REFI:
374
0
    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
375
0
      {
376
0
      ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
377
0
      if (cs == NULL) return -2;
378
0
      do ce += GET(ce, 1); while (*ce == OP_ALT);
379
0
      if (cc > cs && cc < ce)
380
0
        {
381
0
        d = 0;
382
0
        had_recurse = TRUE;
383
0
        }
384
0
      else
385
0
        {
386
0
        d = find_minlength(cs, startcode, options, recurse_depth);
387
0
        }
388
0
      }
389
0
    else d = 0;
390
0
    cc += 1 + IMM2_SIZE;
391
392
    /* Handle repeated back references */
393
394
0
    switch (*cc)
395
0
      {
396
0
      case OP_CRSTAR:
397
0
      case OP_CRMINSTAR:
398
0
      case OP_CRQUERY:
399
0
      case OP_CRMINQUERY:
400
0
      min = 0;
401
0
      cc++;
402
0
      break;
403
404
0
      case OP_CRPLUS:
405
0
      case OP_CRMINPLUS:
406
0
      min = 1;
407
0
      cc++;
408
0
      break;
409
410
0
      case OP_CRRANGE:
411
0
      case OP_CRMINRANGE:
412
0
      min = GET2(cc, 1);
413
0
      cc += 1 + 2 * IMM2_SIZE;
414
0
      break;
415
416
0
      default:
417
0
      min = 1;
418
0
      break;
419
0
      }
420
421
0
    branchlength += min * d;
422
0
    break;
423
424
    /* We can easily detect direct recursion, but not mutual recursion. This is
425
    caught by a recursion depth count. */
426
427
0
    case OP_RECURSE:
428
0
    cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
429
0
    do ce += GET(ce, 1); while (*ce == OP_ALT);
430
0
    if ((cc > cs && cc < ce) || recurse_depth > 10)
431
0
      had_recurse = TRUE;
432
0
    else
433
0
      {
434
0
      branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
435
0
      }
436
0
    cc += 1 + LINK_SIZE;
437
0
    break;
438
439
    /* Anything else does not or need not match a character. We can get the
440
    item's length from the table, but for those that can match zero occurrences
441
    of a character, we must take special action for UTF-8 characters. As it
442
    happens, the "NOT" versions of these opcodes are used at present only for
443
    ASCII characters, so they could be omitted from this list. However, in
444
    future that may change, so we include them here so as not to leave a
445
    gotcha for a future maintainer. */
446
447
0
    case OP_UPTO:
448
0
    case OP_UPTOI:
449
0
    case OP_NOTUPTO:
450
0
    case OP_NOTUPTOI:
451
0
    case OP_MINUPTO:
452
0
    case OP_MINUPTOI:
453
0
    case OP_NOTMINUPTO:
454
0
    case OP_NOTMINUPTOI:
455
0
    case OP_POSUPTO:
456
0
    case OP_POSUPTOI:
457
0
    case OP_NOTPOSUPTO:
458
0
    case OP_NOTPOSUPTOI:
459
460
0
    case OP_STAR:
461
0
    case OP_STARI:
462
0
    case OP_NOTSTAR:
463
0
    case OP_NOTSTARI:
464
0
    case OP_MINSTAR:
465
0
    case OP_MINSTARI:
466
0
    case OP_NOTMINSTAR:
467
0
    case OP_NOTMINSTARI:
468
0
    case OP_POSSTAR:
469
0
    case OP_POSSTARI:
470
0
    case OP_NOTPOSSTAR:
471
0
    case OP_NOTPOSSTARI:
472
473
0
    case OP_QUERY:
474
0
    case OP_QUERYI:
475
0
    case OP_NOTQUERY:
476
0
    case OP_NOTQUERYI:
477
0
    case OP_MINQUERY:
478
0
    case OP_MINQUERYI:
479
0
    case OP_NOTMINQUERY:
480
0
    case OP_NOTMINQUERYI:
481
0
    case OP_POSQUERY:
482
0
    case OP_POSQUERYI:
483
0
    case OP_NOTPOSQUERY:
484
0
    case OP_NOTPOSQUERYI:
485
486
0
    cc += PRIV(OP_lengths)[op];
487
0
#ifdef SUPPORT_UTF
488
0
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
489
0
#endif
490
0
    break;
491
492
    /* Skip these, but we need to add in the name length. */
493
494
0
    case OP_MARK:
495
0
    case OP_PRUNE_ARG:
496
0
    case OP_SKIP_ARG:
497
0
    case OP_THEN_ARG:
498
0
    cc += PRIV(OP_lengths)[op] + cc[1];
499
0
    break;
500
501
    /* The remaining opcodes are just skipped over. */
502
503
0
    case OP_CLOSE:
504
0
    case OP_COMMIT:
505
0
    case OP_FAIL:
506
0
    case OP_PRUNE:
507
0
    case OP_SET_SOM:
508
0
    case OP_SKIP:
509
0
    case OP_THEN:
510
0
    cc += PRIV(OP_lengths)[op];
511
0
    break;
512
513
    /* This should not occur: we list all opcodes explicitly so that when
514
    new ones get added they are properly considered. */
515
516
0
    default:
517
0
    return -3;
518
0
    }
519
0
  }
520
/* Control never gets here */
521
0
}
522
523
524
525
/*************************************************
526
*      Set a bit and maybe its alternate case    *
527
*************************************************/
528
529
/* Given a character, set its first byte's bit in the table, and also the
530
corresponding bit for the other version of a letter if we are caseless. In
531
UTF-8 mode, for characters greater than 127, we can only do the caseless thing
532
when Unicode property support is available.
533
534
Arguments:
535
  start_bits    points to the bit map
536
  p             points to the character
537
  caseless      the caseless flag
538
  cd            the block with char table pointers
539
  utf           TRUE for UTF-8 / UTF-16 mode
540
541
Returns:        pointer after the character
542
*/
543
544
static const pcre_uchar *
545
set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
546
  compile_data *cd, BOOL utf)
547
0
{
548
0
unsigned int c = *p;
549
550
0
#ifdef COMPILE_PCRE8
551
0
SET_BIT(c);
552
553
0
#ifdef SUPPORT_UTF
554
0
if (utf && c > 127)
555
0
  {
556
0
  GETCHARINC(c, p);
557
0
#ifdef SUPPORT_UCP
558
0
  if (caseless)
559
0
    {
560
0
    pcre_uchar buff[6];
561
0
    c = UCD_OTHERCASE(c);
562
0
    (void)PRIV(ord2utf)(c, buff);
563
0
    SET_BIT(buff[0]);
564
0
    }
565
0
#endif
566
0
  return p;
567
0
  }
568
0
#endif
569
570
/* Not UTF-8 mode, or character is less than 127. */
571
572
0
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
573
0
return p + 1;
574
0
#endif
575
576
#ifdef COMPILE_PCRE16
577
if (c > 0xff)
578
  {
579
  c = 0xff;
580
  caseless = FALSE;
581
  }
582
SET_BIT(c);
583
584
#ifdef SUPPORT_UTF
585
if (utf && c > 127)
586
  {
587
  GETCHARINC(c, p);
588
#ifdef SUPPORT_UCP
589
  if (caseless)
590
    {
591
    c = UCD_OTHERCASE(c);
592
    if (c > 0xff)
593
      c = 0xff;
594
    SET_BIT(c);
595
    }
596
#endif
597
  return p;
598
  }
599
#endif
600
601
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
602
return p + 1;
603
#endif
604
0
}
605
606
607
608
/*************************************************
609
*     Set bits for a positive character type     *
610
*************************************************/
611
612
/* This function sets starting bits for a character type. In UTF-8 mode, we can
613
only do a direct setting for bytes less than 128, as otherwise there can be
614
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
615
environment, the tables will only recognize ASCII characters anyway, but in at
616
least one Windows environment, some higher bytes bits were set in the tables.
617
So we deal with that case by considering the UTF-8 encoding.
618
619
Arguments:
620
  start_bits     the starting bitmap
621
  cbit type      the type of character wanted
622
  table_limit    32 for non-UTF-8; 16 for UTF-8
623
  cd             the block with char table pointers
624
625
Returns:         nothing
626
*/
627
628
static void
629
set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
630
  compile_data *cd)
631
0
{
632
0
int c;
633
0
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
634
0
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
635
0
if (table_limit == 32) return;
636
0
for (c = 128; c < 256; c++)
637
0
  {
638
0
  if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
639
0
    {
640
0
    pcre_uchar buff[6];
641
0
    (void)PRIV(ord2utf)(c, buff);
642
0
    SET_BIT(buff[0]);
643
0
    }
644
0
  }
645
0
#endif
646
0
}
647
648
649
/*************************************************
650
*     Set bits for a negative character type     *
651
*************************************************/
652
653
/* This function sets starting bits for a negative character type such as \D.
654
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
655
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
656
Unlike in the positive case, where we can set appropriate starting bits for
657
specific high-valued UTF-8 characters, in this case we have to set the bits for
658
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
659
0xc0 (192) for simplicity.
660
661
Arguments:
662
  start_bits     the starting bitmap
663
  cbit type      the type of character wanted
664
  table_limit    32 for non-UTF-8; 16 for UTF-8
665
  cd             the block with char table pointers
666
667
Returns:         nothing
668
*/
669
670
static void
671
set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
672
  compile_data *cd)
673
0
{
674
0
int c;
675
0
for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
676
0
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
677
0
if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
678
0
#endif
679
0
}
680
681
682
683
/*************************************************
684
*          Create bitmap of starting bytes       *
685
*************************************************/
686
687
/* This function scans a compiled unanchored expression recursively and
688
attempts to build a bitmap of the set of possible starting bytes. As time goes
689
by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
690
useful for parenthesized groups in patterns such as (a*)b where the group
691
provides some optional starting bytes but scanning must continue at the outer
692
level to find at least one mandatory byte. At the outermost level, this
693
function fails unless the result is SSB_DONE.
694
695
Arguments:
696
  code         points to an expression
697
  start_bits   points to a 32-byte table, initialized to 0
698
  utf          TRUE if in UTF-8 / UTF-16 mode
699
  cd           the block with char table pointers
700
701
Returns:       SSB_FAIL     => Failed to find any starting bytes
702
               SSB_DONE     => Found mandatory starting bytes
703
               SSB_CONTINUE => Found optional starting bytes
704
               SSB_UNKNOWN  => Hit an unrecognized opcode
705
*/
706
707
static int
708
set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
709
  compile_data *cd)
710
0
{
711
0
int c;
712
0
int yield = SSB_DONE;
713
0
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
714
0
int table_limit = utf? 16:32;
715
#else
716
int table_limit = 32;
717
#endif
718
719
#if 0
720
/* ========================================================================= */
721
/* The following comment and code was inserted in January 1999. In May 2006,
722
when it was observed to cause compiler warnings about unused values, I took it
723
out again. If anybody is still using OS/2, they will have to put it back
724
manually. */
725
726
/* This next statement and the later reference to dummy are here in order to
727
trick the optimizer of the IBM C compiler for OS/2 into generating correct
728
code. Apparently IBM isn't going to fix the problem, and we would rather not
729
disable optimization (in this module it actually makes a big difference, and
730
the pcre module can use all the optimization it can get). */
731
732
volatile int dummy;
733
/* ========================================================================= */
734
#endif
735
736
0
do
737
0
  {
738
0
  BOOL try_next = TRUE;
739
0
  const pcre_uchar *tcode = code + 1 + LINK_SIZE;
740
741
0
  if (*code == OP_CBRA || *code == OP_SCBRA ||
742
0
      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
743
744
0
  while (try_next)    /* Loop for items in this branch */
745
0
    {
746
0
    int rc;
747
748
0
    switch(*tcode)
749
0
      {
750
      /* If we reach something we don't understand, it means a new opcode has
751
      been created that hasn't been added to this code. Hopefully this problem
752
      will be discovered during testing. */
753
754
0
      default:
755
0
      return SSB_UNKNOWN;
756
757
      /* Fail for a valid opcode that implies no starting bits. */
758
759
0
      case OP_ACCEPT:
760
0
      case OP_ASSERT_ACCEPT:
761
0
      case OP_ALLANY:
762
0
      case OP_ANY:
763
0
      case OP_ANYBYTE:
764
0
      case OP_CIRC:
765
0
      case OP_CIRCM:
766
0
      case OP_CLOSE:
767
0
      case OP_COMMIT:
768
0
      case OP_COND:
769
0
      case OP_CREF:
770
0
      case OP_DEF:
771
0
      case OP_DOLL:
772
0
      case OP_DOLLM:
773
0
      case OP_END:
774
0
      case OP_EOD:
775
0
      case OP_EODN:
776
0
      case OP_EXTUNI:
777
0
      case OP_FAIL:
778
0
      case OP_MARK:
779
0
      case OP_NCREF:
780
0
      case OP_NOT:
781
0
      case OP_NOTEXACT:
782
0
      case OP_NOTEXACTI:
783
0
      case OP_NOTI:
784
0
      case OP_NOTMINPLUS:
785
0
      case OP_NOTMINPLUSI:
786
0
      case OP_NOTMINQUERY:
787
0
      case OP_NOTMINQUERYI:
788
0
      case OP_NOTMINSTAR:
789
0
      case OP_NOTMINSTARI:
790
0
      case OP_NOTMINUPTO:
791
0
      case OP_NOTMINUPTOI:
792
0
      case OP_NOTPLUS:
793
0
      case OP_NOTPLUSI:
794
0
      case OP_NOTPOSPLUS:
795
0
      case OP_NOTPOSPLUSI:
796
0
      case OP_NOTPOSQUERY:
797
0
      case OP_NOTPOSQUERYI:
798
0
      case OP_NOTPOSSTAR:
799
0
      case OP_NOTPOSSTARI:
800
0
      case OP_NOTPOSUPTO:
801
0
      case OP_NOTPOSUPTOI:
802
0
      case OP_NOTPROP:
803
0
      case OP_NOTQUERY:
804
0
      case OP_NOTQUERYI:
805
0
      case OP_NOTSTAR:
806
0
      case OP_NOTSTARI:
807
0
      case OP_NOTUPTO:
808
0
      case OP_NOTUPTOI:
809
0
      case OP_NOT_HSPACE:
810
0
      case OP_NOT_VSPACE:
811
0
      case OP_NRREF:
812
0
      case OP_PROP:
813
0
      case OP_PRUNE:
814
0
      case OP_PRUNE_ARG:
815
0
      case OP_RECURSE:
816
0
      case OP_REF:
817
0
      case OP_REFI:
818
0
      case OP_REVERSE:
819
0
      case OP_RREF:
820
0
      case OP_SCOND:
821
0
      case OP_SET_SOM:
822
0
      case OP_SKIP:
823
0
      case OP_SKIP_ARG:
824
0
      case OP_SOD:
825
0
      case OP_SOM:
826
0
      case OP_THEN:
827
0
      case OP_THEN_ARG:
828
0
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
829
0
      case OP_XCLASS:
830
0
#endif
831
0
      return SSB_FAIL;
832
833
      /* We can ignore word boundary tests. */
834
835
0
      case OP_WORD_BOUNDARY:
836
0
      case OP_NOT_WORD_BOUNDARY:
837
0
      tcode++;
838
0
      break;
839
840
      /* If we hit a bracket or a positive lookahead assertion, recurse to set
841
      bits from within the subpattern. If it can't find anything, we have to
842
      give up. If it finds some mandatory character(s), we are done for this
843
      branch. Otherwise, carry on scanning after the subpattern. */
844
845
0
      case OP_BRA:
846
0
      case OP_SBRA:
847
0
      case OP_CBRA:
848
0
      case OP_SCBRA:
849
0
      case OP_BRAPOS:
850
0
      case OP_SBRAPOS:
851
0
      case OP_CBRAPOS:
852
0
      case OP_SCBRAPOS:
853
0
      case OP_ONCE:
854
0
      case OP_ONCE_NC:
855
0
      case OP_ASSERT:
856
0
      rc = set_start_bits(tcode, start_bits, utf, cd);
857
0
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
858
0
      if (rc == SSB_DONE) try_next = FALSE; else
859
0
        {
860
0
        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
861
0
        tcode += 1 + LINK_SIZE;
862
0
        }
863
0
      break;
864
865
      /* If we hit ALT or KET, it means we haven't found anything mandatory in
866
      this branch, though we might have found something optional. For ALT, we
867
      continue with the next alternative, but we have to arrange that the final
868
      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
869
      return SSB_CONTINUE: if this is the top level, that indicates failure,
870
      but after a nested subpattern, it causes scanning to continue. */
871
872
0
      case OP_ALT:
873
0
      yield = SSB_CONTINUE;
874
0
      try_next = FALSE;
875
0
      break;
876
877
0
      case OP_KET:
878
0
      case OP_KETRMAX:
879
0
      case OP_KETRMIN:
880
0
      case OP_KETRPOS:
881
0
      return SSB_CONTINUE;
882
883
      /* Skip over callout */
884
885
0
      case OP_CALLOUT:
886
0
      tcode += 2 + 2*LINK_SIZE;
887
0
      break;
888
889
      /* Skip over lookbehind and negative lookahead assertions */
890
891
0
      case OP_ASSERT_NOT:
892
0
      case OP_ASSERTBACK:
893
0
      case OP_ASSERTBACK_NOT:
894
0
      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
895
0
      tcode += 1 + LINK_SIZE;
896
0
      break;
897
898
      /* BRAZERO does the bracket, but carries on. */
899
900
0
      case OP_BRAZERO:
901
0
      case OP_BRAMINZERO:
902
0
      case OP_BRAPOSZERO:
903
0
      rc = set_start_bits(++tcode, start_bits, utf, cd);
904
0
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
905
/* =========================================================================
906
      See the comment at the head of this function concerning the next line,
907
      which was an old fudge for the benefit of OS/2.
908
      dummy = 1;
909
  ========================================================================= */
910
0
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
911
0
      tcode += 1 + LINK_SIZE;
912
0
      break;
913
914
      /* SKIPZERO skips the bracket. */
915
916
0
      case OP_SKIPZERO:
917
0
      tcode++;
918
0
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
919
0
      tcode += 1 + LINK_SIZE;
920
0
      break;
921
922
      /* Single-char * or ? sets the bit and tries the next item */
923
924
0
      case OP_STAR:
925
0
      case OP_MINSTAR:
926
0
      case OP_POSSTAR:
927
0
      case OP_QUERY:
928
0
      case OP_MINQUERY:
929
0
      case OP_POSQUERY:
930
0
      tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
931
0
      break;
932
933
0
      case OP_STARI:
934
0
      case OP_MINSTARI:
935
0
      case OP_POSSTARI:
936
0
      case OP_QUERYI:
937
0
      case OP_MINQUERYI:
938
0
      case OP_POSQUERYI:
939
0
      tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
940
0
      break;
941
942
      /* Single-char upto sets the bit and tries the next */
943
944
0
      case OP_UPTO:
945
0
      case OP_MINUPTO:
946
0
      case OP_POSUPTO:
947
0
      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
948
0
      break;
949
950
0
      case OP_UPTOI:
951
0
      case OP_MINUPTOI:
952
0
      case OP_POSUPTOI:
953
0
      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
954
0
      break;
955
956
      /* At least one single char sets the bit and stops */
957
958
0
      case OP_EXACT:
959
0
      tcode += IMM2_SIZE;
960
      /* Fall through */
961
0
      case OP_CHAR:
962
0
      case OP_PLUS:
963
0
      case OP_MINPLUS:
964
0
      case OP_POSPLUS:
965
0
      (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
966
0
      try_next = FALSE;
967
0
      break;
968
969
0
      case OP_EXACTI:
970
0
      tcode += IMM2_SIZE;
971
      /* Fall through */
972
0
      case OP_CHARI:
973
0
      case OP_PLUSI:
974
0
      case OP_MINPLUSI:
975
0
      case OP_POSPLUSI:
976
0
      (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
977
0
      try_next = FALSE;
978
0
      break;
979
980
      /* Special spacing and line-terminating items. These recognize specific
981
      lists of characters. The difference between VSPACE and ANYNL is that the
982
      latter can match the two-character CRLF sequence, but that is not
983
      relevant for finding the first character, so their code here is
984
      identical. */
985
986
0
      case OP_HSPACE:
987
0
      SET_BIT(0x09);
988
0
      SET_BIT(0x20);
989
0
#ifdef SUPPORT_UTF
990
0
      if (utf)
991
0
        {
992
0
#ifdef COMPILE_PCRE8
993
0
        SET_BIT(0xC2);  /* For U+00A0 */
994
0
        SET_BIT(0xE1);  /* For U+1680, U+180E */
995
0
        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
996
0
        SET_BIT(0xE3);  /* For U+3000 */
997
0
#endif
998
#ifdef COMPILE_PCRE16
999
        SET_BIT(0xA0);
1000
        SET_BIT(0xFF);  /* For characters > 255 */
1001
#endif
1002
0
        }
1003
0
      else
1004
0
#endif /* SUPPORT_UTF */
1005
0
        {
1006
0
        SET_BIT(0xA0);
1007
#ifdef COMPILE_PCRE16
1008
        SET_BIT(0xFF);  /* For characters > 255 */
1009
#endif
1010
0
        }
1011
0
      try_next = FALSE;
1012
0
      break;
1013
1014
0
      case OP_ANYNL:
1015
0
      case OP_VSPACE:
1016
0
      SET_BIT(0x0A);
1017
0
      SET_BIT(0x0B);
1018
0
      SET_BIT(0x0C);
1019
0
      SET_BIT(0x0D);
1020
0
#ifdef SUPPORT_UTF
1021
0
      if (utf)
1022
0
        {
1023
0
#ifdef COMPILE_PCRE8
1024
0
        SET_BIT(0xC2);  /* For U+0085 */
1025
0
        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1026
0
#endif
1027
#ifdef COMPILE_PCRE16
1028
        SET_BIT(0x85);
1029
        SET_BIT(0xFF);  /* For characters > 255 */
1030
#endif
1031
0
        }
1032
0
      else
1033
0
#endif /* SUPPORT_UTF */
1034
0
        {
1035
0
        SET_BIT(0x85);
1036
#ifdef COMPILE_PCRE16
1037
        SET_BIT(0xFF);  /* For characters > 255 */
1038
#endif
1039
0
        }
1040
0
      try_next = FALSE;
1041
0
      break;
1042
1043
      /* Single character types set the bits and stop. Note that if PCRE_UCP
1044
      is set, we do not see these op codes because \d etc are converted to
1045
      properties. Therefore, these apply in the case when only characters less
1046
      than 256 are recognized to match the types. */
1047
1048
0
      case OP_NOT_DIGIT:
1049
0
      set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1050
0
      try_next = FALSE;
1051
0
      break;
1052
1053
0
      case OP_DIGIT:
1054
0
      set_type_bits(start_bits, cbit_digit, table_limit, cd);
1055
0
      try_next = FALSE;
1056
0
      break;
1057
1058
      /* The cbit_space table has vertical tab as whitespace; we have to
1059
      ensure it is set as not whitespace. */
1060
1061
0
      case OP_NOT_WHITESPACE:
1062
0
      set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1063
0
      start_bits[1] |= 0x08;
1064
0
      try_next = FALSE;
1065
0
      break;
1066
1067
      /* The cbit_space table has vertical tab as whitespace; we have to
1068
      not set it from the table. */
1069
1070
0
      case OP_WHITESPACE:
1071
0
      c = start_bits[1];    /* Save in case it was already set */
1072
0
      set_type_bits(start_bits, cbit_space, table_limit, cd);
1073
0
      start_bits[1] = (start_bits[1] & ~0x08) | c;
1074
0
      try_next = FALSE;
1075
0
      break;
1076
1077
0
      case OP_NOT_WORDCHAR:
1078
0
      set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1079
0
      try_next = FALSE;
1080
0
      break;
1081
1082
0
      case OP_WORDCHAR:
1083
0
      set_type_bits(start_bits, cbit_word, table_limit, cd);
1084
0
      try_next = FALSE;
1085
0
      break;
1086
1087
      /* One or more character type fudges the pointer and restarts, knowing
1088
      it will hit a single character type and stop there. */
1089
1090
0
      case OP_TYPEPLUS:
1091
0
      case OP_TYPEMINPLUS:
1092
0
      case OP_TYPEPOSPLUS:
1093
0
      tcode++;
1094
0
      break;
1095
1096
0
      case OP_TYPEEXACT:
1097
0
      tcode += 1 + IMM2_SIZE;
1098
0
      break;
1099
1100
      /* Zero or more repeats of character types set the bits and then
1101
      try again. */
1102
1103
0
      case OP_TYPEUPTO:
1104
0
      case OP_TYPEMINUPTO:
1105
0
      case OP_TYPEPOSUPTO:
1106
0
      tcode += IMM2_SIZE;  /* Fall through */
1107
1108
0
      case OP_TYPESTAR:
1109
0
      case OP_TYPEMINSTAR:
1110
0
      case OP_TYPEPOSSTAR:
1111
0
      case OP_TYPEQUERY:
1112
0
      case OP_TYPEMINQUERY:
1113
0
      case OP_TYPEPOSQUERY:
1114
0
      switch(tcode[1])
1115
0
        {
1116
0
        default:
1117
0
        case OP_ANY:
1118
0
        case OP_ALLANY:
1119
0
        return SSB_FAIL;
1120
1121
0
        case OP_HSPACE:
1122
0
        SET_BIT(0x09);
1123
0
        SET_BIT(0x20);
1124
0
#ifdef SUPPORT_UTF
1125
0
        if (utf)
1126
0
          {
1127
0
#ifdef COMPILE_PCRE8
1128
0
          SET_BIT(0xC2);  /* For U+00A0 */
1129
0
          SET_BIT(0xE1);  /* For U+1680, U+180E */
1130
0
          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1131
0
          SET_BIT(0xE3);  /* For U+3000 */
1132
0
#endif
1133
#ifdef COMPILE_PCRE16
1134
          SET_BIT(0xA0);
1135
          SET_BIT(0xFF);  /* For characters > 255 */
1136
#endif
1137
0
          }
1138
0
        else
1139
0
#endif /* SUPPORT_UTF */
1140
0
          SET_BIT(0xA0);
1141
0
        break;
1142
1143
0
        case OP_ANYNL:
1144
0
        case OP_VSPACE:
1145
0
        SET_BIT(0x0A);
1146
0
        SET_BIT(0x0B);
1147
0
        SET_BIT(0x0C);
1148
0
        SET_BIT(0x0D);
1149
0
#ifdef SUPPORT_UTF
1150
0
        if (utf)
1151
0
          {
1152
0
#ifdef COMPILE_PCRE8
1153
0
          SET_BIT(0xC2);  /* For U+0085 */
1154
0
          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1155
0
#endif
1156
#ifdef COMPILE_PCRE16
1157
          SET_BIT(0x85);
1158
          SET_BIT(0xFF);  /* For characters > 255 */
1159
#endif
1160
0
          }
1161
0
        else
1162
0
#endif /* SUPPORT_UTF */
1163
0
          SET_BIT(0x85);
1164
0
        break;
1165
1166
0
        case OP_NOT_DIGIT:
1167
0
        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1168
0
        break;
1169
1170
0
        case OP_DIGIT:
1171
0
        set_type_bits(start_bits, cbit_digit, table_limit, cd);
1172
0
        break;
1173
1174
        /* The cbit_space table has vertical tab as whitespace; we have to
1175
        ensure it gets set as not whitespace. */
1176
1177
0
        case OP_NOT_WHITESPACE:
1178
0
        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1179
0
        start_bits[1] |= 0x08;
1180
0
        break;
1181
1182
        /* The cbit_space table has vertical tab as whitespace; we have to
1183
        avoid setting it. */
1184
1185
0
        case OP_WHITESPACE:
1186
0
        c = start_bits[1];    /* Save in case it was already set */
1187
0
        set_type_bits(start_bits, cbit_space, table_limit, cd);
1188
0
        start_bits[1] = (start_bits[1] & ~0x08) | c;
1189
0
        break;
1190
1191
0
        case OP_NOT_WORDCHAR:
1192
0
        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1193
0
        break;
1194
1195
0
        case OP_WORDCHAR:
1196
0
        set_type_bits(start_bits, cbit_word, table_limit, cd);
1197
0
        break;
1198
0
        }
1199
1200
0
      tcode += 2;
1201
0
      break;
1202
1203
      /* Character class where all the information is in a bit map: set the
1204
      bits and either carry on or not, according to the repeat count. If it was
1205
      a negative class, and we are operating with UTF-8 characters, any byte
1206
      with a value >= 0xc4 is a potentially valid starter because it starts a
1207
      character with a value > 255. */
1208
1209
0
      case OP_NCLASS:
1210
0
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
1211
0
      if (utf)
1212
0
        {
1213
0
        start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1214
0
        memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1215
0
        }
1216
0
#endif
1217
#ifdef COMPILE_PCRE16
1218
      SET_BIT(0xFF);                         /* For characters > 255 */
1219
#endif
1220
      /* Fall through */
1221
1222
0
      case OP_CLASS:
1223
0
        {
1224
0
        pcre_uint8 *map;
1225
0
        tcode++;
1226
0
        map = (pcre_uint8 *)tcode;
1227
1228
        /* In UTF-8 mode, the bits in a bit map correspond to character
1229
        values, not to byte values. However, the bit map we are constructing is
1230
        for byte values. So we have to do a conversion for characters whose
1231
        value is > 127. In fact, there are only two possible starting bytes for
1232
        characters in the range 128 - 255. */
1233
1234
0
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
1235
0
        if (utf)
1236
0
          {
1237
0
          for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1238
0
          for (c = 128; c < 256; c++)
1239
0
            {
1240
0
            if ((map[c/8] && (1 << (c&7))) != 0)
1241
0
              {
1242
0
              int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1243
0
              start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1244
0
              c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1245
0
              }
1246
0
            }
1247
0
          }
1248
0
        else
1249
0
#endif
1250
0
          {
1251
          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1252
0
          for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1253
0
          }
1254
1255
        /* Advance past the bit map, and act on what follows. For a zero
1256
        minimum repeat, continue; otherwise stop processing. */
1257
1258
0
        tcode += 32 / sizeof(pcre_uchar);
1259
0
        switch (*tcode)
1260
0
          {
1261
0
          case OP_CRSTAR:
1262
0
          case OP_CRMINSTAR:
1263
0
          case OP_CRQUERY:
1264
0
          case OP_CRMINQUERY:
1265
0
          tcode++;
1266
0
          break;
1267
1268
0
          case OP_CRRANGE:
1269
0
          case OP_CRMINRANGE:
1270
0
          if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1271
0
            else try_next = FALSE;
1272
0
          break;
1273
1274
0
          default:
1275
0
          try_next = FALSE;
1276
0
          break;
1277
0
          }
1278
0
        }
1279
0
      break; /* End of bitmap class handling */
1280
1281
0
      }      /* End of switch */
1282
0
    }        /* End of try_next loop */
1283
1284
0
  code += GET(code, 1);   /* Advance to next branch */
1285
0
  }
1286
0
while (*code == OP_ALT);
1287
0
return yield;
1288
0
}
1289
1290
1291
1292
1293
1294
/*************************************************
1295
*          Study a compiled expression           *
1296
*************************************************/
1297
1298
/* This function is handed a compiled expression that it must study to produce
1299
information that will speed up the matching. It returns a pcre[16]_extra block
1300
which then gets handed back to pcre_exec().
1301
1302
Arguments:
1303
  re        points to the compiled expression
1304
  options   contains option bits
1305
  errorptr  points to where to place error messages;
1306
            set NULL unless error
1307
1308
Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1309
              the appropriate flags set;
1310
            NULL on error or if no optimization possible
1311
*/
1312
1313
#ifdef COMPILE_PCRE8
1314
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1315
pcre_study(const pcre *external_re, int options, const char **errorptr)
1316
#else
1317
PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1318
pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1319
#endif
1320
0
{
1321
0
int min;
1322
0
BOOL bits_set = FALSE;
1323
0
pcre_uint8 start_bits[32];
1324
0
PUBL(extra) *extra = NULL;
1325
0
pcre_study_data *study;
1326
0
const pcre_uint8 *tables;
1327
0
pcre_uchar *code;
1328
0
compile_data compile_block;
1329
0
const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1330
1331
0
*errorptr = NULL;
1332
1333
0
if (re == NULL || re->magic_number != MAGIC_NUMBER)
1334
0
  {
1335
0
  *errorptr = "argument is not a compiled regular expression";
1336
0
  return NULL;
1337
0
  }
1338
1339
0
if ((re->flags & PCRE_MODE) == 0)
1340
0
  {
1341
0
#ifdef COMPILE_PCRE8
1342
0
  *errorptr = "argument is compiled in 16 bit mode";
1343
#else
1344
  *errorptr = "argument is compiled in 8 bit mode";
1345
#endif
1346
0
  return NULL;
1347
0
  }
1348
1349
0
if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1350
0
  {
1351
0
  *errorptr = "unknown or incorrect option bit(s) set";
1352
0
  return NULL;
1353
0
  }
1354
1355
0
code = (pcre_uchar *)re + re->name_table_offset +
1356
0
  (re->name_count * re->name_entry_size);
1357
1358
/* For an anchored pattern, or an unanchored pattern that has a first char, or
1359
a multiline pattern that matches only at "line starts", there is no point in
1360
seeking a list of starting bytes. */
1361
1362
0
if ((re->options & PCRE_ANCHORED) == 0 &&
1363
0
    (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1364
0
  {
1365
0
  int rc;
1366
1367
  /* Set the character tables in the block that is passed around */
1368
1369
0
  tables = re->tables;
1370
1371
0
#ifdef COMPILE_PCRE8
1372
0
  if (tables == NULL)
1373
0
    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1374
0
    (void *)(&tables));
1375
#else
1376
  if (tables == NULL)
1377
    (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1378
    (void *)(&tables));
1379
#endif
1380
1381
0
  compile_block.lcc = tables + lcc_offset;
1382
0
  compile_block.fcc = tables + fcc_offset;
1383
0
  compile_block.cbits = tables + cbits_offset;
1384
0
  compile_block.ctypes = tables + ctypes_offset;
1385
1386
  /* See if we can find a fixed set of initial characters for the pattern. */
1387
1388
0
  memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1389
0
  rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1390
0
    &compile_block);
1391
0
  bits_set = rc == SSB_DONE;
1392
0
  if (rc == SSB_UNKNOWN)
1393
0
    {
1394
0
    *errorptr = "internal error: opcode not recognized";
1395
0
    return NULL;
1396
0
    }
1397
0
  }
1398
1399
/* Find the minimum length of subject string. */
1400
1401
0
switch(min = find_minlength(code, code, re->options, 0))
1402
0
  {
1403
0
  case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1404
0
  case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1405
0
  default: break;
1406
0
  }
1407
1408
/* If a set of starting bytes has been identified, or if the minimum length is
1409
greater than zero, or if JIT optimization has been requested, get a
1410
pcre[16]_extra block and a pcre_study_data block. The study data is put in the
1411
latter, which is pointed to by the former, which may also get additional data
1412
set later by the calling program. At the moment, the size of pcre_study_data
1413
is fixed. We nevertheless save it in a field for returning via the
1414
pcre_fullinfo() function so that if it becomes variable in the future,
1415
we don't have to change that code. */
1416
1417
0
if (bits_set || min > 0
1418
#ifdef SUPPORT_JIT
1419
    || (options & (PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE
1420
                 | PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE)) != 0
1421
#endif
1422
0
  )
1423
0
  {
1424
0
  extra = (PUBL(extra) *)(PUBL(malloc))
1425
0
    (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1426
0
  if (extra == NULL)
1427
0
    {
1428
0
    *errorptr = "failed to get memory";
1429
0
    return NULL;
1430
0
    }
1431
1432
0
  study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1433
0
  extra->flags = PCRE_EXTRA_STUDY_DATA;
1434
0
  extra->study_data = study;
1435
1436
0
  study->size = sizeof(pcre_study_data);
1437
0
  study->flags = 0;
1438
1439
  /* Set the start bits always, to avoid unset memory errors if the
1440
  study data is written to a file, but set the flag only if any of the bits
1441
  are set, to save time looking when none are. */
1442
1443
0
  if (bits_set)
1444
0
    {
1445
0
    study->flags |= PCRE_STUDY_MAPPED;
1446
0
    memcpy(study->start_bits, start_bits, sizeof(start_bits));
1447
0
    }
1448
0
  else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1449
1450
#ifdef PCRE_DEBUG
1451
  if (bits_set)
1452
    {
1453
    pcre_uint8 *ptr = start_bits;
1454
    int i;
1455
1456
    printf("Start bits:\n");
1457
    for (i = 0; i < 32; i++)
1458
      printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1459
    }
1460
#endif
1461
1462
  /* Always set the minlength value in the block, because the JIT compiler
1463
  makes use of it. However, don't set the bit unless the length is greater than
1464
  zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1465
  checking the zero case. */
1466
1467
0
  if (min > 0)
1468
0
    {
1469
0
    study->flags |= PCRE_STUDY_MINLEN;
1470
0
    study->minlength = min;
1471
0
    }
1472
0
  else study->minlength = 0;
1473
1474
  /* If JIT support was compiled and requested, attempt the JIT compilation.
1475
  If no starting bytes were found, and the minimum length is zero, and JIT
1476
  compilation fails, abandon the extra block and return NULL. */
1477
1478
#ifdef SUPPORT_JIT
1479
  extra->executable_jit = NULL;
1480
  if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1481
    PRIV(jit_compile)(re, extra, JIT_COMPILE);
1482
  if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1483
    PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1484
  if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1485
    PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1486
1487
  if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
1488
    {
1489
#ifdef COMPILE_PCRE8
1490
    pcre_free_study(extra);
1491
#endif
1492
#ifdef COMPILE_PCRE16
1493
    pcre16_free_study(extra);
1494
#endif
1495
    extra = NULL;
1496
    }
1497
#endif
1498
0
  }
1499
1500
0
return extra;
1501
0
}
1502
1503
1504
/*************************************************
1505
*          Free the study data                   *
1506
*************************************************/
1507
1508
/* This function frees the memory that was obtained by pcre_study().
1509
1510
Argument:   a pointer to the pcre[16]_extra block
1511
Returns:    nothing
1512
*/
1513
1514
#ifdef COMPILE_PCRE8
1515
PCRE_EXP_DEFN void
1516
pcre_free_study(pcre_extra *extra)
1517
#else
1518
PCRE_EXP_DEFN void
1519
pcre16_free_study(pcre16_extra *extra)
1520
#endif
1521
0
{
1522
0
if (extra == NULL)
1523
0
  return;
1524
#ifdef SUPPORT_JIT
1525
if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1526
     extra->executable_jit != NULL)
1527
  PRIV(jit_free)(extra->executable_jit);
1528
#endif
1529
0
PUBL(free)(extra);
1530
0
}
1531
1532
/* End of pcre_study.c */