Coverage Report

Created: 2025-09-27 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/pcre/pcre2lib/pcre2_auto_possess.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
/* This module contains functions that scan a compiled pattern and change
42
repeats into possessive repeats where possible. */
43
44
45
#ifdef HAVE_CONFIG_H
46
#include "config.h"
47
#endif
48
49
50
#include "pcre2_internal.h"
51
52
/* This macro represents the max size of list[] and that is used to keep
53
track of UCD info in several places, it should be kept on sync with the
54
value used by GenerateUcd.py */
55
340
#define MAX_LIST 8
56
57
/*************************************************
58
*        Tables for auto-possessification        *
59
*************************************************/
60
61
/* This table is used to check whether auto-possessification is possible
62
between adjacent character-type opcodes. The left-hand (repeated) opcode is
63
used to select the row, and the right-hand opcode is use to select the column.
64
A value of 1 means that auto-possessification is OK. For example, the second
65
value in the first row means that \D+\d can be turned into \D++\d.
66
67
The Unicode property types (\P and \p) have to be present to fill out the table
68
because of what their opcode values are, but the table values should always be
69
zero because property types are handled separately in the code. The last four
70
columns apply to items that cannot be repeated, so there is no need to have
71
rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is
72
*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
73
74
#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
75
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
76
77
static const uint8_t autoposstab[APTROWS][APTCOLS] = {
78
/* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
79
  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
80
  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
81
  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
82
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
83
  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
84
  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
85
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
86
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
87
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
88
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
89
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
90
  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
91
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
92
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
93
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
94
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
95
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
96
};
97
98
#ifdef SUPPORT_UNICODE
99
/* This table is used to check whether auto-possessification is possible
100
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
101
left-hand (repeated) opcode is used to select the row, and the right-hand
102
opcode is used to select the column. The values are as follows:
103
104
  0   Always return FALSE (never auto-possessify)
105
  1   Character groups are distinct (possessify if both are OP_PROP)
106
  2   Check character categories in the same group (general or particular)
107
  3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
108
109
  4   Check left general category vs right particular category
110
  5   Check right general category vs left particular category
111
112
  6   Left alphanum vs right general category
113
  7   Left space vs right general category
114
  8   Left word vs right general category
115
116
  9   Right alphanum vs left general category
117
 10   Right space vs left general category
118
 11   Right word vs left general category
119
120
 12   Left alphanum vs right particular category
121
 13   Left space vs right particular category
122
 14   Left word vs right particular category
123
124
 15   Right alphanum vs left particular category
125
 16   Right space vs left particular category
126
 17   Right word vs left particular category
127
*/
128
129
static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
130
/* LAMP GC  PC  SC  SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
131
  { 3,  0,  0,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_LAMP */
132
  { 0,  2,  4,  0,   0,    9,   10,     10,  11,    0,   0,    0,    0 },  /* PT_GC */
133
  { 0,  5,  2,  0,   0,   15,   16,     16,  17,    0,   0,    0,    0 },  /* PT_PC */
134
  { 0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SC */
135
  { 0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SCX */
136
  { 3,  6, 12,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_ALNUM */
137
  { 1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_SPACE */
138
  { 1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_PXSPACE */
139
  { 0,  8, 14,  0,   0,    0,    1,      1,   3,    0,   0,    0,    0 },  /* PT_WORD */
140
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_CLIST */
141
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   3,    0,    0 },  /* PT_UCNC */
142
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_BIDICL */
143
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 }   /* PT_BOOL */
144
  /* PT_ANY does not need a record. */
145
};
146
147
/* This table is used to check whether auto-possessification is possible
148
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
149
specifies a general category and the other specifies a particular category. The
150
row is selected by the general category and the column by the particular
151
category. The value is 1 if the particular category is not part of the general
152
category. */
153
154
static const uint8_t catposstab[7][30] = {
155
/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
156
  { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
157
  { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
158
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
159
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
160
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
161
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
162
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
163
};
164
165
/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
166
a general or particular category. The properties in each row are those
167
that apply to the character set in question. Duplication means that a little
168
unnecessary work is done when checking, but this keeps things much simpler
169
because they can all use the same code. For more details see the comment where
170
this table is used.
171
172
Note: SPACE and PXSPACE used to be different because Perl excluded VT from
173
"space", but from Perl 5.18 it's included, so both categories are treated the
174
same here. */
175
176
static const uint8_t posspropstab[3][4] = {
177
  { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
178
  { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
179
  { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
180
};
181
#endif  /* SUPPORT_UNICODE */
182
183
184
185
#ifdef SUPPORT_UNICODE
186
/*************************************************
187
*        Check a character and a property        *
188
*************************************************/
189
190
/* This function is called by compare_opcodes() when a property item is
191
adjacent to a fixed character.
192
193
Arguments:
194
  c            the character
195
  ptype        the property type
196
  pdata        the data for the type
197
  negated      TRUE if it's a negated property (\P or \p{^)
198
199
Returns:       TRUE if auto-possessifying is OK
200
*/
201
202
static BOOL
203
check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
204
  BOOL negated)
205
351
{
206
351
BOOL ok, rc;
207
351
const uint32_t *p;
208
351
const ucd_record *prop = GET_UCD(c);
209
210
351
switch(ptype)
211
351
  {
212
0
  case PT_LAMP:
213
0
  return (prop->chartype == ucp_Lu ||
214
0
          prop->chartype == ucp_Ll ||
215
0
          prop->chartype == ucp_Lt) == negated;
216
217
32
  case PT_GC:
218
32
  return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
219
220
37
  case PT_PC:
221
37
  return (pdata == prop->chartype) == negated;
222
223
0
  case PT_SC:
224
0
  return (pdata == prop->script) == negated;
225
226
0
  case PT_SCX:
227
0
  ok = (pdata == prop->script
228
0
        || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
229
0
  return ok == negated;
230
231
  /* These are specials */
232
233
0
  case PT_ALNUM:
234
0
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
235
0
          PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
236
237
  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
238
  means that Perl space and POSIX space are now identical. PCRE was changed
239
  at release 8.34. */
240
241
31
  case PT_SPACE:    /* Perl space */
242
31
  case PT_PXSPACE:  /* POSIX space */
243
31
  switch(c)
244
31
    {
245
18
    HSPACE_CASES:
246
18
    VSPACE_CASES:
247
1
    rc = negated;
248
1
    break;
249
250
30
    default:
251
30
    rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
252
31
    }
253
31
  return rc;
254
255
251
  case PT_WORD:
256
251
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
257
170
          PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
258
161
          c == CHAR_UNDERSCORE) == negated;
259
260
0
  case PT_CLIST:
261
0
  p = PRIV(ucd_caseless_sets) + prop->caseset;
262
0
  for (;;)
263
0
    {
264
0
    if (c < *p) return !negated;
265
0
    if (c == *p++) return negated;
266
0
    }
267
0
  PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
268
0
  break;
269
270
  /* Haven't yet thought these through. */
271
272
0
  case PT_BIDICL:
273
0
  return FALSE;
274
275
0
  case PT_BOOL:
276
0
  return FALSE;
277
351
  }
278
279
0
return FALSE;
280
351
}
281
#endif  /* SUPPORT_UNICODE */
282
283
284
285
/*************************************************
286
*        Base opcode of repeated opcodes         *
287
*************************************************/
288
289
/* Returns the base opcode for repeated single character type opcodes. If the
290
opcode is not a repeated character type, it returns with the original value.
291
292
Arguments:  c opcode
293
Returns:    base opcode for the type
294
*/
295
296
static PCRE2_UCHAR
297
get_repeat_base(PCRE2_UCHAR c)
298
44.2k
{
299
44.2k
return (c > OP_TYPEPOSUPTO)? c :
300
44.2k
       (c >= OP_TYPESTAR)?   OP_TYPESTAR :
301
44.2k
       (c >= OP_NOTSTARI)?   OP_NOTSTARI :
302
28.3k
       (c >= OP_NOTSTAR)?    OP_NOTSTAR :
303
27.6k
       (c >= OP_STARI)?      OP_STARI :
304
27.4k
                             OP_STAR;
305
44.2k
}
306
307
308
/*************************************************
309
*        Fill the character property list        *
310
*************************************************/
311
312
/* Checks whether the code points to an opcode that can take part in auto-
313
possessification, and if so, fills a list with its properties.
314
315
Arguments:
316
  code        points to start of expression
317
  utf         TRUE if in UTF mode
318
  ucp         TRUE if in UCP mode
319
  fcc         points to the case-flipping table
320
  list        points to output list
321
              list[0] will be filled with the opcode
322
              list[1] will be non-zero if this opcode
323
                can match an empty character string
324
              list[2..7] depends on the opcode
325
326
Returns:      points to the start of the next opcode if *code is accepted
327
              NULL if *code is not accepted
328
*/
329
330
static PCRE2_SPTR
331
get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
332
  uint32_t *list)
333
44.7k
{
334
44.7k
PCRE2_UCHAR c = *code;
335
44.7k
PCRE2_UCHAR base;
336
44.7k
PCRE2_SPTR end;
337
44.7k
PCRE2_SPTR class_end;
338
44.7k
uint32_t chr;
339
340
44.7k
#ifdef SUPPORT_UNICODE
341
44.7k
uint32_t *clist_dest;
342
44.7k
const uint32_t *clist_src;
343
#else
344
(void)utf;    /* Suppress "unused parameter" compiler warnings */
345
(void)ucp;
346
#endif
347
348
44.7k
list[0] = c;
349
44.7k
list[1] = FALSE;
350
44.7k
code++;
351
352
44.7k
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
353
25.6k
  {
354
25.6k
  base = get_repeat_base(c);
355
25.6k
  c -= (base - OP_STAR);
356
357
25.6k
  if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
358
0
    code += IMM2_SIZE;
359
360
25.6k
  list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
361
17.8k
             c != OP_POSPLUS);
362
363
25.6k
  switch(base)
364
25.6k
    {
365
10.5k
    case OP_STAR:
366
10.5k
    list[0] = OP_CHAR;
367
10.5k
    break;
368
369
5.18k
    case OP_STARI:
370
5.18k
    list[0] = OP_CHARI;
371
5.18k
    break;
372
373
122
    case OP_NOTSTAR:
374
122
    list[0] = OP_NOT;
375
122
    break;
376
377
442
    case OP_NOTSTARI:
378
442
    list[0] = OP_NOTI;
379
442
    break;
380
381
9.28k
    case OP_TYPESTAR:
382
9.28k
    list[0] = *code;
383
9.28k
    code++;
384
9.28k
    break;
385
25.6k
    }
386
25.6k
  c = list[0];
387
25.6k
  }
388
389
44.7k
switch(c)
390
44.7k
  {
391
81
  case OP_NOT_DIGIT:
392
867
  case OP_DIGIT:
393
938
  case OP_NOT_WHITESPACE:
394
1.54k
  case OP_WHITESPACE:
395
1.85k
  case OP_NOT_WORDCHAR:
396
5.16k
  case OP_WORDCHAR:
397
6.48k
  case OP_ANY:
398
6.82k
  case OP_ALLANY:
399
10.3k
  case OP_ANYNL:
400
10.7k
  case OP_NOT_HSPACE:
401
10.7k
  case OP_HSPACE:
402
11.3k
  case OP_NOT_VSPACE:
403
11.5k
  case OP_VSPACE:
404
11.6k
  case OP_EXTUNI:
405
11.6k
  case OP_EODN:
406
11.6k
  case OP_EOD:
407
11.7k
  case OP_DOLL:
408
11.7k
  case OP_DOLLM:
409
11.7k
  return code;
410
411
18.5k
  case OP_CHAR:
412
18.6k
  case OP_NOT:
413
18.6k
  GETCHARINCTEST(chr, code);
414
18.6k
  list[2] = chr;
415
18.6k
  list[3] = NOTACHAR;
416
18.6k
  return code;
417
418
8.34k
  case OP_CHARI:
419
8.79k
  case OP_NOTI:
420
8.79k
  list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
421
8.79k
  GETCHARINCTEST(chr, code);
422
8.79k
  list[2] = chr;
423
424
8.79k
#ifdef SUPPORT_UNICODE
425
8.79k
  if (chr < 128 || (chr < 256 && !utf && !ucp))
426
8.72k
    list[3] = fcc[chr];
427
62
  else
428
62
    list[3] = UCD_OTHERCASE(chr);
429
#elif defined SUPPORT_WIDE_CHARS
430
  list[3] = (chr < 256) ? fcc[chr] : chr;
431
#else
432
  list[3] = fcc[chr];
433
#endif
434
435
  /* The othercase might be the same value. */
436
437
8.79k
  if (chr == list[3])
438
5.54k
    list[3] = NOTACHAR;
439
3.24k
  else
440
3.24k
    list[4] = NOTACHAR;
441
8.79k
  return code;
442
443
0
#ifdef SUPPORT_UNICODE
444
492
  case OP_PROP:
445
694
  case OP_NOTPROP:
446
694
  if (code[0] != PT_CLIST)
447
609
    {
448
609
    list[2] = code[0];
449
609
    list[3] = code[1];
450
609
    return code + 2;
451
609
    }
452
453
  /* Convert only if we have enough space. */
454
455
85
  clist_src = PRIV(ucd_caseless_sets) + code[1];
456
85
  clist_dest = list + 2;
457
85
  code += 2;
458
459
340
  do {
460
340
     if (clist_dest >= list + MAX_LIST)
461
0
       {
462
       /* Early return if there is not enough space. GenerateUcd.py
463
       generated a list with more than 5 characters and something
464
       must be done about that going forward. */
465
0
       PCRE2_DEBUG_UNREACHABLE();   /* Remove if it ever triggers */
466
0
       list[2] = code[0];
467
0
       list[3] = code[1];
468
0
       return code;
469
0
       }
470
340
     *clist_dest++ = *clist_src;
471
340
     }
472
340
  while(*clist_src++ != NOTACHAR);
473
474
  /* All characters are stored. The terminating NOTACHAR is copied from the
475
  clist itself. */
476
477
85
  list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
478
85
  return code;
479
0
#endif
480
481
768
  case OP_NCLASS:
482
4.09k
  case OP_CLASS:
483
4.09k
#ifdef SUPPORT_WIDE_CHARS
484
4.47k
  case OP_XCLASS:
485
4.47k
  case OP_ECLASS:
486
4.47k
  if (c == OP_XCLASS || c == OP_ECLASS)
487
380
    end = code + GET(code, 0) - 1;
488
4.09k
  else
489
4.09k
#endif
490
4.09k
    end = code + 32 / sizeof(PCRE2_UCHAR);
491
4.47k
  class_end = end;
492
493
4.47k
  switch(*end)
494
4.47k
    {
495
1.95k
    case OP_CRSTAR:
496
2.53k
    case OP_CRMINSTAR:
497
3.10k
    case OP_CRQUERY:
498
3.38k
    case OP_CRMINQUERY:
499
3.41k
    case OP_CRPOSSTAR:
500
3.41k
    case OP_CRPOSQUERY:
501
3.41k
    list[1] = TRUE;
502
3.41k
    end++;
503
3.41k
    break;
504
505
646
    case OP_CRPLUS:
506
746
    case OP_CRMINPLUS:
507
747
    case OP_CRPOSPLUS:
508
747
    end++;
509
747
    break;
510
511
0
    case OP_CRRANGE:
512
0
    case OP_CRMINRANGE:
513
0
    case OP_CRPOSRANGE:
514
0
    list[1] = (GET2(end, 1) == 0);
515
0
    end += 1 + 2 * IMM2_SIZE;
516
0
    break;
517
4.47k
    }
518
4.47k
  list[2] = (uint32_t)(end - code);
519
4.47k
  list[3] = (uint32_t)(end - class_end);
520
4.47k
  return end;
521
44.7k
  }
522
523
378
return NULL;    /* Opcode not accepted */
524
44.7k
}
525
526
527
528
/*************************************************
529
*    Scan further character sets for match       *
530
*************************************************/
531
532
/* Checks whether the base and the current opcode have a common character, in
533
which case the base cannot be possessified.
534
535
Arguments:
536
  code        points to the byte code
537
  utf         TRUE in UTF mode
538
  ucp         TRUE in UCP mode
539
  cb          compile data block
540
  base_list   the data list of the base opcode
541
  base_end    the end of the base opcode
542
  rec_limit   points to recursion depth counter
543
544
Returns:      TRUE if the auto-possessification is possible
545
*/
546
547
static BOOL
548
compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
549
  const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
550
20.2k
{
551
20.2k
PCRE2_UCHAR c;
552
20.2k
uint32_t list[MAX_LIST];
553
20.2k
const uint32_t *chr_ptr;
554
20.2k
const uint32_t *ochr_ptr;
555
20.2k
const uint32_t *list_ptr;
556
20.2k
PCRE2_SPTR next_code;
557
20.2k
#ifdef SUPPORT_WIDE_CHARS
558
20.2k
PCRE2_SPTR xclass_flags;
559
20.2k
#endif
560
20.2k
const uint8_t *class_bitset;
561
20.2k
const uint8_t *set1, *set2, *set_end;
562
20.2k
uint32_t chr;
563
20.2k
BOOL accepted, invert_bits;
564
20.2k
BOOL entered_a_group = FALSE;
565
566
20.2k
if (--(*rec_limit) <= 0) return FALSE;  /* Recursion has gone too deep */
567
568
/* Note: the base_list[1] contains whether the current opcode has a greedy
569
(represented by a non-zero value) quantifier. This is a different from
570
other character type lists, which store here that the character iterator
571
matches to an empty string (also represented by a non-zero value). */
572
573
20.2k
for(;;)
574
24.9k
  {
575
24.9k
  PCRE2_SPTR bracode;
576
577
  /* All operations move the code pointer forward.
578
  Therefore infinite recursions are not possible. */
579
580
24.9k
  c = *code;
581
582
  /* Skip over callouts */
583
584
24.9k
  if (c == OP_CALLOUT)
585
0
    {
586
0
    code += PRIV(OP_lengths)[c];
587
0
    continue;
588
0
    }
589
590
24.9k
  if (c == OP_CALLOUT_STR)
591
0
    {
592
0
    code += GET(code, 1 + 2*LINK_SIZE);
593
0
    continue;
594
0
    }
595
596
  /* At the end of a branch, skip to the end of the group and process it. */
597
598
24.9k
  if (c == OP_ALT)
599
227
    {
600
1.23k
    do code += GET(code, 1); while (*code == OP_ALT);
601
227
    c = *code;
602
227
    }
603
604
  /* Inspect the next opcode. */
605
606
24.9k
  switch(c)
607
24.9k
    {
608
    /* We can always possessify a greedy iterator at the end of the pattern,
609
    which is reached after skipping over the final OP_KET. A non-greedy
610
    iterator must never be possessified. */
611
612
210
    case OP_END:
613
210
    return base_list[1] != 0;
614
615
    /* When an iterator is at the end of certain kinds of group we can inspect
616
    what follows the group by skipping over the closing ket. Note that this
617
    does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
618
    iteration is variable (could be another iteration or could be the next
619
    item). As these two opcodes are not listed in the next switch, they will
620
    end up as the next code to inspect, and return FALSE by virtue of being
621
    unsupported. */
622
623
257
    case OP_KET:
624
257
    case OP_KETRPOS:
625
    /* The non-greedy case cannot be converted to a possessive form. */
626
627
257
    if (base_list[1] == 0) return FALSE;
628
629
    /* If the bracket is capturing it might be referenced by an OP_RECURSE
630
    so its last iterator can never be possessified if the pattern contains
631
    recursions. (This could be improved by keeping a list of group numbers that
632
    are called by recursion.) */
633
634
247
    bracode = code - GET(code, 1);
635
247
    switch(*bracode)
636
247
      {
637
34
      case OP_CBRA:
638
34
      case OP_SCBRA:
639
34
      case OP_CBRAPOS:
640
34
      case OP_SCBRAPOS:
641
34
      if (cb->had_recurse) return FALSE;
642
34
      break;
643
644
      /* A script run might have to backtrack if the iterated item can match
645
      characters from more than one script. So give up unless repeating an
646
      explicit character. */
647
648
34
      case OP_SCRIPT_RUN:
649
0
      if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
650
0
        return FALSE;
651
0
      break;
652
653
      /* Atomic sub-patterns and forward assertions can always auto-possessify
654
      their last iterator. However, if the group was entered as a result of
655
      checking a previous iterator, this is not possible. */
656
657
0
      case OP_ASSERT:
658
0
      case OP_ASSERT_NOT:
659
0
      case OP_ONCE:
660
0
      return !entered_a_group;
661
662
      /* Fixed-length lookbehinds can be treated the same way, but variable
663
      length lookbehinds must not auto-possessify their last iterator. Note
664
      that in order to identify a variable length lookbehind we must check
665
      through all branches, because some may be of fixed length. */
666
667
0
      case OP_ASSERTBACK:
668
0
      case OP_ASSERTBACK_NOT:
669
0
      do
670
0
        {
671
0
        if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE;  /* Variable */
672
0
        bracode += GET(bracode, 1);
673
0
        }
674
0
      while (*bracode == OP_ALT);
675
0
      return !entered_a_group;  /* Not variable length */
676
677
      /* Non-atomic assertions - don't possessify last iterator. This needs
678
      more thought. */
679
680
0
      case OP_ASSERT_NA:
681
0
      case OP_ASSERTBACK_NA:
682
0
      return FALSE;
683
247
      }
684
685
    /* Skip over the bracket and inspect what comes next. */
686
687
247
    code += PRIV(OP_lengths)[c];
688
247
    continue;
689
690
    /* Handle cases where the next item is a group. */
691
692
4
    case OP_ONCE:
693
4
    case OP_BRA:
694
12
    case OP_CBRA:
695
12
    next_code = code + GET(code, 1);
696
12
    code += PRIV(OP_lengths)[c];
697
698
    /* Check each branch. We have to recurse a level for all but the last
699
    branch. */
700
701
12
    while (*next_code == OP_ALT)
702
0
      {
703
0
      if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
704
0
        return FALSE;
705
0
      code = next_code + 1 + LINK_SIZE;
706
0
      next_code += GET(next_code, 1);
707
0
      }
708
709
12
    entered_a_group = TRUE;
710
12
    continue;
711
712
0
    case OP_BRAZERO:
713
0
    case OP_BRAMINZERO:
714
715
0
    next_code = code + 1;
716
0
    if (*next_code != OP_BRA && *next_code != OP_CBRA &&
717
0
        *next_code != OP_ONCE) return FALSE;
718
719
0
    do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
720
721
    /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
722
723
0
    next_code += 1 + LINK_SIZE;
724
0
    if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
725
0
         rec_limit))
726
0
      return FALSE;
727
728
0
    code += PRIV(OP_lengths)[c];
729
0
    continue;
730
731
    /* The next opcode does not need special handling; fall through and use it
732
    to see if the base can be possessified. */
733
734
24.4k
    default:
735
24.4k
    break;
736
24.9k
    }
737
738
  /* We now have the next appropriate opcode to compare with the base. Check
739
  for a supported opcode, and load its properties. */
740
741
24.4k
  code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
742
24.4k
  if (code == NULL) return FALSE;    /* Unsupported */
743
744
  /* If either opcode is a small character list, set pointers for comparing
745
  characters from that list with another list, or with a property. */
746
747
24.0k
  if (base_list[0] == OP_CHAR)
748
13.7k
    {
749
13.7k
    chr_ptr = base_list + 2;
750
13.7k
    list_ptr = list;
751
13.7k
    }
752
10.3k
  else if (list[0] == OP_CHAR)
753
6.61k
    {
754
6.61k
    chr_ptr = list + 2;
755
6.61k
    list_ptr = base_list;
756
6.61k
    }
757
758
  /* Character bitsets can also be compared to certain opcodes. */
759
760
3.74k
  else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
761
2.96k
#if PCRE2_CODE_UNIT_WIDTH == 8
762
      /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
763
2.96k
      || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
764
3.74k
#endif
765
3.74k
      )
766
980
    {
767
980
#if PCRE2_CODE_UNIT_WIDTH == 8
768
980
    if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
769
#else
770
    if (base_list[0] == OP_CLASS)
771
#endif
772
651
      {
773
651
      set1 = (const uint8_t *)(base_end - base_list[2]);
774
651
      list_ptr = list;
775
651
      }
776
329
    else
777
329
      {
778
329
      set1 = (const uint8_t *)(code - list[2]);
779
329
      list_ptr = base_list;
780
329
      }
781
782
980
    invert_bits = FALSE;
783
980
    switch(list_ptr[0])
784
980
      {
785
166
      case OP_CLASS:
786
225
      case OP_NCLASS:
787
225
      set2 = (const uint8_t *)
788
225
        ((list_ptr == list ? code : base_end) - list_ptr[2]);
789
225
      break;
790
791
0
#ifdef SUPPORT_WIDE_CHARS
792
46
      case OP_XCLASS:
793
46
      xclass_flags = (list_ptr == list ? code : base_end) -
794
46
        list_ptr[2] + LINK_SIZE;
795
46
      if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
796
2
      if ((*xclass_flags & XCL_MAP) == 0)
797
0
        {
798
        /* No bits are set for characters < 256. */
799
0
        if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
800
        /* Might be an empty repeat. */
801
0
        continue;
802
0
        }
803
2
      set2 = (const uint8_t *)(xclass_flags + 1);
804
2
      break;
805
0
#endif
806
807
3
      case OP_NOT_DIGIT:
808
3
      invert_bits = TRUE;
809
      /* Fall through */
810
303
      case OP_DIGIT:
811
303
      set2 = (const uint8_t *)(cb->cbits + cbit_digit);
812
303
      break;
813
814
0
      case OP_NOT_WHITESPACE:
815
0
      invert_bits = TRUE;
816
      /* Fall through */
817
58
      case OP_WHITESPACE:
818
58
      set2 = (const uint8_t *)(cb->cbits + cbit_space);
819
58
      break;
820
821
27
      case OP_NOT_WORDCHAR:
822
27
      invert_bits = TRUE;
823
      /* Fall through */
824
135
      case OP_WORDCHAR:
825
135
      set2 = (const uint8_t *)(cb->cbits + cbit_word);
826
135
      break;
827
828
213
      default:
829
213
      return FALSE;
830
980
      }
831
832
    /* Because the bit sets are unaligned bytes, we need to perform byte
833
    comparison here. */
834
835
723
    set_end = set1 + 32;
836
723
    if (invert_bits)
837
30
      {
838
30
      do
839
35
        {
840
35
        if ((*set1++ & ~(*set2++)) != 0) return FALSE;
841
35
        }
842
30
      while (set1 < set_end);
843
30
      }
844
693
    else
845
693
      {
846
693
      do
847
10.3k
        {
848
10.3k
        if ((*set1++ & *set2++) != 0) return FALSE;
849
10.3k
        }
850
9.83k
      while (set1 < set_end);
851
693
      }
852
853
198
    if (list[1] == 0) return TRUE;
854
    /* Might be an empty repeat. */
855
120
    continue;
856
198
    }
857
858
  /* Some property combinations also acceptable. Unicode property opcodes are
859
  processed specially; the rest can be handled with a lookup table. */
860
861
2.76k
  else
862
2.76k
    {
863
2.76k
    uint32_t leftop, rightop;
864
865
2.76k
    leftop = base_list[0];
866
2.76k
    rightop = list[0];
867
868
2.76k
#ifdef SUPPORT_UNICODE
869
2.76k
    accepted = FALSE; /* Always set in non-unicode case. */
870
2.76k
    if (leftop == OP_PROP || leftop == OP_NOTPROP)
871
104
      {
872
104
      if (rightop == OP_EOD)
873
0
        accepted = TRUE;
874
104
      else if (rightop == OP_PROP || rightop == OP_NOTPROP)
875
26
        {
876
26
        int n;
877
26
        const uint8_t *p;
878
26
        BOOL same = leftop == rightop;
879
26
        BOOL lisprop = leftop == OP_PROP;
880
26
        BOOL risprop = rightop == OP_PROP;
881
26
        BOOL bothprop = lisprop && risprop;
882
883
        /* There's a table that specifies how each combination is to be
884
        processed:
885
          0   Always return FALSE (never auto-possessify)
886
          1   Character groups are distinct (possessify if both are OP_PROP)
887
          2   Check character categories in the same group (general or particular)
888
          3   Return TRUE if the two opcodes are not the same
889
          ... see comments below
890
        */
891
892
26
        n = propposstab[base_list[2]][list[2]];
893
26
        switch(n)
894
26
          {
895
0
          case 0: break;
896
1
          case 1: accepted = bothprop; break;
897
2
          case 2: accepted = (base_list[3] == list[3]) != same; break;
898
14
          case 3: accepted = !same; break;
899
900
0
          case 4:  /* Left general category, right particular category */
901
0
          accepted = risprop && catposstab[base_list[3]][list[3]] == same;
902
0
          break;
903
904
0
          case 5:  /* Right general category, left particular category */
905
0
          accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
906
0
          break;
907
908
          /* This code is logically tricky. Think hard before fiddling with it.
909
          The posspropstab table has four entries per row. Each row relates to
910
          one of PCRE's special properties such as ALNUM or SPACE or WORD.
911
          Only WORD actually needs all four entries, but using repeats for the
912
          others means they can all use the same code below.
913
914
          The first two entries in each row are Unicode general categories, and
915
          apply always, because all the characters they include are part of the
916
          PCRE character set. The third and fourth entries are a general and a
917
          particular category, respectively, that include one or more relevant
918
          characters. One or the other is used, depending on whether the check
919
          is for a general or a particular category. However, in both cases the
920
          category contains more characters than the specials that are defined
921
          for the property being tested against. Therefore, it cannot be used
922
          in a NOTPROP case.
923
924
          Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
925
          Underscore is covered by ucp_P or ucp_Po. */
926
927
0
          case 6:  /* Left alphanum vs right general category */
928
0
          case 7:  /* Left space vs right general category */
929
0
          case 8:  /* Left word vs right general category */
930
0
          p = posspropstab[n-6];
931
0
          accepted = risprop && lisprop ==
932
0
            (list[3] != p[0] &&
933
0
             list[3] != p[1] &&
934
0
            (list[3] != p[2] || !lisprop));
935
0
          break;
936
937
0
          case 9:   /* Right alphanum vs left general category */
938
0
          case 10:  /* Right space vs left general category */
939
0
          case 11:  /* Right word vs left general category */
940
0
          p = posspropstab[n-9];
941
0
          accepted = lisprop && risprop ==
942
0
            (base_list[3] != p[0] &&
943
0
             base_list[3] != p[1] &&
944
0
            (base_list[3] != p[2] || !risprop));
945
0
          break;
946
947
0
          case 12:  /* Left alphanum vs right particular category */
948
8
          case 13:  /* Left space vs right particular category */
949
9
          case 14:  /* Left word vs right particular category */
950
9
          p = posspropstab[n-12];
951
9
          accepted = risprop && lisprop ==
952
7
            (catposstab[p[0]][list[3]] &&
953
7
             catposstab[p[1]][list[3]] &&
954
7
            (list[3] != p[3] || !lisprop));
955
9
          break;
956
957
0
          case 15:  /* Right alphanum vs left particular category */
958
0
          case 16:  /* Right space vs left particular category */
959
0
          case 17:  /* Right word vs left particular category */
960
0
          p = posspropstab[n-15];
961
0
          accepted = lisprop && risprop ==
962
0
            (catposstab[p[0]][base_list[3]] &&
963
0
             catposstab[p[1]][base_list[3]] &&
964
0
            (base_list[3] != p[3] || !risprop));
965
0
          break;
966
26
          }
967
26
        }
968
104
      }
969
970
2.65k
    else
971
2.65k
#endif  /* SUPPORT_UNICODE */
972
973
2.65k
    accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
974
2.50k
           rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
975
2.50k
           autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
976
977
2.76k
    if (!accepted) return FALSE;
978
979
1.12k
    if (list[1] == 0) return TRUE;
980
    /* Might be an empty repeat. */
981
614
    continue;
982
1.12k
    }
983
984
  /* Control reaches here only if one of the items is a small character list.
985
  All characters are checked against the other side. */
986
987
20.3k
  do
988
21.6k
    {
989
21.6k
    chr = *chr_ptr;
990
991
21.6k
    switch(list_ptr[0])
992
21.6k
      {
993
9.93k
      case OP_CHAR:
994
9.93k
      ochr_ptr = list_ptr + 2;
995
9.93k
      do
996
11.5k
        {
997
11.5k
        if (chr == *ochr_ptr) return FALSE;
998
11.2k
        ochr_ptr++;
999
11.2k
        }
1000
11.2k
      while(*ochr_ptr != NOTACHAR);
1001
9.70k
      break;
1002
1003
9.70k
      case OP_NOT:
1004
436
      ochr_ptr = list_ptr + 2;
1005
436
      do
1006
569
        {
1007
569
        if (chr == *ochr_ptr)
1008
113
          break;
1009
456
        ochr_ptr++;
1010
456
        }
1011
456
      while(*ochr_ptr != NOTACHAR);
1012
436
      if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
1013
113
      break;
1014
1015
      /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
1016
      set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
1017
1018
594
      case OP_DIGIT:
1019
594
      if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
1020
586
      break;
1021
1022
586
      case OP_NOT_DIGIT:
1023
54
      if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
1024
2
      break;
1025
1026
320
      case OP_WHITESPACE:
1027
320
      if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
1028
313
      break;
1029
1030
313
      case OP_NOT_WHITESPACE:
1031
18
      if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
1032
0
      break;
1033
1034
1.39k
      case OP_WORDCHAR:
1035
1.39k
      if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
1036
1.18k
      break;
1037
1038
1.18k
      case OP_NOT_WORDCHAR:
1039
320
      if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
1040
191
      break;
1041
1042
191
      case OP_HSPACE:
1043
44
      switch(chr)
1044
44
        {
1045
0
        HSPACE_CASES: return FALSE;
1046
44
        default: break;
1047
44
        }
1048
44
      break;
1049
1050
326
      case OP_NOT_HSPACE:
1051
326
      switch(chr)
1052
326
        {
1053
3
        HSPACE_CASES: break;
1054
323
        default: return FALSE;
1055
326
        }
1056
3
      break;
1057
1058
2.69k
      case OP_ANYNL:
1059
2.87k
      case OP_VSPACE:
1060
2.87k
      switch(chr)
1061
2.87k
        {
1062
93
        VSPACE_CASES: return FALSE;
1063
2.77k
        default: break;
1064
2.87k
        }
1065
2.77k
      break;
1066
1067
2.77k
      case OP_NOT_VSPACE:
1068
409
      switch(chr)
1069
409
        {
1070
5
        VSPACE_CASES: break;
1071
404
        default: return FALSE;
1072
409
        }
1073
5
      break;
1074
1075
32
      case OP_DOLL:
1076
33
      case OP_EODN:
1077
33
      switch (chr)
1078
33
        {
1079
1
        case CHAR_CR:
1080
1
        case CHAR_LF:
1081
3
        case CHAR_VT:
1082
3
        case CHAR_FF:
1083
3
        case CHAR_NEL:
1084
3
#ifndef EBCDIC
1085
3
        case 0x2028:
1086
3
        case 0x2029:
1087
3
#endif  /* Not EBCDIC */
1088
3
        return FALSE;
1089
33
        }
1090
30
      break;
1091
1092
30
      case OP_EOD:    /* Can always possessify before \z */
1093
6
      break;
1094
1095
0
#ifdef SUPPORT_UNICODE
1096
305
      case OP_PROP:
1097
351
      case OP_NOTPROP:
1098
351
      if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1099
351
            list_ptr[0] == OP_NOTPROP))
1100
125
        return FALSE;
1101
226
      break;
1102
226
#endif
1103
1104
530
      case OP_NCLASS:
1105
530
      if (chr > 255) return FALSE;
1106
      /* Fall through */
1107
1108
3.37k
      case OP_CLASS:
1109
3.37k
      if (chr > 255) break;
1110
3.33k
      class_bitset = (const uint8_t *)
1111
3.33k
        ((list_ptr == list ? code : base_end) - list_ptr[2]);
1112
3.33k
      if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE;
1113
1.88k
      break;
1114
1115
1.88k
#ifdef SUPPORT_WIDE_CHARS
1116
1.88k
      case OP_XCLASS:
1117
304
      if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1118
304
          list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf))
1119
200
        return FALSE;
1120
104
      break;
1121
1122
104
      case OP_ECLASS:
1123
0
      if (PRIV(eclass)(chr,
1124
0
          (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE,
1125
0
          (list_ptr == list ? code : base_end) - list_ptr[3],
1126
0
          (const uint8_t*)cb->start_code, utf))
1127
0
        return FALSE;
1128
0
      break;
1129
0
#endif /* SUPPORT_WIDE_CHARS */
1130
1131
906
      default:
1132
906
      return FALSE;
1133
21.6k
      }
1134
1135
17.2k
    chr_ptr++;
1136
17.2k
    }
1137
20.3k
  while(*chr_ptr != NOTACHAR);
1138
1139
  /* At least one character must be matched from this opcode. */
1140
1141
15.8k
  if (list[1] == 0) return TRUE;
1142
15.8k
  }
1143
1144
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
1145
0
return FALSE;              /* Avoid compiler warnings */
1146
20.2k
}
1147
1148
1149
1150
/*************************************************
1151
*    Scan compiled regex for auto-possession     *
1152
*************************************************/
1153
1154
/* Replaces single character iterations with their possessive alternatives
1155
if appropriate. This function modifies the compiled opcode! Hitting a
1156
non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
1157
bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches
1158
overly complicated or large patterns. In these cases, the check just stops,
1159
leaving the remainder of the pattern unpossessified.
1160
1161
Arguments:
1162
  code        points to start of the byte code
1163
  cb          compile data block
1164
1165
Returns:      0 for success
1166
              -1 if a non-existant opcode is encountered
1167
*/
1168
1169
int
1170
PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
1171
1.02k
{
1172
1.02k
PCRE2_UCHAR c;
1173
1.02k
PCRE2_SPTR end;
1174
1.02k
PCRE2_UCHAR *repeat_opcode;
1175
1.02k
uint32_t list[MAX_LIST];
1176
1.02k
int rec_limit = 1000;  /* Was 10,000 but clang+ASAN uses a lot of stack. */
1177
1.02k
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
1178
1.02k
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
1179
1180
1.02k
for (;;)
1181
211k
  {
1182
211k
  c = *code;
1183
1184
211k
  if (c >= OP_TABLE_LENGTH)
1185
0
    {
1186
0
    PCRE2_DEBUG_UNREACHABLE();
1187
0
    return -1;   /* Something gone wrong */
1188
0
    }
1189
1190
211k
  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1191
18.6k
    {
1192
18.6k
    c -= get_repeat_base(c) - OP_STAR;
1193
18.6k
    end = (c <= OP_MINUPTO) ?
1194
18.6k
      get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
1195
18.6k
    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1196
1197
18.6k
    if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
1198
18.0k
        &rec_limit))
1199
12.2k
      {
1200
12.2k
      switch(c)
1201
12.2k
        {
1202
1.09k
        case OP_STAR:
1203
1.09k
        *code += OP_POSSTAR - OP_STAR;
1204
1.09k
        break;
1205
1206
415
        case OP_MINSTAR:
1207
415
        *code += OP_POSSTAR - OP_MINSTAR;
1208
415
        break;
1209
1210
2.74k
        case OP_PLUS:
1211
2.74k
        *code += OP_POSPLUS - OP_PLUS;
1212
2.74k
        break;
1213
1214
1.03k
        case OP_MINPLUS:
1215
1.03k
        *code += OP_POSPLUS - OP_MINPLUS;
1216
1.03k
        break;
1217
1218
6.53k
        case OP_QUERY:
1219
6.53k
        *code += OP_POSQUERY - OP_QUERY;
1220
6.53k
        break;
1221
1222
447
        case OP_MINQUERY:
1223
447
        *code += OP_POSQUERY - OP_MINQUERY;
1224
447
        break;
1225
1226
0
        case OP_UPTO:
1227
0
        *code += OP_POSUPTO - OP_UPTO;
1228
0
        break;
1229
1230
0
        case OP_MINUPTO:
1231
0
        *code += OP_POSUPTO - OP_MINUPTO;
1232
0
        break;
1233
12.2k
        }
1234
12.2k
      }
1235
18.6k
    c = *code;
1236
18.6k
    }
1237
193k
  else if (c == OP_CLASS || c == OP_NCLASS
1238
190k
#ifdef SUPPORT_WIDE_CHARS
1239
190k
           || c == OP_XCLASS || c == OP_ECLASS
1240
193k
#endif
1241
193k
           )
1242
3.27k
    {
1243
3.27k
#ifdef SUPPORT_WIDE_CHARS
1244
3.27k
    if (c == OP_XCLASS || c == OP_ECLASS)
1245
514
      repeat_opcode = code + GET(code, 1);
1246
2.76k
    else
1247
2.76k
#endif
1248
2.76k
      repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1249
1250
3.27k
    c = *repeat_opcode;
1251
3.27k
    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1252
2.18k
      {
1253
      /* The return from get_chr_property_list() will never be NULL when
1254
      *code (aka c) is one of the four class opcodes. However, gcc with
1255
      -fanalyzer notes that a NULL return is possible, and grumbles. Hence we
1256
      put in a check. */
1257
1258
2.18k
      end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
1259
2.18k
      list[1] = (c & 1) == 0;
1260
1261
2.18k
      if (end != NULL &&
1262
2.18k
          compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
1263
740
        {
1264
740
        switch (c)
1265
740
          {
1266
299
          case OP_CRSTAR:
1267
325
          case OP_CRMINSTAR:
1268
325
          *repeat_opcode = OP_CRPOSSTAR;
1269
325
          break;
1270
1271
229
          case OP_CRPLUS:
1272
239
          case OP_CRMINPLUS:
1273
239
          *repeat_opcode = OP_CRPOSPLUS;
1274
239
          break;
1275
1276
150
          case OP_CRQUERY:
1277
176
          case OP_CRMINQUERY:
1278
176
          *repeat_opcode = OP_CRPOSQUERY;
1279
176
          break;
1280
1281
0
          case OP_CRRANGE:
1282
0
          case OP_CRMINRANGE:
1283
0
          *repeat_opcode = OP_CRPOSRANGE;
1284
0
          break;
1285
740
          }
1286
740
        }
1287
2.18k
      }
1288
3.27k
    c = *code;
1289
3.27k
    }
1290
1291
211k
  switch(c)
1292
211k
    {
1293
1.02k
    case OP_END:
1294
1.02k
    return 0;
1295
1296
94
    case OP_TYPESTAR:
1297
101
    case OP_TYPEMINSTAR:
1298
859
    case OP_TYPEPLUS:
1299
1.20k
    case OP_TYPEMINPLUS:
1300
2.70k
    case OP_TYPEQUERY:
1301
2.89k
    case OP_TYPEMINQUERY:
1302
2.91k
    case OP_TYPEPOSSTAR:
1303
3.76k
    case OP_TYPEPOSPLUS:
1304
6.65k
    case OP_TYPEPOSQUERY:
1305
6.65k
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1306
6.65k
    break;
1307
1308
0
    case OP_TYPEUPTO:
1309
0
    case OP_TYPEMINUPTO:
1310
0
    case OP_TYPEEXACT:
1311
0
    case OP_TYPEPOSUPTO:
1312
0
    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1313
0
      code += 2;
1314
0
    break;
1315
1316
0
    case OP_CALLOUT_STR:
1317
0
    code += GET(code, 1 + 2*LINK_SIZE);
1318
0
    break;
1319
1320
0
#ifdef SUPPORT_WIDE_CHARS
1321
514
    case OP_XCLASS:
1322
514
    case OP_ECLASS:
1323
514
    code += GET(code, 1);
1324
514
    break;
1325
0
#endif
1326
1327
0
    case OP_MARK:
1328
0
    case OP_COMMIT_ARG:
1329
0
    case OP_PRUNE_ARG:
1330
0
    case OP_SKIP_ARG:
1331
0
    case OP_THEN_ARG:
1332
0
    code += code[1];
1333
0
    break;
1334
211k
    }
1335
1336
  /* Add in the fixed length from the table */
1337
1338
210k
  code += PRIV(OP_lengths)[c];
1339
1340
  /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1341
  followed by a multi-byte character. The length in the table is a minimum, so
1342
  we have to arrange to skip the extra code units. */
1343
1344
210k
#ifdef MAYBE_UTF_MULTI
1345
210k
  if (utf) switch(c)
1346
20.4k
    {
1347
3.66k
    case OP_CHAR:
1348
13.6k
    case OP_CHARI:
1349
13.6k
    case OP_NOT:
1350
13.6k
    case OP_NOTI:
1351
13.6k
    case OP_STAR:
1352
13.6k
    case OP_MINSTAR:
1353
13.6k
    case OP_PLUS:
1354
13.6k
    case OP_MINPLUS:
1355
13.7k
    case OP_QUERY:
1356
13.7k
    case OP_MINQUERY:
1357
13.7k
    case OP_UPTO:
1358
13.7k
    case OP_MINUPTO:
1359
13.7k
    case OP_EXACT:
1360
13.7k
    case OP_POSSTAR:
1361
13.8k
    case OP_POSPLUS:
1362
13.8k
    case OP_POSQUERY:
1363
13.8k
    case OP_POSUPTO:
1364
13.8k
    case OP_STARI:
1365
13.9k
    case OP_MINSTARI:
1366
13.9k
    case OP_PLUSI:
1367
13.9k
    case OP_MINPLUSI:
1368
13.9k
    case OP_QUERYI:
1369
14.0k
    case OP_MINQUERYI:
1370
14.0k
    case OP_UPTOI:
1371
14.0k
    case OP_MINUPTOI:
1372
14.0k
    case OP_EXACTI:
1373
14.0k
    case OP_POSSTARI:
1374
14.4k
    case OP_POSPLUSI:
1375
14.7k
    case OP_POSQUERYI:
1376
14.7k
    case OP_POSUPTOI:
1377
14.7k
    case OP_NOTSTAR:
1378
14.7k
    case OP_NOTMINSTAR:
1379
14.7k
    case OP_NOTPLUS:
1380
14.7k
    case OP_NOTMINPLUS:
1381
14.7k
    case OP_NOTQUERY:
1382
14.7k
    case OP_NOTMINQUERY:
1383
14.7k
    case OP_NOTUPTO:
1384
14.7k
    case OP_NOTMINUPTO:
1385
14.7k
    case OP_NOTEXACT:
1386
14.7k
    case OP_NOTPOSSTAR:
1387
14.7k
    case OP_NOTPOSPLUS:
1388
14.7k
    case OP_NOTPOSQUERY:
1389
14.7k
    case OP_NOTPOSUPTO:
1390
14.7k
    case OP_NOTSTARI:
1391
14.7k
    case OP_NOTMINSTARI:
1392
14.7k
    case OP_NOTPLUSI:
1393
14.7k
    case OP_NOTMINPLUSI:
1394
14.7k
    case OP_NOTQUERYI:
1395
14.7k
    case OP_NOTMINQUERYI:
1396
14.7k
    case OP_NOTUPTOI:
1397
14.7k
    case OP_NOTMINUPTOI:
1398
14.7k
    case OP_NOTEXACTI:
1399
14.7k
    case OP_NOTPOSSTARI:
1400
14.7k
    case OP_NOTPOSPLUSI:
1401
14.7k
    case OP_NOTPOSQUERYI:
1402
14.7k
    case OP_NOTPOSUPTOI:
1403
14.7k
    if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1404
14.7k
    break;
1405
20.4k
    }
1406
#else
1407
  (void)(utf);  /* Keep compiler happy by referencing function argument */
1408
#endif  /* SUPPORT_WIDE_CHARS */
1409
210k
  }
1410
1.02k
}
1411
1412
/* End of pcre2_auto_possess.c */