Coverage Report

Created: 2026-04-12 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/pcre2/src/pcre2_auto_possess.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
/* This module contains functions that scan a compiled pattern and change
43
repeats into possessive repeats where possible. */
44
45
46
#include "pcre2_internal.h"
47
48
49
50
/* This macro represents the max size of list[] and that is used to keep
51
track of UCD info in several places; it should be kept in sync with the
52
value used by GenerateUcd.py */
53
218k
#define MAX_LIST 8
54
55
/*************************************************
56
*        Tables for auto-possessification        *
57
*************************************************/
58
59
/* This table is used to check whether auto-possessification is possible
60
between adjacent character-type opcodes. The left-hand (repeated) opcode is
61
used to select the row, and the right-hand opcode is used to select the column.
62
A value of 1 means that auto-possessification is OK. For example, the second
63
value in the first row means that \D+\d can be turned into \D++\d.
64
65
The Unicode property types (\P and \p) have to be present to fill out the table
66
because of what their opcode values are, but the table values should always be
67
zero because property types are handled separately in the code. The last four
68
columns apply to items that cannot be repeated, so there is no need to have
69
rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is
70
*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
71
72
/* Number of rows: one per opcode from FIRST_AUTOTAB_OP up to and including
LAST_AUTOTAB_LEFT_OP. */
#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
73
/* Number of columns: one per opcode from FIRST_AUTOTAB_OP up to and including
LAST_AUTOTAB_RIGHT_OP. */
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
74
75
/* The initializer below supplies 17 rows (\D .. \X) of 21 values each
(\D .. $M). Presumably indexed by (opcode - FIRST_AUTOTAB_OP) on both axes;
confirm at the point of use. */
static const uint8_t autoposstab[APTROWS][APTCOLS] = {
76
/* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
77
  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
78
  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
79
  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
80
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
81
  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
82
  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
83
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
84
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
85
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
86
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
87
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
88
  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
89
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
90
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
91
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
92
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
93
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
94
};
95
96
#ifdef SUPPORT_UNICODE
97
/* This table is used to check whether auto-possessification is possible
98
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
99
left-hand (repeated) opcode is used to select the row, and the right-hand
100
opcode is used to select the column. The values are as follows:
101
102
  0   Always return FALSE (never auto-possessify)
103
  1   Character groups are distinct (possessify if both are OP_PROP)
104
  2   Check character categories in the same group (general or particular)
105
  3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
106
107
  4   Check left general category vs right particular category
108
  5   Check right general category vs left particular category
109
110
  6   Left alphanum vs right general category
111
  7   Left space vs right general category
112
  8   Left word vs right general category
113
114
  9   Right alphanum vs left general category
115
 10   Right space vs left general category
116
 11   Right word vs left general category
117
118
 12   Left alphanum vs right particular category
119
 13   Left space vs right particular category
120
 14   Left word vs right particular category
121
122
 15   Right alphanum vs left particular category
123
 16   Right space vs left particular category
124
 17   Right word vs left particular category
125
*/
126
127
/* Looked up as propposstab[left property type][right property type]; each
entry is one of the action codes 0..17. The initializer supplies 13 rows of
13 values, in the order given by the column header. */
static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
128
/* LAMP GC  PC  SC  SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
129
  { 3,  0,  0,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_LAMP */
130
  { 0,  2,  4,  0,   0,    9,   10,     10,  11,    0,   0,    0,    0 },  /* PT_GC */
131
  { 0,  5,  2,  0,   0,   15,   16,     16,  17,    0,   0,    0,    0 },  /* PT_PC */
132
  { 0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SC */
133
  { 0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SCX */
134
  { 3,  6, 12,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_ALNUM */
135
  { 1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_SPACE */
136
  { 1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_PXSPACE */
137
  { 0,  8, 14,  0,   0,    0,    1,      1,   3,    0,   0,    0,    0 },  /* PT_WORD */
138
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_CLIST */
139
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   3,    0,    0 },  /* PT_UCNC */
140
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_BIDICL */
141
  { 0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 }   /* PT_BOOL */
142
  /* PT_ANY does not need a record. */
143
};
144
145
/* This table is used to check whether auto-possessification is possible
146
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
147
specifies a general category and the other specifies a particular category. The
148
row is selected by the general category and the column by the particular
149
category. The value is 1 if the particular category is not part of the general
150
category. */
151
152
/* 7 rows, one per general category (C, L, M, N, P, S, Z), by 30 columns, one
per particular category (Cc .. Zs). Indexed as
catposstab[general category][particular category]. */
static const uint8_t catposstab[7][30] = {
153
/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
154
  { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
155
  { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
156
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
157
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
158
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
159
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
160
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
161
};
162
163
/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
164
a general or particular category. The properties in each row are those
165
that apply to the character set in question. Duplication means that a little
166
unnecessary work is done when checking, but this keeps things much simpler
167
because they can all use the same code. For more details see the comment where
168
this table is used.
169
170
Note: SPACE and PXSPACE used to be different because Perl excluded VT from
171
"space", but from Perl 5.18 it's included, so both categories are treated the
172
same here. */
173
174
/* Three rows of four ucp_ values each; the row comments name the special
property (ALNUM, SPACE/PXSPACE, WORD) that each row describes. */
static const uint8_t posspropstab[3][4] = {
175
  { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
176
  { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
177
  { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
178
};
179
#endif  /* SUPPORT_UNICODE */
180
181
182
183
#ifdef SUPPORT_UNICODE
184
/*************************************************
185
*        Check a character and a property        *
186
*************************************************/
187
188
/* This function is called by compare_opcodes() when a property item is
189
adjacent to a fixed character.
190
191
Arguments:
192
  c            the character
193
  ptype        the property type
194
  pdata        the data for the type
195
  negated      TRUE if it's a negated property (\P or \p{^)
196
197
Returns:       TRUE if auto-possessifying is OK
198
*/
199
200
static BOOL
201
check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
202
  BOOL negated)
203
100k
{
204
100k
BOOL ok, rc;
205
100k
const uint32_t *p;
206
100k
const ucd_record *prop = GET_UCD(c);
207
208
100k
switch(ptype)
209
100k
  {
210
1.95k
  case PT_LAMP:
211
1.95k
  return (prop->chartype == ucp_Lu ||
212
1.26k
          prop->chartype == ucp_Ll ||
213
923
          prop->chartype == ucp_Lt) == negated;
214
215
5.56k
  case PT_GC:
216
5.56k
  return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
217
218
14.8k
  case PT_PC:
219
14.8k
  return (pdata == prop->chartype) == negated;
220
221
9.19k
  case PT_SC:
222
9.19k
  return (pdata == prop->script) == negated;
223
224
2.75k
  case PT_SCX:
225
2.75k
  ok = (pdata == prop->script
226
2.43k
        || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
227
2.75k
  return ok == negated;
228
229
  /* These are specials */
230
231
1.70k
  case PT_ALNUM:
232
1.70k
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
233
1.21k
          PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
234
235
  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
236
  means that Perl space and POSIX space are now identical. PCRE was changed
237
  at release 8.34. */
238
239
27.6k
  case PT_SPACE:    /* Perl space */
240
28.2k
  case PT_PXSPACE:  /* POSIX space */
241
28.2k
  switch(c)
242
28.2k
    {
243
70.9k
    HSPACE_CASES:
244
70.9k
    VSPACE_CASES:
245
11.3k
    rc = negated;
246
11.3k
    break;
247
248
16.9k
    default:
249
16.9k
    rc = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
250
28.2k
    }
251
28.2k
  return rc;
252
253
11.4k
  case PT_WORD:
254
11.4k
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
255
8.08k
          PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
256
7.51k
          c == CHAR_UNDERSCORE) == negated;
257
258
0
  case PT_CLIST:
259
0
  p = PRIV(ucd_caseless_sets) + prop->caseset;
260
0
  for (;;)
261
0
    {
262
0
    if (c < *p) return !negated;
263
0
    if (c == *p++) return negated;
264
0
    }
265
  /* LCOV_EXCL_START */
266
0
  PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
267
0
  break;
268
  /* LCOV_EXCL_STOP */
269
270
  /* Haven't yet thought these through. */
271
272
5.50k
  case PT_BIDICL:
273
5.50k
  return FALSE;
274
275
2.52k
  case PT_BOOL:
276
2.52k
  return FALSE;
277
100k
  }
278
279
16.5k
return FALSE;
280
100k
}
281
#endif  /* SUPPORT_UNICODE */
282
283
284
285
/*************************************************
286
*        Base opcode of repeated opcodes         *
287
*************************************************/
288
289
/* Returns the base opcode for repeated single character type opcodes. If the
290
opcode is not a repeated character type, it returns with the original value.
291
292
Arguments:  c opcode
293
Returns:    base opcode for the type
294
*/
295
296
static PCRE2_UCHAR
297
get_repeat_base(PCRE2_UCHAR c)
298
7.47M
{
299
7.47M
return (c > OP_TYPEPOSUPTO)? c :
300
7.47M
       (c >= OP_TYPESTAR)?   OP_TYPESTAR :
301
7.47M
       (c >= OP_NOTSTARI)?   OP_NOTSTARI :
302
5.26M
       (c >= OP_NOTSTAR)?    OP_NOTSTAR :
303
5.20M
       (c >= OP_STARI)?      OP_STARI :
304
5.02M
                             OP_STAR;
305
7.47M
}
306
307
308
/*************************************************
309
*        Fill the character property list        *
310
*************************************************/
311
312
/* Checks whether the code points to an opcode that can take part in auto-
313
possessification, and if so, fills a list with its properties.
314
315
Arguments:
316
  code        points to start of expression
317
  utf         TRUE if in UTF mode
318
  ucp         TRUE if in UCP mode
319
  fcc         points to the case-flipping table
320
  list        points to output list
321
              list[0] will be filled with the opcode
322
              list[1] will be non-zero if this opcode
323
                can match an empty character string
324
              list[2..7] depends on the opcode
325
326
Returns:      points to the start of the next opcode if *code is accepted
327
              NULL if *code is not accepted
328
*/
329
330
static PCRE2_SPTR
331
get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
332
  uint32_t *list)
333
6.53M
{
334
6.53M
PCRE2_UCHAR c = *code;
335
6.53M
PCRE2_UCHAR base;
336
6.53M
PCRE2_SPTR end;
337
6.53M
PCRE2_SPTR class_end;
338
6.53M
uint32_t chr;
339
340
6.53M
#ifdef SUPPORT_UNICODE
341
6.53M
uint32_t *clist_dest;
342
6.53M
const uint32_t *clist_src;
343
#else
344
(void)utf;    /* Suppress "unused parameter" compiler warnings */
345
(void)ucp;
346
#endif
347
348
6.53M
list[0] = c;
349
6.53M
list[1] = FALSE;
350
6.53M
code++;
351
352
6.53M
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
353
3.76M
  {
354
3.76M
  base = get_repeat_base(c);
355
3.76M
  c -= (base - OP_STAR);
356
357
3.76M
  if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
358
260k
    code += IMM2_SIZE;
359
360
3.76M
  list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
361
2.67M
             c != OP_POSPLUS);
362
363
3.76M
  switch(base)
364
3.76M
    {
365
2.02M
    case OP_STAR:
366
2.02M
    list[0] = OP_CHAR;
367
2.02M
    break;
368
369
508k
    case OP_STARI:
370
508k
    list[0] = OP_CHARI;
371
508k
    break;
372
373
87.8k
    case OP_NOTSTAR:
374
87.8k
    list[0] = OP_NOT;
375
87.8k
    break;
376
377
24.7k
    case OP_NOTSTARI:
378
24.7k
    list[0] = OP_NOTI;
379
24.7k
    break;
380
381
1.11M
    case OP_TYPESTAR:
382
1.11M
    list[0] = *code;
383
1.11M
    code++;
384
1.11M
    break;
385
3.76M
    }
386
3.76M
  c = list[0];
387
3.76M
  }
388
389
6.53M
switch(c)
390
6.53M
  {
391
43.1k
  case OP_NOT_DIGIT:
392
66.6k
  case OP_DIGIT:
393
141k
  case OP_NOT_WHITESPACE:
394
193k
  case OP_WHITESPACE:
395
247k
  case OP_NOT_WORDCHAR:
396
284k
  case OP_WORDCHAR:
397
662k
  case OP_ANY:
398
719k
  case OP_ALLANY:
399
837k
  case OP_ANYNL:
400
906k
  case OP_NOT_HSPACE:
401
957k
  case OP_HSPACE:
402
968k
  case OP_NOT_VSPACE:
403
976k
  case OP_VSPACE:
404
1.11M
  case OP_EXTUNI:
405
1.12M
  case OP_EODN:
406
1.13M
  case OP_EOD:
407
1.16M
  case OP_DOLL:
408
1.16M
  case OP_DOLLM:
409
1.16M
  return code;
410
411
3.60M
  case OP_CHAR:
412
3.71M
  case OP_NOT:
413
3.71M
  GETCHARINCTEST(chr, code);
414
3.71M
  list[2] = chr;
415
3.71M
  list[3] = NOTACHAR;
416
3.71M
  return code;
417
418
917k
  case OP_CHARI:
419
944k
  case OP_NOTI:
420
944k
  list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
421
944k
  GETCHARINCTEST(chr, code);
422
944k
  list[2] = chr;
423
424
944k
#ifdef SUPPORT_UNICODE
425
944k
  if (chr < 128 || (chr < 256 && !utf && !ucp))
426
815k
    list[3] = fcc[chr];
427
129k
  else
428
129k
    list[3] = UCD_OTHERCASE(chr);
429
#elif defined SUPPORT_WIDE_CHARS
430
  list[3] = (chr < 256) ? fcc[chr] : chr;
431
#else
432
  list[3] = fcc[chr];
433
#endif
434
435
  /* The othercase might be the same value. */
436
437
944k
  if (chr == list[3])
438
681k
    list[3] = NOTACHAR;
439
262k
  else
440
262k
    list[4] = NOTACHAR;
441
944k
  return code;
442
443
0
#ifdef SUPPORT_UNICODE
444
99.8k
  case OP_PROP:
445
220k
  case OP_NOTPROP:
446
220k
  if (code[0] != PT_CLIST)
447
166k
    {
448
166k
    list[2] = code[0];
449
166k
    list[3] = code[1];
450
166k
    return code + 2;
451
166k
    }
452
453
  /* Convert only if we have enough space. */
454
455
54.4k
  clist_src = PRIV(ucd_caseless_sets) + code[1];
456
54.4k
  clist_dest = list + 2;
457
54.4k
  code += 2;
458
459
218k
  do {
460
218k
     if (clist_dest >= list + MAX_LIST)
461
0
       {
462
       /* Early return if there is not enough space. GenerateUcd.py
463
       generated a list with more than 5 characters and something
464
       must be done about that going forward. */
465
0
       PCRE2_DEBUG_UNREACHABLE();   /* Remove if it ever triggers */
466
0
       list[2] = code[0];
467
0
       list[3] = code[1];
468
0
       return code;
469
0
       }
470
218k
     *clist_dest++ = *clist_src;
471
218k
     }
472
218k
  while(*clist_src++ != NOTACHAR);
473
474
  /* All characters are stored. The terminating NOTACHAR is copied from the
475
  clist itself. */
476
477
54.4k
  list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
478
54.4k
  return code;
479
0
#endif
480
481
71.1k
  case OP_NCLASS:
482
213k
  case OP_CLASS:
483
213k
#ifdef SUPPORT_WIDE_CHARS
484
245k
  case OP_XCLASS:
485
258k
  case OP_ECLASS:
486
258k
  if (c == OP_XCLASS || c == OP_ECLASS)
487
44.8k
    end = code + GET(code, 0) - 1;
488
213k
  else
489
213k
#endif
490
213k
    end = code + 32 / sizeof(PCRE2_UCHAR);
491
258k
  class_end = end;
492
493
258k
  switch(*end)
494
258k
    {
495
36.7k
    case OP_CRSTAR:
496
40.5k
    case OP_CRMINSTAR:
497
69.5k
    case OP_CRQUERY:
498
75.9k
    case OP_CRMINQUERY:
499
76.5k
    case OP_CRPOSSTAR:
500
79.0k
    case OP_CRPOSQUERY:
501
79.0k
    list[1] = TRUE;
502
79.0k
    end++;
503
79.0k
    break;
504
505
55.3k
    case OP_CRPLUS:
506
63.3k
    case OP_CRMINPLUS:
507
64.5k
    case OP_CRPOSPLUS:
508
64.5k
    end++;
509
64.5k
    break;
510
511
61.2k
    case OP_CRRANGE:
512
82.6k
    case OP_CRMINRANGE:
513
83.3k
    case OP_CRPOSRANGE:
514
83.3k
    list[1] = (GET2(end, 1) == 0);
515
83.3k
    end += 1 + 2 * IMM2_SIZE;
516
83.3k
    break;
517
258k
    }
518
258k
  list[2] = (uint32_t)(end - code);
519
258k
  list[3] = (uint32_t)(end - class_end);
520
258k
  return end;
521
6.53M
  }
522
523
237k
return NULL;    /* Opcode not accepted */
524
6.53M
}
525
526
527
528
/*************************************************
529
*    Scan further character sets for match       *
530
*************************************************/
531
532
/* Checks whether the base and the current opcode have a common character, in
533
which case the base cannot be possessified.
534
535
Arguments:
536
  code        points to the byte code
537
  utf         TRUE in UTF mode
538
  ucp         TRUE in UCP mode
539
  cb          compile data block
540
  base_list   the data list of the base opcode
541
  base_end    the end of the base opcode
542
  rec_limit   points to recursion depth counter
543
544
Returns:      TRUE if the auto-possessification is possible
545
*/
546
547
static BOOL
548
compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
549
  const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
550
3.91M
{
551
3.91M
PCRE2_UCHAR c;
552
3.91M
uint32_t list[MAX_LIST];
553
3.91M
const uint32_t *chr_ptr;
554
3.91M
const uint32_t *ochr_ptr;
555
3.91M
const uint32_t *list_ptr;
556
3.91M
PCRE2_SPTR next_code;
557
3.91M
#ifdef SUPPORT_WIDE_CHARS
558
3.91M
PCRE2_SPTR xclass_flags;
559
3.91M
#endif
560
3.91M
const uint8_t *class_bitset;
561
3.91M
const uint8_t *set1, *set2, *set_end;
562
3.91M
uint32_t chr;
563
3.91M
BOOL accepted, invert_bits;
564
3.91M
BOOL entered_a_group = FALSE;
565
566
3.91M
if (--(*rec_limit) <= 0) return FALSE;  /* Recursion has gone too deep */
567
568
/* Note: the base_list[1] contains whether the current opcode has a greedy
569
(represented by a non-zero value) quantifier. This is different from
570
other character type lists, which store here that the character iterator
571
matches to an empty string (also represented by a non-zero value). */
572
573
3.11M
for(;;)
574
5.75M
  {
575
5.75M
  PCRE2_SPTR bracode;
576
577
  /* All operations move the code pointer forward.
578
  Therefore infinite recursions are not possible. */
579
580
5.75M
  c = *code;
581
582
  /* Skip over callouts */
583
584
5.75M
  if (c == OP_CALLOUT)
585
244k
    {
586
244k
    code += PRIV(OP_lengths)[c];
587
244k
    continue;
588
244k
    }
589
590
5.51M
  if (c == OP_CALLOUT_STR)
591
5.21k
    {
592
5.21k
    code += GET(code, 1 + 2*LINK_SIZE);
593
5.21k
    continue;
594
5.21k
    }
595
596
  /* At the end of a branch, skip to the end of the group and process it. */
597
598
5.50M
  if (c == OP_ALT)
599
261k
    {
600
2.89M
    do code += GET(code, 1); while (*code == OP_ALT);
601
261k
    c = *code;
602
261k
    }
603
604
  /* Inspect the next opcode. */
605
606
5.50M
  switch(c)
607
5.50M
    {
608
    /* We can always possessify a greedy iterator at the end of the pattern,
609
    which is reached after skipping over the final OP_KET. A non-greedy
610
    iterator must never be possessified. */
611
612
39.9k
    case OP_END:
613
39.9k
    return base_list[1] != 0;
614
615
    /* When an iterator is at the end of certain kinds of group we can inspect
616
    what follows the group by skipping over the closing ket. Note that this
617
    does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
618
    iteration is variable (could be another iteration or could be the next
619
    item). As these two opcodes are not listed in the next switch, they will
620
    end up as the next code to inspect, and return FALSE by virtue of being
621
    unsupported. */
622
623
2.35M
    case OP_KET:
624
2.37M
    case OP_KETRPOS:
625
    /* The non-greedy case cannot be converted to a possessive form. */
626
627
2.37M
    if (base_list[1] == 0) return FALSE;
628
629
    /* If the bracket is capturing it might be referenced by an OP_RECURSE
630
    so its last iterator can never be possessified if the pattern contains
631
    recursions. (This could be improved by keeping a list of group numbers that
632
    are called by recursion.) */
633
634
2.26M
    bracode = code - GET(code, 1);
635
2.26M
    switch(*bracode)
636
2.26M
      {
637
262k
      case OP_CBRA:
638
262k
      case OP_SCBRA:
639
263k
      case OP_CBRAPOS:
640
276k
      case OP_SCBRAPOS:
641
276k
      if (cb->had_recurse) return FALSE;
642
236k
      break;
643
644
      /* A script run might have to backtrack if the iterated item can match
645
      characters from more than one script. So give up unless repeating an
646
      explicit character. */
647
648
236k
      case OP_SCRIPT_RUN:
649
7.35k
      if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
650
1.31k
        return FALSE;
651
6.03k
      break;
652
653
      /* Atomic sub-patterns and forward assertions can always auto-possessify
654
      their last iterator. However, if the group was entered as a result of
655
      checking a previous iterator, this is not possible. */
656
657
8.88k
      case OP_ASSERT:
658
31.9k
      case OP_ASSERT_NOT:
659
35.2k
      case OP_ONCE:
660
35.2k
      return !entered_a_group;
661
662
      /* Fixed-length lookbehinds can be treated the same way, but variable
663
      length lookbehinds must not auto-possessify their last iterator. Note
664
      that in order to identify a variable length lookbehind we must check
665
      through all branches, because some may be of fixed length. */
666
667
10.7k
      case OP_ASSERTBACK:
668
43.9k
      case OP_ASSERTBACK_NOT:
669
43.9k
      do
670
46.5k
        {
671
46.5k
        if (bracode[1+LINK_SIZE] == OP_VREVERSE) return FALSE;  /* Variable */
672
3.65k
        bracode += GET(bracode, 1);
673
3.65k
        }
674
43.9k
      while (*bracode == OP_ALT);
675
1.01k
      return !entered_a_group;  /* Not variable length */
676
677
      /* Non-atomic assertions - don't possessify last iterator. This needs
678
      more thought. */
679
680
6.39k
      case OP_ASSERT_NA:
681
15.5k
      case OP_ASSERTBACK_NA:
682
15.5k
      return FALSE;
683
2.26M
      }
684
685
    /* Skip over the bracket and inspect what comes next. */
686
687
2.12M
    code += PRIV(OP_lengths)[c];
688
2.12M
    continue;
689
690
    /* Handle cases where the next item is a group. */
691
692
7.37k
    case OP_ONCE:
693
63.7k
    case OP_BRA:
694
222k
    case OP_CBRA:
695
222k
    next_code = code + GET(code, 1);
696
222k
    code += PRIV(OP_lengths)[c];
697
698
    /* Check each branch. We have to recurse a level for all but the last
699
    branch. */
700
701
285k
    while (*next_code == OP_ALT)
702
124k
      {
703
124k
      if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
704
60.9k
        return FALSE;
705
63.0k
      code = next_code + 1 + LINK_SIZE;
706
63.0k
      next_code += GET(next_code, 1);
707
63.0k
      }
708
709
161k
    entered_a_group = TRUE;
710
161k
    continue;
711
712
62.1k
    case OP_BRAZERO:
713
69.1k
    case OP_BRAMINZERO:
714
715
69.1k
    next_code = code + 1;
716
69.1k
    if (*next_code != OP_BRA && *next_code != OP_CBRA &&
717
19.2k
        *next_code != OP_ONCE) return FALSE;
718
719
56.8k
    do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
720
721
    /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
722
723
53.4k
    next_code += 1 + LINK_SIZE;
724
53.4k
    if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
725
53.4k
         rec_limit))
726
16.9k
      return FALSE;
727
728
36.4k
    code += PRIV(OP_lengths)[c];
729
36.4k
    continue;
730
731
    /* The next opcode does not need special handling; fall through and use it
732
    to see if the base can be possessified. */
733
734
2.80M
    default:
735
2.80M
    break;
736
5.50M
    }
737
738
  /* We now have the next appropriate opcode to compare with the base. Check
739
  for a supported opcode, and load its properties. */
740
741
2.80M
  code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
742
2.80M
  if (code == NULL) return FALSE;    /* Unsupported */
743
744
  /* If either opcode is a small character list, set pointers for comparing
745
  characters from that list with another list, or with a property. */
746
747
2.56M
  if (base_list[0] == OP_CHAR)
748
1.61M
    {
749
1.61M
    chr_ptr = base_list + 2;
750
1.61M
    list_ptr = list;
751
1.61M
    }
752
954k
  else if (list[0] == OP_CHAR)
753
752k
    {
754
752k
    chr_ptr = list + 2;
755
752k
    list_ptr = base_list;
756
752k
    }
757
758
  /* Character bitsets can also be compared to certain opcodes. */
759
760
201k
  else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
761
175k
#if PCRE2_CODE_UNIT_WIDTH == 8
762
      /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
763
175k
      || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
764
201k
#endif
765
201k
      )
766
38.2k
    {
767
38.2k
#if PCRE2_CODE_UNIT_WIDTH == 8
768
38.2k
    if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
769
#else
770
    if (base_list[0] == OP_CLASS)
771
#endif
772
23.6k
      {
773
23.6k
      set1 = (const uint8_t *)(base_end - base_list[2]);
774
23.6k
      list_ptr = list;
775
23.6k
      }
776
14.6k
    else
777
14.6k
      {
778
14.6k
      set1 = (const uint8_t *)(code - list[2]);
779
14.6k
      list_ptr = base_list;
780
14.6k
      }
781
782
38.2k
    invert_bits = FALSE;
783
38.2k
    switch(list_ptr[0])
784
38.2k
      {
785
9.03k
      case OP_CLASS:
786
11.3k
      case OP_NCLASS:
787
11.3k
      set2 = (const uint8_t *)
788
11.3k
        ((list_ptr == list ? code : base_end) - list_ptr[2]);
789
11.3k
      break;
790
791
0
#ifdef SUPPORT_WIDE_CHARS
792
2.19k
      case OP_XCLASS:
793
2.19k
      xclass_flags = (list_ptr == list ? code : base_end) -
794
2.19k
        list_ptr[2] + LINK_SIZE;
795
2.19k
      if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
796
1.71k
      if ((*xclass_flags & XCL_MAP) == 0)
797
980
        {
798
        /* No bits are set for characters < 256. */
799
980
        if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
800
        /* Might be an empty repeat. */
801
498
        continue;
802
980
        }
803
737
      set2 = (const uint8_t *)(xclass_flags + 1);
804
737
      break;
805
0
#endif
806
807
3.75k
      case OP_NOT_DIGIT:
808
3.75k
      invert_bits = TRUE;
809
3.75k
      PCRE2_FALLTHROUGH /* Fall through */
810
4.18k
      case OP_DIGIT:
811
4.18k
      set2 = (const uint8_t *)(cb->cbits + cbit_digit);
812
4.18k
      break;
813
814
759
      case OP_NOT_WHITESPACE:
815
759
      invert_bits = TRUE;
816
759
      PCRE2_FALLTHROUGH /* Fall through */
817
4.11k
      case OP_WHITESPACE:
818
4.11k
      set2 = (const uint8_t *)(cb->cbits + cbit_space);
819
4.11k
      break;
820
821
1.65k
      case OP_NOT_WORDCHAR:
822
1.65k
      invert_bits = TRUE;
823
1.65k
      PCRE2_FALLTHROUGH /* Fall through */
824
6.26k
      case OP_WORDCHAR:
825
6.26k
      set2 = (const uint8_t *)(cb->cbits + cbit_word);
826
6.26k
      break;
827
828
10.1k
      default:
829
10.1k
      return FALSE;
830
38.2k
      }
831
832
    /* Because the bit sets are unaligned bytes, we need to perform byte
833
    comparison here. */
834
835
26.6k
    set_end = set1 + 32;
836
26.6k
    if (invert_bits)
837
6.16k
      {
838
6.16k
      do
839
81.9k
        {
840
81.9k
        if ((*set1++ & ~(*set2++)) != 0) return FALSE;
841
81.9k
        }
842
77.6k
      while (set1 < set_end);
843
6.16k
      }
844
20.4k
    else
845
20.4k
      {
846
20.4k
      do
847
340k
        {
848
340k
        if ((*set1++ & *set2++) != 0) return FALSE;
849
340k
        }
850
327k
      while (set1 < set_end);
851
20.4k
      }
852
853
9.83k
    if (list[1] == 0) return TRUE;
854
    /* Might be an empty repeat. */
855
3.22k
    continue;
856
9.83k
    }
857
858
  /* Some property combinations also acceptable. Unicode property opcodes are
859
  processed specially; the rest can be handled with a lookup table. */
860
861
163k
  else
862
163k
    {
863
163k
    uint32_t leftop, rightop;
864
865
163k
    leftop = base_list[0];
866
163k
    rightop = list[0];
867
868
163k
#ifdef SUPPORT_UNICODE
869
163k
    accepted = FALSE; /* Always set in non-unicode case. */
870
163k
    if (leftop == OP_PROP || leftop == OP_NOTPROP)
871
25.5k
      {
872
25.5k
      if (rightop == OP_EOD)
873
632
        accepted = TRUE;
874
24.9k
      else if (rightop == OP_PROP || rightop == OP_NOTPROP)
875
20.3k
        {
876
20.3k
        int n;
877
20.3k
        const uint8_t *p;
878
20.3k
        BOOL same = leftop == rightop;
879
20.3k
        BOOL lisprop = leftop == OP_PROP;
880
20.3k
        BOOL risprop = rightop == OP_PROP;
881
20.3k
        BOOL bothprop = lisprop && risprop;
882
883
        /* There's a table that specifies how each combination is to be
884
        processed:
885
          0   Always return FALSE (never auto-possessify)
886
          1   Character groups are distinct (possessify if both are OP_PROP)
887
          2   Check character categories in the same group (general or particular)
888
          3   Return TRUE if the two opcodes are not the same
889
          ... see comments below
890
        */
891
892
20.3k
        n = propposstab[base_list[2]][list[2]];
893
20.3k
        switch(n)
894
20.3k
          {
895
624
          case 0: break;
896
773
          case 1: accepted = bothprop; break;
897
753
          case 2: accepted = (base_list[3] == list[3]) != same; break;
898
3.06k
          case 3: accepted = !same; break;
899
900
1.59k
          case 4:  /* Left general category, right particular category */
901
1.59k
          accepted = risprop && catposstab[base_list[3]][list[3]] == same;
902
1.59k
          break;
903
904
658
          case 5:  /* Right general category, left particular category */
905
658
          accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
906
658
          break;
907
908
          /* This code is logically tricky. Think hard before fiddling with it.
909
          The posspropstab table has four entries per row. Each row relates to
910
          one of PCRE's special properties such as ALNUM or SPACE or WORD.
911
          Only WORD actually needs all four entries, but using repeats for the
912
          others means they can all use the same code below.
913
914
          The first two entries in each row are Unicode general categories, and
915
          apply always, because all the characters they include are part of the
916
          PCRE character set. The third and fourth entries are a general and a
917
          particular category, respectively, that include one or more relevant
918
          characters. One or the other is used, depending on whether the check
919
          is for a general or a particular category. However, in both cases the
920
          category contains more characters than the specials that are defined
921
          for the property being tested against. Therefore, it cannot be used
922
          in a NOTPROP case.
923
924
          Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
925
          Underscore is covered by ucp_P or ucp_Po. */
926
927
1.92k
          case 6:  /* Left alphanum vs right general category */
928
3.33k
          case 7:  /* Left space vs right general category */
929
4.80k
          case 8:  /* Left word vs right general category */
930
4.80k
          p = posspropstab[n-6];
931
4.80k
          accepted = risprop && lisprop ==
932
2.73k
            (list[3] != p[0] &&
933
1.66k
             list[3] != p[1] &&
934
1.32k
            (list[3] != p[2] || !lisprop));
935
4.80k
          break;
936
937
277
          case 9:   /* Right alphanum vs left general category */
938
1.02k
          case 10:  /* Right space vs left general category */
939
2.79k
          case 11:  /* Right word vs left general category */
940
2.79k
          p = posspropstab[n-9];
941
2.79k
          accepted = lisprop && risprop ==
942
1.85k
            (base_list[3] != p[0] &&
943
1.53k
             base_list[3] != p[1] &&
944
843
            (base_list[3] != p[2] || !risprop));
945
2.79k
          break;
946
947
331
          case 12:  /* Left alphanum vs right particular category */
948
1.71k
          case 13:  /* Left space vs right particular category */
949
2.35k
          case 14:  /* Left word vs right particular category */
950
2.35k
          p = posspropstab[n-12];
951
2.35k
          accepted = risprop && lisprop ==
952
1.79k
            (catposstab[p[0]][list[3]] &&
953
1.46k
             catposstab[p[1]][list[3]] &&
954
1.23k
            (list[3] != p[3] || !lisprop));
955
2.35k
          break;
956
957
778
          case 15:  /* Right alphanum vs left particular category */
958
2.51k
          case 16:  /* Right space vs left particular category */
959
2.96k
          case 17:  /* Right word vs left particular category */
960
2.96k
          p = posspropstab[n-15];
961
2.96k
          accepted = lisprop && risprop ==
962
1.77k
            (catposstab[p[0]][base_list[3]] &&
963
1.00k
             catposstab[p[1]][base_list[3]] &&
964
603
            (base_list[3] != p[3] || !risprop));
965
2.96k
          break;
966
20.3k
          }
967
20.3k
        }
968
25.5k
      }
969
970
138k
    else
971
138k
#endif  /* SUPPORT_UNICODE */
972
973
138k
    accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
974
120k
           rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
975
114k
           autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
976
977
163k
    if (!accepted) return FALSE;
978
979
29.0k
    if (list[1] == 0) return TRUE;
980
    /* Might be an empty repeat. */
981
4.75k
    continue;
982
29.0k
    }
983
984
  /* Control reaches here only if one of the items is a small character list.
985
  All characters are checked against the other side. */
986
987
2.36M
  do
988
2.53M
    {
989
2.53M
    chr = *chr_ptr;
990
991
2.53M
    switch(list_ptr[0])
992
2.53M
      {
993
1.52M
      case OP_CHAR:
994
1.52M
      ochr_ptr = list_ptr + 2;
995
1.52M
      do
996
1.63M
        {
997
1.63M
        if (chr == *ochr_ptr) return FALSE;
998
1.52M
        ochr_ptr++;
999
1.52M
        }
1000
1.52M
      while(*ochr_ptr != NOTACHAR);
1001
1.41M
      break;
1002
1003
1.41M
      case OP_NOT:
1004
80.7k
      ochr_ptr = list_ptr + 2;
1005
80.7k
      do
1006
88.6k
        {
1007
88.6k
        if (chr == *ochr_ptr)
1008
5.76k
          break;
1009
82.8k
        ochr_ptr++;
1010
82.8k
        }
1011
82.8k
      while(*ochr_ptr != NOTACHAR);
1012
80.7k
      if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
1013
5.76k
      break;
1014
1015
      /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
1016
      set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
1017
1018
9.89k
      case OP_DIGIT:
1019
9.89k
      if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
1020
8.93k
      break;
1021
1022
17.3k
      case OP_NOT_DIGIT:
1023
17.3k
      if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
1024
1.92k
      break;
1025
1026
22.8k
      case OP_WHITESPACE:
1027
22.8k
      if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
1028
21.7k
      break;
1029
1030
55.7k
      case OP_NOT_WHITESPACE:
1031
55.7k
      if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
1032
2.50k
      break;
1033
1034
15.7k
      case OP_WORDCHAR:
1035
15.7k
      if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
1036
10.3k
      break;
1037
1038
22.2k
      case OP_NOT_WORDCHAR:
1039
22.2k
      if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
1040
4.79k
      break;
1041
1042
33.5k
      case OP_HSPACE:
1043
33.5k
      switch(chr)
1044
33.5k
        {
1045
10.5k
        HSPACE_CASES: return FALSE;
1046
23.0k
        default: break;
1047
33.5k
        }
1048
23.0k
      break;
1049
1050
45.1k
      case OP_NOT_HSPACE:
1051
45.1k
      switch(chr)
1052
45.1k
        {
1053
19.4k
        HSPACE_CASES: break;
1054
25.7k
        default: return FALSE;
1055
45.1k
        }
1056
19.4k
      break;
1057
1058
67.5k
      case OP_ANYNL:
1059
73.3k
      case OP_VSPACE:
1060
73.3k
      switch(chr)
1061
73.3k
        {
1062
4.03k
        VSPACE_CASES: return FALSE;
1063
69.3k
        default: break;
1064
73.3k
        }
1065
69.3k
      break;
1066
1067
69.3k
      case OP_NOT_VSPACE:
1068
7.48k
      switch(chr)
1069
7.48k
        {
1070
3.63k
        VSPACE_CASES: break;
1071
3.84k
        default: return FALSE;
1072
7.48k
        }
1073
3.63k
      break;
1074
1075
22.3k
      case OP_DOLL:
1076
25.6k
      case OP_EODN:
1077
25.6k
      switch (chr)
1078
25.6k
        {
1079
318
        case CHAR_CR:
1080
604
        case CHAR_LF:
1081
1.16k
        case CHAR_VT:
1082
1.61k
        case CHAR_FF:
1083
2.19k
        case CHAR_NEL:
1084
2.19k
#ifndef EBCDIC
1085
2.48k
        case 0x2028:
1086
2.76k
        case 0x2029:
1087
2.76k
#endif  /* Not EBCDIC */
1088
2.76k
        return FALSE;
1089
25.6k
        }
1090
22.8k
      break;
1091
1092
22.8k
      case OP_EOD:    /* Can always possessify before \z */
1093
1.09k
      break;
1094
1095
0
#ifdef SUPPORT_UNICODE
1096
23.4k
      case OP_PROP:
1097
100k
      case OP_NOTPROP:
1098
100k
      if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1099
100k
            list_ptr[0] == OP_NOTPROP))
1100
67.7k
        return FALSE;
1101
32.4k
      break;
1102
32.4k
#endif
1103
1104
51.8k
      case OP_NCLASS:
1105
51.8k
      if (chr > 255) return FALSE;
1106
51.5k
      PCRE2_FALLTHROUGH /* Fall through */
1107
51.5k
1108
159k
      case OP_CLASS:
1109
159k
      if (chr > 255) break;
1110
158k
      class_bitset = (const uint8_t *)
1111
158k
        ((list_ptr == list ? code : base_end) - list_ptr[2]);
1112
158k
      if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE;
1113
100k
      break;
1114
1115
100k
#ifdef SUPPORT_WIDE_CHARS
1116
100k
      case OP_XCLASS:
1117
22.0k
      if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1118
22.0k
          list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf))
1119
8.93k
        return FALSE;
1120
13.0k
      break;
1121
1122
13.0k
      case OP_ECLASS:
1123
11.0k
      if (PRIV(eclass)(chr,
1124
11.0k
          (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE,
1125
11.0k
          (list_ptr == list ? code : base_end) - list_ptr[3],
1126
11.0k
          (const uint8_t*)cb->start_code, utf))
1127
6.89k
        return FALSE;
1128
4.19k
      break;
1129
4.19k
#endif /* SUPPORT_WIDE_CHARS */
1130
1131
306k
      default:
1132
306k
      return FALSE;
1133
2.53M
      }
1134
1135
1.75M
    chr_ptr++;
1136
1.75M
    }
1137
2.36M
  while(*chr_ptr != NOTACHAR);
1138
1139
  /* At least one character must be matched from this opcode. */
1140
1141
1.59M
  if (list[1] == 0) return TRUE;
1142
1.59M
  }
1143
1144
/* LCOV_EXCL_START */
1145
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
1146
0
return FALSE;              /* Avoid compiler warnings */
1147
/* LCOV_EXCL_STOP */
1148
3.11M
}
1149
1150
1151
1152
/*************************************************
1153
*    Scan compiled regex for auto-possession     *
1154
*************************************************/
1155
1156
/* Replaces single character iterations with their possessive alternatives
1157
if appropriate. This function modifies the compiled opcode! Hitting a
1158
non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
1159
bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches
1160
overly complicated or large patterns. In these cases, the check just stops,
1161
leaving the remainder of the pattern unpossessified.
1162
1163
Arguments:
1164
  code        points to start of the byte code
1165
  cb          compile data block
1166
1167
Returns:      0 for success
1168
              -1 if a non-existent opcode is encountered
1169
*/
1170
1171
/* Walks the compiled code one opcode at a time until OP_END (returns 0), or
until an opcode >= OP_TABLE_LENGTH is seen (returns -1). For each single-item
repeat (OP_STAR..OP_TYPEPOSUPTO) and each class that is followed by an OP_CR*
repeat, a property list is built with get_chr_property_list() and, if
compare_opcodes() accepts what follows, the repeat opcode is overwritten in
place with its possessive form. */
int
1172
PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
1173
54.4k
{
1174
54.4k
PCRE2_UCHAR c;  /* Current opcode; normalised inside the repeat branch */
1175
54.4k
PCRE2_SPTR end;  /* Result of get_chr_property_list(); NULL skips the check */
1176
54.4k
PCRE2_UCHAR *repeat_opcode;  /* Where the OP_CR* repeat after a class lives */
1177
54.4k
uint32_t list[MAX_LIST];  /* Property list of the repeated item; list[1] is set here */
1178
54.4k
int rec_limit = 1000;  /* Was 10,000 but clang+ASAN uses a lot of stack. */
1179
54.4k
BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
1180
54.4k
BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
1181
1182
54.4k
for (;;)
1183
73.8M
  {
1184
73.8M
  c = *code;
1185
1186
  /* LCOV_EXCL_START */
1187
73.8M
  if (c >= OP_TABLE_LENGTH)
1188
0
    {
1189
0
    PCRE2_DEBUG_UNREACHABLE();
1190
0
    return -1;   /* Something gone wrong */
1191
0
    }
1192
  /* LCOV_EXCL_STOP */
1193
1194
73.8M
  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1195
3.71M
    {
1196
3.71M
    c -= get_repeat_base(c) - OP_STAR;  /* Rebase c so the plain OP_STAR.. names can be tested (helper not in view) */
1197
3.71M
    end = (c <= OP_MINUPTO) ?  /* Other rebased values get NULL: presumably EXACT and POS* forms - confirm opcode order */
1198
3.71M
      get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
1199
3.71M
    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;  /* Non-zero only for the non-MIN forms */
1200
1201
3.71M
    if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
1202
3.53M
        &rec_limit))
1203
1.44M
      {
1204
1.44M
      switch(c)  /* c is rebased but *code is not, hence "+=" of a delta rather than a plain store */
1205
1.44M
        {
1206
337k
        case OP_STAR:
1207
337k
        *code += OP_POSSTAR - OP_STAR;
1208
337k
        break;
1209
1210
58.1k
        case OP_MINSTAR:
1211
58.1k
        *code += OP_POSSTAR - OP_MINSTAR;
1212
58.1k
        break;
1213
1214
364k
        case OP_PLUS:
1215
364k
        *code += OP_POSPLUS - OP_PLUS;
1216
364k
        break;
1217
1218
62.6k
        case OP_MINPLUS:
1219
62.6k
        *code += OP_POSPLUS - OP_MINPLUS;
1220
62.6k
        break;
1221
1222
436k
        case OP_QUERY:
1223
436k
        *code += OP_POSQUERY - OP_QUERY;
1224
436k
        break;
1225
1226
91.0k
        case OP_MINQUERY:
1227
91.0k
        *code += OP_POSQUERY - OP_MINQUERY;
1228
91.0k
        break;
1229
1230
85.3k
        case OP_UPTO:
1231
85.3k
        *code += OP_POSUPTO - OP_UPTO;
1232
85.3k
        break;
1233
1234
11.6k
        case OP_MINUPTO:
1235
11.6k
        *code += OP_POSUPTO - OP_MINUPTO;
1236
11.6k
        break;
1237
1.44M
        }
1238
1.44M
      }
1239
3.71M
    c = *code;  /* Reload: c was rebased above and *code may just have been rewritten */
1240
3.71M
    }
1241
70.1M
  else if (c == OP_CLASS || c == OP_NCLASS
1242
69.6M
#ifdef SUPPORT_WIDE_CHARS
1243
69.6M
           || c == OP_XCLASS || c == OP_ECLASS
1244
70.1M
#endif
1245
70.1M
           )
1246
773k
    {
1247
773k
#ifdef SUPPORT_WIDE_CHARS
1248
773k
    if (c == OP_XCLASS || c == OP_ECLASS)
1249
246k
      repeat_opcode = code + GET(code, 1);  /* Item length is read from the code at offset 1 */
1250
527k
    else
1251
527k
#endif
1252
527k
      repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));  /* Opcode plus 32 bytes of class data, in code units */
1253
1254
773k
    c = *repeat_opcode;
1255
773k
    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1256
196k
      {
1257
      /* The return from get_chr_property_list() will never be NULL when
1258
      *code (aka c) is one of the four class opcodes. However, gcc with
1259
      -fanalyzer notes that a NULL return is possible, and grumbles. Hence we
1260
      put in a check. */
1261
1262
196k
      end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
1263
196k
      list[1] = (c & 1) == 0;  /* NOTE(review): relies on OP_CR* numbering (even = non-MIN form?) - confirm */
1264
1265
196k
      if (end != NULL &&
1266
196k
          compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
1267
91.8k
        {
1268
91.8k
        switch (c)  /* Both the plain and the MIN form map to the same possessive opcode */
1269
91.8k
          {
1270
8.81k
          case OP_CRSTAR:
1271
10.4k
          case OP_CRMINSTAR:
1272
10.4k
          *repeat_opcode = OP_CRPOSSTAR;
1273
10.4k
          break;
1274
1275
17.5k
          case OP_CRPLUS:
1276
20.5k
          case OP_CRMINPLUS:
1277
20.5k
          *repeat_opcode = OP_CRPOSPLUS;
1278
20.5k
          break;
1279
1280
13.5k
          case OP_CRQUERY:
1281
14.9k
          case OP_CRMINQUERY:
1282
14.9k
          *repeat_opcode = OP_CRPOSQUERY;
1283
14.9k
          break;
1284
1285
35.1k
          case OP_CRRANGE:
1286
45.9k
          case OP_CRMINRANGE:
1287
45.9k
          *repeat_opcode = OP_CRPOSRANGE;
1288
45.9k
          break;
1289
91.8k
          }
1290
91.8k
        }
1291
196k
      }
1292
773k
    c = *code;  /* Back to the class opcode (c held the repeat opcode) for the length handling below */
1293
773k
    }
1294
1295
73.8M
  switch(c)  /* Extra, item-specific lengths; the fixed table length is added after this switch */
1296
73.8M
    {
1297
54.4k
    case OP_END:
1298
54.4k
    return 0;
1299
1300
307k
    case OP_TYPESTAR:
1301
371k
    case OP_TYPEMINSTAR:
1302
555k
    case OP_TYPEPLUS:
1303
589k
    case OP_TYPEMINPLUS:
1304
716k
    case OP_TYPEQUERY:
1305
767k
    case OP_TYPEMINQUERY:
1306
826k
    case OP_TYPEPOSSTAR:
1307
895k
    case OP_TYPEPOSPLUS:
1308
926k
    case OP_TYPEPOSQUERY:
1309
926k
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;  /* Skip two extra code units after a property type */
1310
926k
    break;
1311
1312
72.1k
    case OP_TYPEUPTO:
1313
77.9k
    case OP_TYPEMINUPTO:
1314
123k
    case OP_TYPEEXACT:
1315
175k
    case OP_TYPEPOSUPTO:
1316
175k
    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)  /* Type follows an IMM2_SIZE field here */
1317
7.58k
      code += 2;
1318
175k
    break;
1319
1320
12.0k
    case OP_CALLOUT_STR:
1321
12.0k
    code += GET(code, 1 + 2*LINK_SIZE);  /* Length read from the item itself, on top of the table length */
1322
12.0k
    break;
1323
1324
0
#ifdef SUPPORT_WIDE_CHARS
1325
176k
    case OP_XCLASS:
1326
246k
    case OP_ECLASS:
1327
246k
    code += GET(code, 1);  /* Same offset that located repeat_opcode above */
1328
246k
    break;
1329
0
#endif
1330
1331
54.0k
    case OP_MARK:
1332
56.0k
    case OP_COMMIT_ARG:
1333
60.4k
    case OP_PRUNE_ARG:
1334
117k
    case OP_SKIP_ARG:
1335
133k
    case OP_THEN_ARG:
1336
133k
    code += code[1];  /* Extra length taken from code[1] */
1337
133k
    break;
1338
73.8M
    }
1339
1340
  /* Add in the fixed length from the table */
1341
1342
73.8M
  code += PRIV(OP_lengths)[c];
1343
1344
  /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1345
  followed by a multi-byte character. The length in the table is a minimum, so
1346
  we have to arrange to skip the extra code units. */
1347
1348
73.8M
#ifdef MAYBE_UTF_MULTI
1349
73.8M
  if (utf) switch(c)
1350
7.20M
    {
1351
1.02M
    case OP_CHAR:
1352
2.74M
    case OP_CHARI:
1353
2.75M
    case OP_NOT:
1354
2.76M
    case OP_NOTI:
1355
2.76M
    case OP_STAR:
1356
2.77M
    case OP_MINSTAR:
1357
2.78M
    case OP_PLUS:
1358
2.78M
    case OP_MINPLUS:
1359
2.79M
    case OP_QUERY:
1360
2.80M
    case OP_MINQUERY:
1361
2.80M
    case OP_UPTO:
1362
2.80M
    case OP_MINUPTO:
1363
2.80M
    case OP_EXACT:
1364
2.81M
    case OP_POSSTAR:
1365
2.83M
    case OP_POSPLUS:
1366
2.84M
    case OP_POSQUERY:
1367
2.84M
    case OP_POSUPTO:
1368
2.85M
    case OP_STARI:
1369
2.86M
    case OP_MINSTARI:
1370
2.86M
    case OP_PLUSI:
1371
2.87M
    case OP_MINPLUSI:
1372
2.88M
    case OP_QUERYI:
1373
2.90M
    case OP_MINQUERYI:
1374
2.90M
    case OP_UPTOI:
1375
2.90M
    case OP_MINUPTOI:
1376
2.90M
    case OP_EXACTI:
1377
2.93M
    case OP_POSSTARI:
1378
2.95M
    case OP_POSPLUSI:
1379
2.98M
    case OP_POSQUERYI:
1380
2.99M
    case OP_POSUPTOI:
1381
2.99M
    case OP_NOTSTAR:
1382
2.99M
    case OP_NOTMINSTAR:
1383
2.99M
    case OP_NOTPLUS:
1384
2.99M
    case OP_NOTMINPLUS:
1385
2.99M
    case OP_NOTQUERY:
1386
2.99M
    case OP_NOTMINQUERY:
1387
2.99M
    case OP_NOTUPTO:
1388
2.99M
    case OP_NOTMINUPTO:
1389
2.99M
    case OP_NOTEXACT:
1390
2.99M
    case OP_NOTPOSSTAR:
1391
2.99M
    case OP_NOTPOSPLUS:
1392
2.99M
    case OP_NOTPOSQUERY:
1393
3.00M
    case OP_NOTPOSUPTO:
1394
3.00M
    case OP_NOTSTARI:
1395
3.00M
    case OP_NOTMINSTARI:
1396
3.00M
    case OP_NOTPLUSI:
1397
3.00M
    case OP_NOTMINPLUSI:
1398
3.00M
    case OP_NOTQUERYI:
1399
3.00M
    case OP_NOTMINQUERYI:
1400
3.00M
    case OP_NOTUPTOI:
1401
3.00M
    case OP_NOTMINUPTOI:
1402
3.00M
    case OP_NOTEXACTI:
1403
3.00M
    case OP_NOTPOSSTARI:
1404
3.00M
    case OP_NOTPOSPLUSI:
1405
3.00M
    case OP_NOTPOSQUERYI:
1406
3.00M
    case OP_NOTPOSUPTOI:
1407
3.00M
    if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);  /* code[-1] is the last unit of the minimum-length item */
1408
3.00M
    break;
1409
7.20M
    }
1410
#else
1411
  (void)(utf);  /* Keep compiler happy by referencing utf in this branch too */
1412
#endif  /* MAYBE_UTF_MULTI */
1413
73.8M
  }
1414
54.4k
}
1415
1416
/* End of pcre2_auto_possess.c */